# **Skewness of Data**
 * 0 → Right skew

* < 0 → Left skew

* = 0 → Symmetric

In [1]:
import numpy as np
from scipy.stats import skew

data = np.random.exponential(scale=2, size=1000)

print("Skewness:", skew(data))

Skewness: 1.6704394592373728


# **Covariance**
Measures direction of relationship.
* Positive → move together
* Negative → move opposite

In [2]:
import numpy as np

x = np.array([1, 2, 3, 4, 5])
y = np.array([2, 4, 6, 8, 10])

cov_matrix = np.cov(x, y)
print("Covariance:", cov_matrix[0,1])

Covariance: 5.0


# **Pearson Correlation (Linear Relationship)**
* Range: -1 to +1
* Used when:
  1. Data is continuous
  2. Linear relationship
  3. Normally distributed (preferably)

In [3]:
from scipy.stats import pearsonr

corr, p_value = pearsonr(x, y)
print("Pearson Correlation:", corr)
print("P-value:", p_value)

Pearson Correlation: 1.0
P-value: 0.0


# **Spearman Rank Correlation**
* For non-normal or monotonic relationship.
* Better when:
   1. Outliers present
   2. Data not normal
   3. Rank-based analysis

In [4]:
from scipy.stats import spearmanr

corr, p_value = spearmanr(x, y)
print("Spearman Correlation:", corr)
print("P-value:", p_value)

Spearman Correlation: 0.9999999999999999
P-value: 1.4042654220543672e-24


# **Importance of Correlation**
In ML:
* Detect multicollinearity
* Feature selection
* Financial risk modeling

In [5]:
import pandas as pd

df = pd.DataFrame({
    "income": np.random.normal(50000, 10000, 100),
    "spending": np.random.normal(20000, 5000, 100)
})

print(df.corr())

            income  spending
income    1.000000  0.054998
spending  0.054998  1.000000


# **T-Test**
A t-test is a type of inferential statistic which is used to determine if there is a significant difference between the means of two groups which may be related in certain features.

T-test has 2 types :
1. one sampled t-test
2. two-sampled t-test.

## **One-Sample T-Test**
* Compare sample mean vs population mean.
* If p < 0.05 → reject null.

Example:
Is average transaction = 2000?

In [6]:
from scipy.stats import ttest_1samp

data = np.random.normal(2100, 300, 50)

t_stat, p_value = ttest_1samp(data, 2000)

print("T-stat:", t_stat)
print("P-value:", p_value)

T-stat: 2.420751780487859
P-value: 0.01923943781652603


In [7]:
ages=[10,20,35,50,28,40,55,18,16,55,30,25,43,18,30,28,14,24,16,17,32,35,26,27,65,18,43,23,21,20,19,70]
len(ages)

32

In [8]:
ages_mean=np.mean(ages)
print(ages_mean)

30.34375


In [9]:
## Lets take sample

sample_size=10
age_sample=np.random.choice(ages,sample_size)
age_sample

array([70, 25, 55, 30, 70, 18, 18, 20, 10, 24])

In [10]:
from scipy.stats import ttest_1samp
ttest,p_value=ttest_1samp(age_sample,30)
print(p_value)

0.5860048607695391


In [11]:
if p_value < 0.05:    # alpha value is 5%
    print("Reject null hypothesis.")
else:
    print("Fail to reject null hypothesis.")

Fail to reject null hypothesis.


## **Two-Sample T-Test (Independent)**
> Compare two group means.

The Independent Samples t Test or 2-sample t-test compares the means of two independent groups in order to determine whether there is a statistical evidence that the associated population means are significantly different. The Independent Samples t Test is a parametric test. This test is also known as Independent t Test.

Example:
Male vs Female spending.

In [12]:
from scipy.stats import ttest_ind

group1 = np.random.normal(2000, 300, 50)
group2 = np.random.normal(2200, 300, 50)

t_stat, p_value = ttest_ind(group1, group2)

print("T-stat:", t_stat)
print("P-value:", p_value)

T-stat: -3.3497673882067476
P-value: 0.001149284591740643


## **Paired T-Test**
> Before vs After comparison.

When you want to check how different samples from the same group are, you can go for a paired T-test.

In [13]:
from scipy.stats import ttest_rel

before = np.random.normal(2000, 300, 50)
after = before + np.random.normal(100, 50, 50)

t_stat, p_value = ttest_rel(before, after)

print("T-stat:", t_stat)
print("P-value:", p_value)

T-stat: -14.643180011316447
P-value: 1.5892485124560322e-19


In [14]:
if p_value < 0.05:    # alpha value is 0.05 or 5%
    print("Reject null hypothesis")
else:
    print("Fail to reject null hypothesis")

Reject null hypothesis


# **Chi-Square Test (Categorical Data)**
The test is applied when you have two categorical variables from a single population. It is used to determine whether there is a significant association between the two variables.

If:
Observed ≈ Expected → small chi-square → no relationship

If:
Observed very different from Expected → large chi-square → strong evidence against H₀

Example:
Is fraud dependent on gender?

In [15]:
from scipy.stats import chi2_contingency
import numpy as np

table = np.array([[30, 20],
                  [10, 40]])

chi2, p_value, dof, expected = chi2_contingency(table)

print("Chi-square:", chi2)
print("P-value:", p_value)

Chi-square: 15.041666666666668
P-value: 0.00010516355403363114


In [16]:
if p_value < 0.05:
   print("variables are associated")
else:
    print("variables not associated")

variables are associated


Example: Relationship between variables using tips dataset (Traditional method)

In [20]:
import seaborn as sns
import pandas as pd
import numpy as np
df = sns.load_dataset('tips')

In [21]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [22]:
df_table=pd.crosstab(df['sex'],df['smoker'])
print(df_table)

smoker  Yes  No
sex            
Male     60  97
Female   33  54


In [24]:
df_table.values

array([[60, 97],
       [33, 54]])

In [26]:
#Observed Values
observed_values = df_table.values
print("Observed Values :-\n",observed_values)

Observed Values :-
 [[60 97]
 [33 54]]


In [27]:
import scipy.stats as stats
val = stats.chi2_contingency(df_table)
val

Chi2ContingencyResult(statistic=np.float64(0.0), pvalue=np.float64(1.0), dof=1, expected_freq=array([[59.84016393, 97.15983607],
       [33.15983607, 53.84016393]]))

In [28]:
expected_values=val[3]

In [29]:
no_of_rows=len(df_table.iloc[0:2,0])
no_of_columns=len(df_table.iloc[0,0:2])
ddof=(no_of_rows-1)*(no_of_columns-1)
print("Degree of Freedom:-",ddof)
alpha = 0.05

Degree of Freedom:- 1


In [30]:
from scipy.stats import chi2
chi_square=sum([(o-e)**2./e for o,e in zip(observed_values,expected_values)])
chi_square_statistic=chi_square[0]+chi_square[1]

In [31]:
print("chi-square statistic:-",chi_square_statistic)

chi-square statistic:- 0.001934818536627623


In [32]:
critical_value=chi2.ppf(q=1-alpha,df=ddof)
print('critical_value:',critical_value)

critical_value: 3.841458820694124


In [33]:
#p-value
p_value=1-chi2.cdf(x=chi_square_statistic,df=ddof)
print('p-value:',p_value)
print('Significance level: ',alpha)
print('Degree of Freedom: ',ddof)
print('p-value:',p_value)

p-value: 0.964915107315732
Significance level:  0.05
Degree of Freedom:  1
p-value: 0.964915107315732


In [35]:
if chi_square_statistic >= critical_value:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")

Retain H0,There is no relationship between 2 categorical variables


In [36]:
if p_value<=alpha:
    print("Reject H0,There is a relationship between 2 categorical variables")
else:
    print("Retain H0,There is no relationship between 2 categorical variables")

Retain H0,There is no relationship between 2 categorical variables


# **Anova Test(F-Test)**
> The t-test works well when dealing with two groups, but sometimes we want to compare more than two groups at the same time.

For example, if we wanted to test whether petal_width age differs based on some categorical variable like species, we have to compare the means of each level or group the variable

## **One Way F-test(Anova) :-**
It tell whether two or more groups are similar or not based on their mean similarity and f-score.

Example : there are 3 different category of iris flowers and their petal width and need to check whether all 3 group are similar or not

In [37]:
import seaborn as sns
df1 = sns.load_dataset('iris')

In [38]:
df1.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [39]:
df_anova = df1[['petal_width','species']]

In [40]:
grps = pd.unique(df_anova.species.values)
grps

array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [41]:
d_data = {grp:df_anova['petal_width'][df_anova.species == grp] for grp in grps}
d_data

{'setosa': 0     0.2
 1     0.2
 2     0.2
 3     0.2
 4     0.2
 5     0.4
 6     0.3
 7     0.2
 8     0.2
 9     0.1
 10    0.2
 11    0.2
 12    0.1
 13    0.1
 14    0.2
 15    0.4
 16    0.4
 17    0.3
 18    0.3
 19    0.3
 20    0.2
 21    0.4
 22    0.2
 23    0.5
 24    0.2
 25    0.2
 26    0.4
 27    0.2
 28    0.2
 29    0.2
 30    0.2
 31    0.4
 32    0.1
 33    0.2
 34    0.2
 35    0.2
 36    0.2
 37    0.1
 38    0.2
 39    0.2
 40    0.3
 41    0.3
 42    0.2
 43    0.6
 44    0.4
 45    0.3
 46    0.2
 47    0.2
 48    0.2
 49    0.2
 Name: petal_width, dtype: float64,
 'versicolor': 50    1.4
 51    1.5
 52    1.5
 53    1.3
 54    1.5
 55    1.3
 56    1.6
 57    1.0
 58    1.3
 59    1.4
 60    1.0
 61    1.5
 62    1.0
 63    1.4
 64    1.3
 65    1.4
 66    1.5
 67    1.0
 68    1.5
 69    1.1
 70    1.8
 71    1.3
 72    1.5
 73    1.2
 74    1.3
 75    1.4
 76    1.4
 77    1.7
 78    1.5
 79    1.0
 80    1.1
 81    1.0
 82    1.2
 83    1.6
 84    1.5
 85  

In [42]:
F, p = stats.f_oneway(d_data['setosa'], d_data['versicolor'], d_data['virginica'])
print(p)

4.169445839443833e-85


In [44]:
if p < 0.05:
    print("reject null hypothesis")
else:
    print("fail to reject null hypothesis")

reject null hypothesis
