In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import f_oneway
from scipy.stats import chi2_contingency
from statsmodels.formula.api import ols
import warnings
warnings.filterwarnings( "ignore" )

In [2]:
df = pd.read_csv("/content/irisdata.csv")
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


## T-test

In [33]:
df_ = df.petal_width
print(np.mean(df_.sample(25)))

1.172


In [12]:
np.mean(df['petal_width'])

1.2053691275167786

H0: The mean of petal_width is 1.172

H1: The mean of petal_width is not 1.172

In [34]:
print(stats.ttest_1samp(a=df_, popmean=1.172))

Ttest_1sampResult(statistic=0.5350405478275901, pvalue=0.5934247950023986)


Analyzing the one-sample t-test, it finds out if the hypothesized mean is similar to or different from the group’s mean. From the example above, we’ve selected the petal_width as the population sample to perform the test on.

From the test, since the p-value beats the alpha level set at 0.05, we acknowledge the null hypothesis because we don’t have enough evidence to prove otherwise.

Thus the null hypothesis cannot be rejected.

## Chi-Square Test

In [3]:
def petal_cat(df):
    if df['petal_width'] <= 1.3:
        return 0
    elif df['petal_width'] > 1.3:
        return 1
    else:
        return 'Indifferent'

df['petal_width_new'] = df.apply(petal_cat, axis=1)

In [4]:
def species_cat(df):
    if df["species"] == "Iris-virginica":
        return 0
    elif df["species"] == "Iris-versicolor":
        return 1
    else:
        return 2
df["species"] = df.apply(species_cat, axis=1)

In [5]:
df.sample(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,petal_width_new
61,6.0,2.2,4.0,1.0,1,0
114,6.4,3.2,5.3,2.3,0,1
86,6.3,2.3,4.4,1.3,1,0
111,6.8,3.0,5.5,2.1,0,1
100,5.8,2.7,5.1,1.9,0,1


In [6]:
df_new = df.drop(columns=["sepal_width", "sepal_length", "petal_length", "petal_width"])
df_new.head()

Unnamed: 0,species,petal_width_new
0,2,0
1,2,0
2,2,0
3,2,0
4,2,0


H0 = Species and Petal Width are dependent on each other
<br>
H1 = Species and Petal Width are not dependent on each other

In [7]:
stat, p, dof, expected = chi2_contingency(df_new)
print("The degree of freedom is: ", dof)

The degree of freedom is:  148


In [8]:
prob = 0.95
alpha = 1.0 - prob
print('The alpha/significance level = %.3f' % alpha)
print('The p-value is = %.2f' % p)
if p <= alpha:
    print('Reject the Null Hypothesis (Reject H0)')
else:
    print('Accept the Null Hypothesis (Do not reject H0)')

The alpha/significance level = 0.050
The p-value is = 0.10
Accept the Null Hypothesis (Do not reject H0)


Here the p value is greater than 0.05, therefore we cannot reject the null hypothesis. We will have to accept the Null Hypothesis

## Univariate Regression Analysis

H0 = Species and Petal Width are dependent on each other
<br>
H1 = Species and Petal Width are not dependent on each other

In [36]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

X = df_new['petal_width_new']
y = df_new['species']

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                species   R-squared:                       0.666
Model:                            OLS   Adj. R-squared:                  0.663
Method:                 Least Squares   F-statistic:                     292.7
Date:                Tue, 22 Nov 2022   Prob (F-statistic):           8.53e-37
Time:                        16:26:05   Log-Likelihood:                -99.334
No. Observations:                 149   AIC:                             202.7
Df Residuals:                     147   BIC:                             208.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const               1.6364      0.054     

Here p-value is less than 0.005, therefore we reject the null hypothesis and conclude that Petal width and species are not related. We accept the alternate hypothesis

## Multivariate Regression Analysis

In [37]:
df['species'] = df_new['species']

In [39]:
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

X = df[['petal_length', 'petal_width']]
y = df['species']

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                species   R-squared:                       0.925
Model:                            OLS   Adj. R-squared:                  0.924
Method:                 Least Squares   F-statistic:                     900.0
Date:                Tue, 22 Nov 2022   Prob (F-statistic):           7.79e-83
Time:                        16:29:44   Log-Likelihood:                 11.988
No. Observations:                 149   AIC:                            -17.98
Df Residuals:                     146   BIC:                            -8.964
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            2.4273      0.055     44.333   

Here p-value is less than 0.005, therefore we reject the null hypothesis and conclude that Petal Length, Petal width and species are not related. We accept the alternate hypothesis