In [None]:
!pip install scipy




**STATISTICAL ANALYSIS**
Statistical analysis using SciPy and statsmodels for hypothesis testing, regression analysis, and ANOVA.

SciPy is an open-source scientific computing library for Python that builds on NumPy. It provides many additional functionalities compared to NumPy, including optimization, integration, interpolation, eigenvalue problems, signal and image processing, statistical distributions, and much more.

In [None]:
import scipy
from scipy import integrate

# Example: numerically integrate f(x) = x**2 over [0, 1] with integrate.quad.
# The call is unpacked into the integral estimate and an error estimate.
result, error = integrate.quad(lambda x: x ** 2, a=0, b=1)

print("Result:", result)
print("Error :", error)


Result: 0.33333333333333337
Error : 3.700743415417189e-15


In [None]:
import numpy as np
from scipy import optimize

# Optimization: search for the x that minimises f(x) = x**2 + 3*x + 5.
# The `.x` attribute of the returned result is the located minimiser.
result_optimization = optimize.minimize_scalar(
    fun=lambda x: x**2 + 3*x + 5,
)
print("Optimization Result:", result_optimization.x)

Optimization Result: -1.5000000000000002


In [None]:
# A lambda is an anonymous single-expression function. Here it is bound to
# the name `square` and then called like any other function.
square = lambda x: x ** 2
result = square(5)
print("Result", result)

Result 25


In [None]:
# A lambda may take several arguments; this one returns the sum of its two inputs.
add_numbers = lambda x, y: x + y
result = add_numbers(3, 7)
print("Result", result)

Result 10


In [None]:
import numpy as np

from scipy import interpolate

# Interpolation: build a piecewise-linear interpolant through the five known
# (x, y) points, then evaluate it at x = 2.5, halfway between the samples at
# x = 2 (y = 1) and x = 3 (y = 3).
x_data = np.arange(5)
y_data = np.array([0, 2, 1, 3, 5])

interp_func = interpolate.interp1d(x_data, y_data, kind="linear")
interp_result = interp_func(2.5)

print("Interpolation Result:", interp_result)

Interpolation Result: 2.0


In [None]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import statsmodels.api as sm


In [None]:
!pip install statsmodels



In [None]:
# Generate sample data: 100 rows pairing a randomly chosen treatment label
# with a score drawn from a normal distribution (loc=10, scale=2).
# Seeding the global NumPy generator first makes the draws repeatable.
np.random.seed(0)
data = pd.DataFrame(dict(
    Treatment=np.random.choice(['A', 'B', 'c'], size=100),
    Score=np.random.normal(loc=10, scale=2, size=100),
))
print(data)

   Treatment      Score
0          A   8.626821
1          B  10.029747
2          A   9.248668
3          B   9.923553
4          B  10.735949
..       ...        ...
95         A   9.933528
96         c  10.131283
97         A  10.531571
98         B  12.303684
99         c  10.276086

[100 rows x 2 columns]


In [None]:
# Average Score within each Treatment label.
mean_score = data['Score'].groupby(data['Treatment']).mean()
print(mean_score)

Treatment
A     9.611241
B    10.099304
c    10.010667
Name: Score, dtype: float64


In [None]:
# Stand-alone look at how a 'Treatment' column is produced: draw `size`
# labels at random from `choices`.
choices = ['A', 'B', 'c']
size = 100
treatment_column = np.random.choice(a=choices, size=size)
print(treatment_column)

['B' 'B' 'c' 'A' 'B' 'A' 'B' 'B' 'c' 'A' 'c' 'A' 'c' 'A' 'c' 'A' 'A' 'A'
 'A' 'B' 'c' 'A' 'A' 'B' 'c' 'B' 'A' 'A' 'A' 'A' 'A' 'B' 'A' 'c' 'c' 'c'
 'A' 'A' 'c' 'A' 'c' 'c' 'A' 'B' 'B' 'B' 'A' 'A' 'B' 'A' 'A' 'B' 'c' 'c'
 'A' 'c' 'B' 'B' 'c' 'B' 'c' 'c' 'B' 'c' 'A' 'A' 'c' 'B' 'B' 'c' 'c' 'B'
 'A' 'c' 'A' 'c' 'A' 'c' 'B' 'c' 'B' 'A' 'B' 'B' 'c' 'c' 'A' 'B' 'A' 'A'
 'A' 'c' 'A' 'B' 'A' 'c' 'B' 'A' 'B' 'c']


In [None]:
# Stand-alone look at how a 'Score' column is produced: draw `size` values
# from a normal distribution with loc=10 (mean) and scale=2 (standard
# deviation). This cell previously repeated the treatment-label draw of the
# cell before it, while its recorded output was an array of float scores.
size = 100
score_column = np.random.normal(loc=10, scale=2, size=size)
print(score_column)

[11.04397546 12.39451973  9.23502757 11.38332384 10.70777002 12.09517067
  9.15220753  2.97046376  7.31368654 12.85101225 10.45716402  9.48467248
 10.10074143  7.23957825  9.47665587  9.64124061  8.61445885 12.27565373
  9.6616855   8.47217266  9.00385379  9.27421777 10.52792062  8.74071618
  9.05548319  6.97327793 12.21524936 10.3524775   8.11929292 11.85918866
  7.87441016  8.22718746 13.84269391  9.08043895  7.82193113 11.96823459
  7.68158735  9.12692581 12.01848906 11.42677914  8.54388456 11.67903292
 12.47804196  6.43039223  8.40762832  7.19891747  9.63129885  7.21761377
 10.07251948  8.37118888 11.39474565  6.52514152 10.2317114  10.73130289
  9.85215307  9.01296486 16.20306117 11.71750831  7.69044895 11.88366868
  9.43572972  8.04869067 10.19637338 11.81097991 12.03748287  9.77020229
 13.48607744  9.35624162 11.65914219  9.58536401 12.23599721 12.12849937
 12.30265967  8.45508459  7.41273144 11.35405362 10.8481104   9.02864765
  9.89660553 11.13411274 12.13566715 10.54319148 11

In [None]:
# Hypothesis testing (independent two-sample t-test): compare the Scores of
# the rows labelled 'A' with the Scores of the rows labelled 'B'.
group_A = data.loc[data['Treatment'] == 'A', 'Score']
group_B = data.loc[data['Treatment'] == 'B', 'Score']

t_stat, p_value = stats.ttest_ind(group_A, group_B)

print("t-statistic:", t_stat)
print("p-value:", p_value)

t-statistic: -1.0341676575352792
p-value: 0.3045674995802033


In [None]:
# Smaller sample: 10 rows with treatment labels '1', '2' and '3'.
# For np.random.normal, loc is the mean and scale is the standard deviation.
np.random.seed(0)
data = pd.DataFrame(dict(
    Treatment=np.random.choice(['1', '2', '3'], size=10),
    Score=np.random.normal(loc=10, scale=2, size=10),
))
print(data)

  Treatment      Score
0         1   9.653861
1         2   6.476697
2         1   9.824654
3         2  12.733759
4         2  12.250628
5         3   9.282009
6         1  12.441216
7         3   7.321009
8         1  10.856747
9         1   9.753074


In [None]:
# Average Score within each Treatment label of the current `data` frame.
mean_score_per_group = data['Score'].groupby(data['Treatment']).mean()
print(mean_score_per_group)

Treatment
1    10.505910
2    10.487028
3     8.301509
Name: Score, dtype: float64


In [None]:
# Build scores by hand: scale standard-normal draws by `std` and shift them
# by `mean`.
mean, std, size = 10, 2, 10
score_values = np.random.randn(size) * std + mean
print(score_values)

[12.82875438  9.75189867 14.01631418 10.45977307 11.20978747 13.25431965
 13.18912107 10.46086834  9.87017931  8.0620395 ]


In [None]:
# Stand-alone look at the treatment-label draw for the smaller sample:
# pick `size` labels at random from `choices`.
choices = ['1', '2', '3']
size = 10
treatment_column = np.random.choice(a=choices, size=size)
print(treatment_column)

['3' '3' '1' '3' '1' '1' '1' '2' '2' '3']


In [None]:
# Independent two-sample t-test between the Scores of treatment '1' and the
# Scores of treatment '2'.
group_1 = data.loc[data['Treatment'] == '1', 'Score']
group_2 = data.loc[data['Treatment'] == '2', 'Score']

t_stat, p_value = stats.ttest_ind(group_1, group_2)

print("t-statistic:", t_stat)
print("p-value", p_value)

t-statistic: 0.011588929432828264
p-value 0.9911293054731878


In [None]:

# ANOVA (one-way ANOVA)
# Build one Score sample per treatment label that is actually present in
# `data`. Hard-coding the labels 'A', 'B' and 'C' selects empty groups
# whenever `data` carries other labels (for example '1', '2', '3', or a
# lowercase 'c'), and the test then reports nan for both the F-statistic and
# the p-value.
score_groups = [scores for _, scores in data.groupby('Treatment')['Score']]

f_stat, p_value = stats.f_oneway(*score_groups)

print("F-statistic:", f_stat)

print("p-value:", p_value)

F-statistic: nan
p-value: nan




In [None]:
import numpy as np
import statsmodels.api as sm

# Sample data: X is the independent variable, y the dependent variable.
X = np.arange(1, 6)
y = np.array([2, 4, 5, 4, 5])

# Add a constant column to X so the fit estimates an intercept as well.
X = sm.add_constant(X)

# Create the OLS model, fit it, and print the summary of the results.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.600
Model:                            OLS   Adj. R-squared:                  0.467
Method:                 Least Squares   F-statistic:                     4.500
Date:                Tue, 20 Feb 2024   Prob (F-statistic):              0.124
Time:                        10:03:38   Log-Likelihood:                -5.2598
No. Observations:                   5   AIC:                             14.52
Df Residuals:                       3   BIC:                             13.74
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          2.2000      0.938      2.345      0.1

  warn("omni_normtest is not valid with less than 8 observations; %i "


In [None]:
# Regress Score on Treatment.
# 'Treatment' holds string labels, and passing that column to OLS directly
# fails with "Pandas data cast to numpy dtype of object". One-hot encode the
# labels first. The first level is dropped so it acts as the baseline and the
# remaining dummy columns are not collinear with the intercept.
treatment_dummies = pd.get_dummies(
    data['Treatment'], prefix='Treatment', drop_first=True
).astype(float)

x = sm.add_constant(treatment_dummies)
model = sm.OLS(data['Score'], x)
results = model.fit()
print(results.summary())

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).