In [1]:
import pandas as pd
from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [2]:
# Location of the feature workbook. Kept in one named constant so the path can
# be changed in a single place when the notebook is run on another machine.
DATA_PATH = 'C:/Users/ravip/Desktop/SEM2/bigD/project/ANOVA_Test_for_features.xlsx'

# read_excel already returns a DataFrame, so no extra pd.DataFrame(...) wrap
# is needed.
data = pd.read_excel(DATA_PATH)

# Quick sanity check: first rows, then column dtypes and non-null counts.
print(data.head())
data.info()

  driverid  Total_violations  Total_Miles_driven  Driver_Average_velocity  \
0       A1                 3              628507                  37.4500   
1      A10                 6              675377                  31.7625   
2     A100                 4              634338                  30.3625   
3      A11                 5              652452                  33.8375   
4      A12                 5              668241                  35.3000   

          Model  Risk_Factor  
0  Freightliner       4.7732  
1     Peterbilt       8.8839  
2     Peterbilt       6.3058  
3     Peterbilt       7.6634  
4   Caterpillar       7.4823  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   driverid                 100 non-null    object 
 1   Total_violations         100 non-null    int64  
 2   Total_Miles_driven       10

In [3]:
# Regress Risk_Factor on Total_violations. The predictor is numeric, so the
# formula fits a single slope; the F-statistic in the printed summary tests
# the overall significance of that regression.
violations_spec = ols(formula='Risk_Factor ~ Total_violations', data=data)
model_total_violations = violations_spec.fit()
print(model_total_violations.summary())


                            OLS Regression Results                            
Dep. Variable:            Risk_Factor   R-squared:                       0.997
Model:                            OLS   Adj. R-squared:                  0.997
Method:                 Least Squares   F-statistic:                 3.722e+04
Date:                Wed, 27 Dec 2023   Prob (F-statistic):          2.82e-128
Time:                        17:03:56   Log-Likelihood:                 30.720
No. Observations:                 100   AIC:                            -57.44
Df Residuals:                      98   BIC:                            -52.23
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.0268      0.040  

In [4]:

# Regress Risk_Factor on Total_Miles_driven (numeric predictor, single slope)
# and print the full statsmodels summary table.
miles_spec = ols(formula='Risk_Factor ~ Total_Miles_driven', data=data)
model_total_miles = miles_spec.fit()
print(model_total_miles.summary())


                            OLS Regression Results                            
Dep. Variable:            Risk_Factor   R-squared:                       0.006
Model:                            OLS   Adj. R-squared:                 -0.004
Method:                 Least Squares   F-statistic:                    0.6214
Date:                Wed, 27 Dec 2023   Prob (F-statistic):              0.432
Time:                        17:03:56   Log-Likelihood:                -266.08
No. Observations:                 100   AIC:                             536.2
Df Residuals:                      98   BIC:                             541.4
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept             19.4641     15

In [5]:

# Regress Risk_Factor on Driver_Average_velocity (numeric predictor, single
# slope) and print the full statsmodels summary table.
velocity_spec = ols(formula='Risk_Factor ~ Driver_Average_velocity', data=data)
model_avg_velocity = velocity_spec.fit()
print(model_avg_velocity.summary())


                            OLS Regression Results                            
Dep. Variable:            Risk_Factor   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                 -0.007
Method:                 Least Squares   F-statistic:                    0.3424
Date:                Wed, 27 Dec 2023   Prob (F-statistic):              0.560
Time:                        17:03:56   Log-Likelihood:                -266.22
No. Observations:                 100   AIC:                             536.4
Df Residuals:                      98   BIC:                             541.7
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                              coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                 

In [6]:

# Post-hoc pairwise comparison of mean Risk_Factor between Model groups
# (Tukey's HSD). The grouping variable is the Model column, not
# Total_violations, so the result is named after Model.
tukey_model = pairwise_tukeyhsd(endog=data['Risk_Factor'], groups=data['Model'])
# Backward-compatible alias for the earlier, misleading variable name.
tukey_total_violations = tukey_model
print(tukey_model.summary())

# One-way model of Risk_Factor on Model. Model is an object (string) column,
# so the formula interface treats it as categorical.
model_anova = ols('Risk_Factor ~ Model', data=data).fit()
print(model_anova.summary())

# ANOVA table for the fitted model: the omnibus F-test for the Model factor
# that the pairwise comparisons above are a follow-up to.
print(anova_lm(model_anova))


       Multiple Comparison of Means - Tukey HSD, FWER=0.05       
   group1       group2    meandiff p-adj   lower    upper  reject
-----------------------------------------------------------------
 Caterpillar        Crane   2.0061 0.9995  -6.6646 10.6769  False
 Caterpillar         Ford   0.2362    1.0  -3.4407   3.913  False
 Caterpillar Freightliner   1.0938 0.9999  -4.7227  6.9103  False
 Caterpillar         Hino   0.2964    1.0  -5.1108  5.7035  False
 Caterpillar     Kenworth   -0.792    1.0  -6.6084  5.0245  False
 Caterpillar     Navistar  -0.6592    1.0  -5.5063  4.1879  False
 Caterpillar      Oshkosh   3.4584 0.7824   -2.923  9.8399  False
 Caterpillar    Peterbilt   1.1592 0.9955  -2.6982  5.0166  False
 Caterpillar        Volvo  -0.4054    1.0   -4.883  4.0721  False
 Caterpillar Western Star  -0.4165    1.0  -7.6421  6.8091  False
       Crane         Ford    -1.77 0.9998 -10.4976  6.9576  False
       Crane Freightliner  -0.9123    1.0 -10.7349  8.9102  False
       Cra

In [7]:
# Print the column names, non-null counts and dtypes of the loaded frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 6 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   driverid                 100 non-null    object 
 1   Total_violations         100 non-null    int64  
 2   Total_Miles_driven       100 non-null    int64  
 3   Driver_Average_velocity  100 non-null    float64
 4   Model                    100 non-null    object 
 5   Risk_Factor              100 non-null    float64
dtypes: float64(2), int64(2), object(2)
memory usage: 4.8+ KB


In [8]:
# Preview the first rows of the loaded frame as the cell's output.
data.head()

Unnamed: 0,driverid,Total_violations,Total_Miles_driven,Driver_Average_velocity,Model,Risk_Factor
0,A1,3,628507,37.45,Freightliner,4.7732
1,A10,6,675377,31.7625,Peterbilt,8.8839
2,A100,4,634338,30.3625,Peterbilt,6.3058
3,A11,5,652452,33.8375,Peterbilt,7.6634
4,A12,5,668241,35.3,Caterpillar,7.4823


In [9]:
# One-hot encode the categorical Model column. drop_first=True omits the first
# level so the remaining dummies are not collinear with a regression intercept.
# dtype=int forces 0/1 integer columns: pandas >= 2.0 defaults to bool dummies,
# which, mixed with numeric columns, become an object array that statsmodels'
# OLS rejects. get_dummies already returns a DataFrame, so no re-wrap is needed.
df_encoded = pd.get_dummies(data, columns=['Model'], drop_first=True, dtype=int)

# Display the updated DataFrame
print(df_encoded.head())

  driverid  Total_violations  Total_Miles_driven  Driver_Average_velocity  \
0       A1                 3              628507                  37.4500   
1      A10                 6              675377                  31.7625   
2     A100                 4              634338                  30.3625   
3      A11                 5              652452                  33.8375   
4      A12                 5              668241                  35.3000   

   Risk_Factor  Model_Crane  Model_Ford  Model_Freightliner  Model_Hino  \
0       4.7732            0           0                   1           0   
1       8.8839            0           0                   0           0   
2       6.3058            0           0                   0           0   
3       7.6634            0           0                   0           0   
4       7.4823            0           0                   0           0   

   Model_Kenworth  Model_Navistar  Model_Oshkosh  Model_Peterbilt  \
0               0

In [None]:
import statsmodels.api as sm

In [None]:
# Design matrix: every column except the target and the driver identifier.
# Cast to float so the matrix is purely numeric even when the dummy columns
# are bool (the pandas >= 2.0 default); otherwise the mixed frame converts to
# an object array and sm.OLS raises a ValueError.
X = df_encoded.drop(columns=['Risk_Factor', 'driverid']).astype(float)
X = sm.add_constant(X)  # Add an intercept column to the model

y = df_encoded['Risk_Factor']  # Target variable

# Fit the multiple linear regression of Risk_Factor on all remaining features
model = sm.OLS(y, X).fit()

# Print the regression results
print(model.summary())

In [None]:
# Show the one-hot encoded frame as the cell's output.
df_encoded