In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm

In [4]:
cars = pd.read_csv('cars.csv', delimiter = ',')

In [5]:
cars

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,volvo 145e (sw),gas,std,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,volvo 144ea,gas,turbo,four,sedan,rwd,front,109.1,...,141,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,volvo 244dl,gas,std,four,sedan,rwd,front,109.1,...,173,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,volvo 246,diesel,turbo,four,sedan,rwd,front,109.1,...,145,idi,3.01,3.40,23.0,106,4800,26,27,22470.0


In [6]:
cars['company'] = cars['CarName'].apply(lambda x : x.split(' ')[0])

In [7]:
cars.pop('CarName')


0            alfa-romero giulia
1           alfa-romero stelvio
2      alfa-romero Quadrifoglio
3                   audi 100 ls
4                    audi 100ls
                 ...           
200             volvo 145e (sw)
201                 volvo 144ea
202                 volvo 244dl
203                   volvo 246
204                 volvo 264gl
Name: CarName, Length: 205, dtype: object

In [8]:
cars.pop('car_ID')

0        1
1        2
2        3
3        4
4        5
      ... 
200    201
201    202
202    203
203    204
204    205
Name: car_ID, Length: 205, dtype: int64

In [9]:
cars.company.unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'maxda', 'mazda', 'buick', 'mercury',
       'mitsubishi', 'Nissan', 'nissan', 'peugeot', 'plymouth', 'porsche',
       'porcshce', 'renault', 'saab', 'subaru', 'toyota', 'toyouta',
       'vokswagen', 'volkswagen', 'vw', 'volvo'], dtype=object)

In [10]:
cars.company =  cars.company.str.lower()

In [11]:
cars.company.unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'maxda', 'mazda', 'buick', 'mercury',
       'mitsubishi', 'nissan', 'peugeot', 'plymouth', 'porsche',
       'porcshce', 'renault', 'saab', 'subaru', 'toyota', 'toyouta',
       'vokswagen', 'volkswagen', 'vw', 'volvo'], dtype=object)

In [40]:
#задаем функцию для исправления ошибок в названиях
def fix_company (to_rep, how_rep):
    cars.company.replace(to_rep, how_rep, inplace  = True ) 

In [13]:
fix_company('maxda', 'mazda')
fix_company('porcshce', 'porsche')
fix_company('toyouta', 'toyota')
fix_company('vokswagen','volkswagen')
fix_company('vw','volkswagen')
fix_company('alfa-romero','alfaromero')




In [15]:
cols_to_keep = ['company','fueltype','aspiration','carbody','drivewheel','wheelbase','carlength',
                'carwidth','curbweight','enginetype','cylindernumber',
                'enginesize','boreratio','horsepower','price']  


In [16]:
cols_to_keep

['company',
 'fueltype',
 'aspiration',
 'carbody',
 'drivewheel',
 'wheelbase',
 'carlength',
 'carwidth',
 'curbweight',
 'enginetype',
 'cylindernumber',
 'enginesize',
 'boreratio',
 'horsepower',
 'price']

In [17]:
df = cars[cols_to_keep]

In [18]:
df.dtypes

company            object
fueltype           object
aspiration         object
carbody            object
drivewheel         object
wheelbase         float64
carlength         float64
carwidth          float64
curbweight          int64
enginetype         object
cylindernumber     object
enginesize          int64
boreratio         float64
horsepower          int64
price             float64
dtype: object

In [19]:
df_dummy = pd.get_dummies(data = df[['company','fueltype','aspiration','carbody','drivewheel','enginetype','cylindernumber']],drop_first = True)

In [20]:
test = pd.concat([df.drop(['company','fueltype','aspiration','carbody','drivewheel','enginetype','cylindernumber'],axis = 'columns'),df_dummy], axis = 1)

In [41]:
test

Unnamed: 0,wheelbase,carlength,carwidth,curbweight,enginesize,boreratio,horsepower,price,company_audi,company_bmw,...,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two
0,88.6,168.8,64.1,2548,130,3.47,111,13495.0,False,False,...,False,False,False,False,False,True,False,False,False,False
1,88.6,168.8,64.1,2548,130,3.47,111,16500.0,False,False,...,False,False,False,False,False,True,False,False,False,False
2,94.5,171.2,65.5,2823,152,2.68,154,16500.0,False,False,...,False,False,True,False,False,False,True,False,False,False
3,99.8,176.6,66.2,2337,109,3.19,102,13950.0,True,False,...,True,False,False,False,False,True,False,False,False,False
4,99.4,176.6,66.4,2824,136,3.19,115,17450.0,True,False,...,True,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,109.1,188.8,68.9,2952,141,3.78,114,16845.0,False,False,...,True,False,False,False,False,True,False,False,False,False
201,109.1,188.8,68.8,3049,141,3.78,160,19045.0,False,False,...,True,False,False,False,False,True,False,False,False,False
202,109.1,188.8,68.9,3012,173,3.58,134,21485.0,False,False,...,False,False,True,False,False,False,True,False,False,False
203,109.1,188.8,68.9,3217,145,3.01,106,22470.0,False,False,...,True,False,False,False,False,False,True,False,False,False


In [42]:
test.corr()['price'].sort_values(ascending = False).round(2) 

price                    1.00
enginesize               0.87
curbweight               0.84
horsepower               0.81
carwidth                 0.76
carlength                0.68
drivewheel_rwd           0.64
wheelbase                0.58
boreratio                0.55
company_buick            0.52
cylindernumber_six       0.47
enginetype_ohcv          0.39
company_porsche          0.36
company_jaguar           0.33
company_bmw              0.32
cylindernumber_five      0.25
carbody_hardtop          0.23
cylindernumber_twelve    0.20
aspiration_turbo         0.18
enginetype_dohcv         0.16
company_volvo            0.14
carbody_sedan            0.13
company_audi             0.11
company_peugeot          0.07
company_saab             0.04
enginetype_l             0.04
company_mercury          0.03
enginetype_ohcf          0.02
cylindernumber_two      -0.00
enginetype_rotor        -0.00
carbody_wagon           -0.04
company_renault         -0.05
cylindernumber_three    -0.07
company_is

In [23]:
results  = smf.ols('price~horsepower',test).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     382.2
Date:                Sun, 21 Jul 2024   Prob (F-statistic):           1.48e-48
Time:                        15:37:52   Log-Likelihood:                -2024.0
No. Observations:                 205   AIC:                             4052.
Df Residuals:                     203   BIC:                             4059.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -3721.7615    929.849     -4.003      0.0

In [26]:
finall = test.astype(int) 


In [30]:
finall

Unnamed: 0,wheelbase,carlength,carwidth,curbweight,enginesize,boreratio,horsepower,price,company_audi,company_bmw,...,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two
0,88,168,64,2548,130,3,111,13495,0,0,...,0,0,0,0,0,1,0,0,0,0
1,88,168,64,2548,130,3,111,16500,0,0,...,0,0,0,0,0,1,0,0,0,0
2,94,171,65,2823,152,2,154,16500,0,0,...,0,0,1,0,0,0,1,0,0,0
3,99,176,66,2337,109,3,102,13950,1,0,...,1,0,0,0,0,1,0,0,0,0
4,99,176,66,2824,136,3,115,17450,1,0,...,1,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,109,188,68,2952,141,3,114,16845,0,0,...,1,0,0,0,0,1,0,0,0,0
201,109,188,68,3049,141,3,160,19045,0,0,...,1,0,0,0,0,1,0,0,0,0
202,109,188,68,3012,173,3,134,21485,0,0,...,0,0,1,0,0,0,1,0,0,0
203,109,188,68,3217,145,3,106,22470,0,0,...,1,0,0,0,0,0,1,0,0,0


In [28]:
x = test.drop(['price'], axis  = 'columns') # удаляем прайс так как не предиктор , а зависимая переменная
x = sm.add_constant(finall) # добавляем константу
y = test['price'] #  в переменную Y зависимую переменную

In [43]:
model_1 = sm.OLS(finall.price,x).fit()
print(model_1.summary())




                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.901
Method:                 Least Squares   F-statistic:                     72.46
Date:                Sun, 21 Jul 2024   Prob (F-statistic):           8.51e-81
Time:                        17:07:40   Log-Likelihood:                -1881.4
No. Observations:                 205   AIC:                             3817.
Df Residuals:                     178   BIC:                             3907.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.161e+

In [35]:
predicts_no_avto = finall[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
       'boreratio', 'horsepower', 'fueltype_gas',
       'aspiration_turbo', 'carbody_hardtop', 'carbody_hatchback',
       'carbody_sedan', 'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd',
       'enginetype_dohcv', 'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf',
       'enginetype_ohcv', 'enginetype_rotor', 'cylindernumber_five',
       'cylindernumber_four', 'cylindernumber_six', 'cylindernumber_three',
       'cylindernumber_twelve', 'cylindernumber_two']]

In [36]:
X2 = sm.add_constant(predicts_no_avto)  # со всеми, кроме марок машин


In [44]:
model_no_avto = sm.OLS(finall.price, X2).fit().summary()
print(model_no_avto)

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.914
Model:                            OLS   Adj. R-squared:                  0.901
Method:                 Least Squares   F-statistic:                     72.46
Date:                Sun, 21 Jul 2024   Prob (F-statistic):           8.51e-81
Time:                        17:08:04   Log-Likelihood:                -1881.4
No. Observations:                 205   AIC:                             3817.
Df Residuals:                     178   BIC:                             3907.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.161e+

ВЫВОД :

Большинство коэффициентов, связанных с марками машин, статистически незначимы
Если судить чисто по диагностическим показателям (вроде R2), то модель со всеми предикторами лучшая

Выбранная модель объясняет примерно 90% дисперсии (окр. до целого). Среди предикторов 15 из 27 оказались 
значимыми (p < 0.05). Пример интерпретации: при единичном изменении показателя horsepower, цена
возрастает на 83.3646
