In [3]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import requests
from urllib.parse import urlencode

In [4]:
# Чтение данных из облака
def download_link(public_key):
       base_url = 'https://cloud-api.yandex.net/v1/disk/public/resources/download?'
       final_url = base_url + urlencode(dict(public_key=public_key))
       response = requests.get(final_url)
       download_url = response.json()['href']
       return download_url

In [5]:
df = pd.read_csv(download_link('https://disk.yandex.ru/d/pV0bTSI8g2imOQ'))

Сколько пропущенных значений встретилось в датасете?

In [6]:
df.isna().sum()

car_ID              0
symboling           0
CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

Использовать полное название машины – не самый хороший вариант, поэтому создадим новый признак – марку автомобиля (company). Для этого используйте столбец CarName, разбейте значения ячеек по пробелу и запишите в колонку первый элемент. Например:

'audi 100 ls' → 'audi'
Сколько всего уникальных марок машины встречается в датасете? Столбец CarName с полным названием машины удалите из датасета, а также car_ID, они не пригодятся для дальнейшего анализа.

In [7]:
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [8]:
df['company'] = df.CarName.apply(lambda x: x.split(' ')[0])

In [9]:
df.company.nunique()

28

Теперь внимательнее посмотрите на уникальные значения company. Часть из них оказалась с ошибками!

'maxda' → 'mazda'
'Nissan' → 'nissan'
'porcshce' → 'porsche'
'toyouta' → 'toyota'
'vokswagen' & 'vw' → 'volkswagen'
Сколько уникальных производителей осталось в итоге?

In [10]:
df.company.unique()

array(['alfa-romero', 'audi', 'bmw', 'chevrolet', 'dodge', 'honda',
       'isuzu', 'jaguar', 'maxda', 'mazda', 'buick', 'mercury',
       'mitsubishi', 'Nissan', 'nissan', 'peugeot', 'plymouth', 'porsche',
       'porcshce', 'renault', 'saab', 'subaru', 'toyota', 'toyouta',
       'vokswagen', 'volkswagen', 'vw', 'volvo'], dtype=object)

Отлично! Чтобы не перегружать модель большим количеством предикторов, оставим только часть из них:

'company', 'fueltype', 'aspiration','carbody', 'drivewheel', 'wheelbase', 'carlength','carwidth', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio','horsepower'
также не забыв про то, что мы предсказываем – 'price'. 

После этого посчитайте корреляцию между price и другими переменными. Чему равна корреляция между price и horsepower? Ответ округлите до 2 знаков после точки.

In [11]:
df_to_analyze = df[['company', 'fueltype', 'aspiration', 'carbody', 'drivewheel', 'wheelbase', 'carlength', 
                    'carwidth', 'curbweight', 'enginetype', 'cylindernumber', 'enginesize', 'boreratio', 'horsepower', 'price']]

In [12]:
df_to_analyze.corr()

Unnamed: 0,wheelbase,carlength,carwidth,curbweight,enginesize,boreratio,horsepower,price
wheelbase,1.0,0.874587,0.795144,0.776386,0.569329,0.48875,0.353294,0.577816
carlength,0.874587,1.0,0.841118,0.877728,0.68336,0.606454,0.552623,0.68292
carwidth,0.795144,0.841118,1.0,0.867032,0.735433,0.55915,0.640732,0.759325
curbweight,0.776386,0.877728,0.867032,1.0,0.850594,0.64848,0.750739,0.835305
enginesize,0.569329,0.68336,0.735433,0.850594,1.0,0.583774,0.809769,0.874145
boreratio,0.48875,0.606454,0.55915,0.64848,0.583774,1.0,0.573677,0.553173
horsepower,0.353294,0.552623,0.640732,0.750739,0.809769,0.573677,1.0,0.808139
price,0.577816,0.68292,0.759325,0.835305,0.874145,0.553173,0.808139,1.0


In [13]:
df_to_analyze.head()

Unnamed: 0,company,fueltype,aspiration,carbody,drivewheel,wheelbase,carlength,carwidth,curbweight,enginetype,cylindernumber,enginesize,boreratio,horsepower,price
0,alfa-romero,gas,std,convertible,rwd,88.6,168.8,64.1,2548,dohc,four,130,3.47,111,13495.0
1,alfa-romero,gas,std,convertible,rwd,88.6,168.8,64.1,2548,dohc,four,130,3.47,111,16500.0
2,alfa-romero,gas,std,hatchback,rwd,94.5,171.2,65.5,2823,ohcv,six,152,2.68,154,16500.0
3,audi,gas,std,sedan,fwd,99.8,176.6,66.2,2337,ohc,four,109,3.19,102,13950.0
4,audi,gas,std,sedan,4wd,99.4,176.6,66.4,2824,ohc,five,136,3.19,115,17450.0


In [14]:
df_to_analyze.dtypes

company            object
fueltype           object
aspiration         object
carbody            object
drivewheel         object
wheelbase         float64
carlength         float64
carwidth          float64
curbweight          int64
enginetype         object
cylindernumber     object
enginesize          int64
boreratio         float64
horsepower          int64
price             float64
dtype: object

In [15]:
df_dummy = pd.get_dummies(data=df_to_analyze[['company', 'fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginetype', 'cylindernumber']],
                            drop_first = True)

In [16]:
df_dummy.head()

Unnamed: 0,company_alfa-romero,company_audi,company_bmw,company_buick,company_chevrolet,company_dodge,company_honda,company_isuzu,company_jaguar,company_maxda,...,enginetype_ohc,enginetype_ohcf,enginetype_ohcv,enginetype_rotor,cylindernumber_five,cylindernumber_four,cylindernumber_six,cylindernumber_three,cylindernumber_twelve,cylindernumber_two
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0
4,0,1,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,0,0


In [17]:
non_categorical = df_to_analyze[['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'horsepower', 'price']]

In [18]:
ready_to_feet_df = pd.concat([non_categorical, df_dummy], axis=1)

In [19]:
ready_to_feet_df.columns

Index(['wheelbase', 'carlength', 'carwidth', 'curbweight', 'enginesize',
       'boreratio', 'horsepower', 'price', 'company_alfa-romero',
       'company_audi', 'company_bmw', 'company_buick', 'company_chevrolet',
       'company_dodge', 'company_honda', 'company_isuzu', 'company_jaguar',
       'company_maxda', 'company_mazda', 'company_mercury',
       'company_mitsubishi', 'company_nissan', 'company_peugeot',
       'company_plymouth', 'company_porcshce', 'company_porsche',
       'company_renault', 'company_saab', 'company_subaru', 'company_toyota',
       'company_toyouta', 'company_vokswagen', 'company_volkswagen',
       'company_volvo', 'company_vw', 'fueltype_gas', 'aspiration_turbo',
       'carbody_hardtop', 'carbody_hatchback', 'carbody_sedan',
       'carbody_wagon', 'drivewheel_fwd', 'drivewheel_rwd', 'enginetype_dohcv',
       'enginetype_l', 'enginetype_ohc', 'enginetype_ohcf', 'enginetype_ohcv',
       'enginetype_rotor', 'cylindernumber_five', 'cylindernumber_four',


Сначала построим небольшую модель всего с одним предиктором цены (price) – horsepower.

Какой процент изменчивости объясняет полученная модель? (округлите до целого)

In [20]:
results = smf.ols('price ~ horsepower', ready_to_feet_df).fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.653
Model:                            OLS   Adj. R-squared:                  0.651
Method:                 Least Squares   F-statistic:                     382.2
Date:                Wed, 06 Jul 2022   Prob (F-statistic):           1.48e-48
Time:                        22:43:09   Log-Likelihood:                -2024.0
No. Observations:                 205   AIC:                             4052.
Df Residuals:                     203   BIC:                             4059.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept  -3721.7615    929.849     -4.003      0.0

Теперь – две модели:

модель со всеми предикторами
модель со всеми предикторами, кроме марок машин
Обратите внимание на изменения в R 
2
 , коэффициентах и их значимости. Какую модель лучше оставить? 

In [21]:
X = sm.add_constant(ready_to_feet_df.drop(columns='price'))
Y = ready_to_feet_df.price
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.960
Model:                            OLS   Adj. R-squared:                  0.947
Method:                 Least Squares   F-statistic:                     70.67
Date:                Wed, 06 Jul 2022   Prob (F-statistic):           3.98e-84
Time:                        22:43:09   Log-Likelihood:                -1801.8
No. Observations:                 205   AIC:                             3710.
Df Residuals:                     152   BIC:                             3886.
Df Model:                          52                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -3.729e+

  x = pd.concat(x[::order], 1)


In [22]:
X = sm.add_constant(ready_to_feet_df.drop(columns=['price', 'company_alfa-romero',
       'company_audi', 'company_bmw', 'company_buick', 'company_chevrolet',
       'company_dodge', 'company_honda', 'company_isuzu', 'company_jaguar',
       'company_maxda', 'company_mazda', 'company_mercury',
       'company_mitsubishi', 'company_nissan', 'company_peugeot',
       'company_plymouth', 'company_porcshce', 'company_porsche',
       'company_renault', 'company_saab', 'company_subaru', 'company_toyota',
       'company_toyouta', 'company_vokswagen', 'company_volkswagen',
       'company_volvo', 'company_vw',
       'carbody_hardtop']))
Y = ready_to_feet_df.price
model = sm.OLS(Y, X)
results = model.fit()
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.911
Model:                            OLS   Adj. R-squared:                  0.898
Method:                 Least Squares   F-statistic:                     72.93
Date:                Wed, 06 Jul 2022   Prob (F-statistic):           2.12e-80
Time:                        22:43:09   Log-Likelihood:                -1885.0
No. Observations:                 205   AIC:                             3822.
Df Residuals:                     179   BIC:                             3908.
Df Model:                          25                                         
Covariance Type:            nonrobust                                         
                            coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                 -1.506e+

  x = pd.concat(x[::order], 1)
