# Обработка признаков

В этом домашнем задании вы будете решать задачу предсказания стоимости автомобилей по их различным характеристикам.

In [1]:
import pandas as pd
import numpy as np


RANDOM_STATE = 42

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/evgpat/edu_stepik_practical_ml/main/datasets/cars_prices.csv", decimal='.')

In [3]:
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


### Описание некоторых признаков

`symboling` - rating corresponds to the degree to which the auto is more risky than its price indicates (+3 more risk and -3 is pretty safe)  
`make` - car types (i.e. car brand)  
`fuel-type` - types of fuel (gas or diesel)  
`aspiration` - engine aspiration (standard or turbo)  
`num-of-doors` - numbers of doors (two or four)  
`body-style` - car body style (sedan or hachback)  
`drive-wheels` - which types of drive wheel (forward-fwd, reversed-rwd)  
`engine-location` - engine mounted location (front or back)  
`wheel-base` - расстояние между осями передних и задних колес  
`length` - car lenght  
`weight` - car weight  
`width` - car width  
`height` - car height  

In [4]:
df.shape

(205, 26)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

## Заполнение пропусков

Пропуски в этом датасете обозначены как `?`

In [6]:
for c in df.columns:
    print(c, len(df[df[c] == '?']))

symboling 0
normalized-losses 41
make 0
fuel-type 0
aspiration 0
num-of-doors 2
body-style 0
drive-wheels 0
engine-location 0
wheel-base 0
length 0
width 0
height 0
curb-weight 0
engine-type 0
num-of-cylinders 0
engine-size 0
fuel-system 0
bore 4
stroke 4
compression-ratio 0
horsepower 2
peak-rpm 2
city-mpg 0
highway-mpg 0
price 4


Удалите строки, для которых неизвестно значение price, так как это целевая переменная.

## Вопрос для Quiz

Сколько строк осталось в данных?

In [7]:
# your code here
data = df[df['price'] != '?']

In [8]:
data.shape

(201, 26)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  201 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       201 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non-null

In [10]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [11]:
#df = df.replace('?', pd.NA)
data = data.replace('?', pd.NA)

In [12]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [13]:
round(data['peak-rpm'].dropna().astype('int').mean())

5118

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 201 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          201 non-null    int64  
 1   normalized-losses  164 non-null    object 
 2   make               201 non-null    object 
 3   fuel-type          201 non-null    object 
 4   aspiration         201 non-null    object 
 5   num-of-doors       199 non-null    object 
 6   body-style         201 non-null    object 
 7   drive-wheels       201 non-null    object 
 8   engine-location    201 non-null    object 
 9   wheel-base         201 non-null    float64
 10  length             201 non-null    float64
 11  width              201 non-null    float64
 12  height             201 non-null    float64
 13  curb-weight        201 non-null    int64  
 14  engine-type        201 non-null    object 
 15  num-of-cylinders   201 non-null    object 
 16  engine-size        201 non-null

Заполните средним значением пропуски в столбцах для числовых признаков и самым популярным значением для категориальных признаков
* `num-of-doors`
* `bore`
* `stroke`
* `horsepower`
* `peak-rpm`

In [15]:
col_to_impute = ['num-of-doors', 'bore', 'stroke', 'horsepower', 'peak-rpm' ]

for col in col_to_impute:
  val_to_impute = data[col].value_counts().index[0]
  data[col] = data[col].fillna(val_to_impute)

In [16]:
data.isna().any()

Unnamed: 0,0
symboling,False
normalized-losses,True
make,False
fuel-type,False
aspiration,False
num-of-doors,False
body-style,False
drive-wheels,False
engine-location,False
wheel-base,False


In [17]:
#попробуем импутер для категориальных данных num-of-doors, bore, stroke, horsepower, peak-rpm

# using sklearn-pandas package CategoricalImputer
###import numpy as np
#from sklearn.impute import SimpleImputer
#from sklearn.preprocessing import Imputer

#imputer = SimpleImputer(strategy="mean")
#imputer.fit_transform(data)






In [18]:
# поскольку импутер еще не изобрели нам поможет лямбда

#data = data.apply(lambda x: x.fillna(x.value_counts().index[0]))

In [19]:
#data['normalized-losses'].dtype
#round(data['normalized-losses'].dropna().astype('int').mean())

In [20]:
#data['normalized-losses'].replace(0, 122, inplace=True)


In [21]:
data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [24]:
X.head()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
0,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27
1,3,88.6,168.8,64.1,48.8,2548,130,9.0,21,27
2,1,94.5,171.2,65.5,52.4,2823,152,9.0,19,26
3,2,99.8,176.6,66.2,54.3,2337,109,10.0,24,30
4,2,99.4,176.6,66.4,54.3,2824,136,8.0,18,22


## Вопрос для Quiz

Чему равно среднее значение `peak-rpm` до заполнения пропусков? Ответ округлите до целого числа.

In [None]:
# round(data['peak-rpm'].dropna().astype('int').mean())

Пропуски в столбце `normalized-losses` предскажите при помощи линейной регрессии по признакам
`symboling`, `wheel-base`, `length`, `width`, `height`, `curb-weight`, `engine-size`, `compression-ratio`, `city-mpg`, `highway-mpg` и заполните их предсказаниями

In [42]:
#symboling make fuel-type aspiration num-of-doors body-style drive-wheels engine-location wheel-base

df_norm = data.copy()
df_norm = df_norm[['symboling', 'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-size', 'compression-ratio', 'city-mpg', 'highway-mpg', 'normalized-losses']].copy()

test_df = df_norm[df_norm['normalized-losses'].isnull() == True]
train_df = df_norm[df_norm['normalized-losses'].isnull() == False]
y = train_df['normalized-losses']

train_df.drop(['normalized-losses'], axis=1, inplace=True)
test_df.drop(['normalized-losses'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df.drop(['normalized-losses'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df.drop(['normalized-losses'], axis=1, inplace=True)


In [43]:
train_df.head()

Unnamed: 0,symboling,wheel-base,length,width,height,curb-weight,engine-size,compression-ratio,city-mpg,highway-mpg
3,2,99.8,176.6,66.2,54.3,2337,109,10.0,24,30
4,2,99.4,176.6,66.4,54.3,2824,136,8.0,18,22
6,1,105.8,192.7,71.4,55.7,2844,136,8.5,19,25
8,1,105.8,192.7,71.4,55.9,3086,131,8.3,17,20
10,2,101.2,176.8,64.8,54.3,2395,108,8.8,23,29


In [44]:
y.head()

Unnamed: 0,normalized-losses
3,164
4,164
6,158
8,158
10,192


In [44]:
# Используем машинное обучение для предсказания пропущенных значений

In [45]:
from sklearn.linear_model import LinearRegression

lr = LinearRegression()

lr.fit(train_df, y)

pred = lr.predict(test_df)

In [46]:
# необходимо подключить индексы
updated_df2 = data.copy()

indices = test_df.index.to_numpy()

updated_df2.loc[indices, 'normalized-losses'] = pred

In [48]:
updated_df2.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,168.072493,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,168.072493,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,134.001799,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [None]:
# еще одно решение от человека мыслящего сразу в двух измерениях
from sklearn.linear_model import LinearRegression

X = df[df["normalized-losses"] != "?"][["symboling", "wheel-base", "length", "width", "height", "curb-weight", "engine-size", "compression-ratio", "city-mpg", "highway-mpg"]]
y = df[df["normalized-losses"] != "?"]["normalized-losses"]

model = LinearRegression()

model.fit(X, y)

df.loc[df["normalized-losses"] == "?", "normalized-losses"] = model.predict(df[df["normalized-losses"] == "?"][["symboling", "wheel-base", "length", "width", "height", "curb-weight", "engine-size", "compression-ratio", "city-mpg", "highway-mpg"]])

In [50]:
#

## Вопрос для Quiz

Чему равно предсказание линейной регрессии на первом пропущенном значении? Ответ округлите до целого числа.

In [62]:
updated_df2['normalized-losses'][0]

168.07249261651435

In [63]:
data = updated_df2.copy()

## 2. Кодирование категориальных признаков

1. Закодируйте бинарные признаки `fuel-type`, `aspiration`, `num-of-doors`, `engine-location` каждый отдельной колонкой, состоящей из 0 и 1.
Единицей кодируйте самую частую категорию.

In [64]:
# your code here
#col_to_impute = ['num-of-doors', 'bore', 'stroke', 'horsepower', 'peak-rpm' ]

#for col in col_to_impute:
#  val_to_impute = data[col].value_counts().index[0]
#  data[col] = data[col].fillna(val_to_impute)#
data['fuel-type'].value_counts()

Unnamed: 0_level_0,count
fuel-type,Unnamed: 1_level_1
gas,181
diesel,20


In [65]:
data['fuel-type'] = df['fuel-type'].apply(lambda x: 1 if x == 'gas' else 0)

In [66]:
data['aspiration'].value_counts()

Unnamed: 0_level_0,count
aspiration,Unnamed: 1_level_1
std,165
turbo,36


In [67]:
data['aspiration'] = df['aspiration'].apply(lambda x: 1 if x == 'std' else 0)

In [68]:
data['num-of-doors'].value_counts()

Unnamed: 0_level_0,count
num-of-doors,Unnamed: 1_level_1
four,115
two,86


In [69]:
data['num-of-doors'] = df['num-of-doors'].apply(lambda x: 1 if x == 'four' else 0)

In [70]:
data['engine-location'].value_counts()

Unnamed: 0_level_0,count
engine-location,Unnamed: 1_level_1
front,198
rear,3


In [71]:
data['engine-location'] = df['engine-location'].apply(lambda x: 1 if x == 'front' else 0)

2. Вынесите в переменную `y` целевую переменную `price`, а все остальные колонки - в матрицу `X`.

Закодируйте признаки `make`, `body-style`, `engine-type`, `fuel-system` при помощи LeaveOneOutEncoder.

**Дальше все время работайте с объектами `X`, `y`.**

In [152]:
X = data.drop(['price'], axis=1)
y = data['price']

In [60]:
!pip install category_encoders -q

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/81.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [153]:
from category_encoders.leave_one_out import LeaveOneOutEncoder

# your code here
loo_enc = LeaveOneOutEncoder()

X['make'] = loo_enc.fit_transform(X['make'], y)
X['body-style'] = loo_enc.fit_transform(X['body-style'], y)
X['engine-type'] = loo_enc.fit_transform(X['engine-type'], y)
X['fuel-system'] = loo_enc.fit_transform(X['fuel-system'], y)

## Вопрос для Quiz

Чему равно среднее значение в столбце `body-style` после кодирования? Ответ округлите до целого числа.

In [154]:

## Use the `apply()` function to convert the column to floats
#df['values'] = df['values'].apply(lambda x: float(''.join(filter(str.isdigit, x))) if not x.isnumeric() else float(x))

X['body-style'].mean()


13207.129353233831

3. Закодируйте признак `drive-wheels` при помощи OHE из библиотеки category_encoders.

In [155]:
from category_encoders.one_hot import OneHotEncoder

ohe_enc = OneHotEncoder(cols=['drive-wheels'])

X = ohe_enc.fit_transform(X, y)

In [157]:
X.drop('drive-wheels_1', axis=1, inplace=True)

In [158]:

X.shape

(201, 26)

In [159]:
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels_2,drive-wheels_3,engine-location,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,168.072493,16500.0,1,1,0,23569.6,0,0,1,...,four,130,17650.307692,3.47,2.68,9.0,111,5000,21,27
1,3,168.072493,14997.5,1,1,0,22968.6,0,0,1,...,four,130,17617.285714,3.47,2.68,9.0,111,5000,21,27
2,1,134.001799,14997.5,1,1,0,9859.791045,0,0,1,...,six,152,17617.285714,2.68,3.47,9.0,154,5000,19,26
3,2,164.0,18641.0,1,1,1,14465.236559,1,0,1,...,four,109,17645.307692,3.19,3.4,10.0,102,5500,24,30
4,2,164.0,17941.0,1,1,1,14427.602151,0,1,1,...,five,136,17606.846154,3.19,3.4,8.0,115,5500,18,22


4. В столбце `num-of-cylinders` категории упорядочены по смыслу. Закодируйте их подряд идущими числами, начиная с 1, согласно смыслу.

Подряд идущими числами означает - 1, 2, 3 и так далее без пропусков.

In [160]:
#from category_encoders.ordinal import OrdinalEncoder
# your code here
#ord_enc = OrdinalEncoder()
#ord_enc.fit_transform(X['num-of-cylinders'])


In [161]:
num_of_cylinders_mapping = {
    'four': 4,
    'six': 6,
    'five': 5,
    'eight': 8,
    'two': 2,
    'twelve': 12,
    'three': 3
}
X['num-of-cylinders'] = X['num-of-cylinders'].map(num_of_cylinders_mapping)

## Вопрос для Quiz

Сколько столбцов получилось в матрице `X`?

In [162]:
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels_2,drive-wheels_3,engine-location,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
0,3,168.072493,16500.0,1,1,0,23569.6,0,0,1,...,4,130,17650.307692,3.47,2.68,9.0,111,5000,21,27
1,3,168.072493,14997.5,1,1,0,22968.6,0,0,1,...,4,130,17617.285714,3.47,2.68,9.0,111,5000,21,27
2,1,134.001799,14997.5,1,1,0,9859.791045,0,0,1,...,6,152,17617.285714,2.68,3.47,9.0,154,5000,19,26
3,2,164.0,18641.0,1,1,1,14465.236559,1,0,1,...,4,109,17645.307692,3.19,3.4,10.0,102,5500,24,30
4,2,164.0,17941.0,1,1,1,14427.602151,0,1,1,...,5,136,17606.846154,3.19,3.4,8.0,115,5500,18,22


In [163]:
X.shape

(201, 26)

In [164]:
X['normalized-losses'] = X['normalized-losses'].astype(float)
X['bore'] = X['bore'].astype(float)
X['stroke'] = X['stroke'].astype(float)
X['horsepower'] = X['horsepower'].astype(float)
X['peak-rpm'] = X['peak-rpm'].astype(float)

y = y.astype(float)

Разбейте данные на тренировочную и тестовую часть в пропорции 3 к 1, зафиксируйте random_state = 42.

In [165]:
from sklearn.model_selection import train_test_split

# your code here
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

Масштабируйте данные при помощи MinMaxScaler.

Обучайте масштабирование на тренировочных данных, а потом примените и к трейну, и к тесту.

In [166]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()

X_train_scaled = pd.DataFrame(mms.fit_transform(X_train, y_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(mms.transform(X_test), columns=X_test.columns)

Обучите на тренировочных данных линейную регрессию, сделайте предсказание на тесте и вычислите значение $R^2$ на тестовых данных.

In [167]:
# your code here
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)

pred = lr.predict(X_test_scaled)

## Вопрос для Quiz

Чему равно значение $R^2$ на тестовых данных? Ответ округлите до сотых.

In [168]:
from sklearn.metrics import r2_score
print('r2:',r2_score(y_test, pred))

r2: 0.9059035124016477
