In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [2]:
filenames = ['audi.csv', 'bmw.csv', 'ford.csv', 'hyundi.csv', 'merc.csv', 'skoda.csv', 'toyota.csv', 'vw.csv']

# Easy

Берем данные по skoda

In [3]:
df = pd.read_csv('../data/skoda.csv')
numeric_columns = df.select_dtypes(include='number').columns.tolist()
df[numeric_columns].head(5)

Unnamed: 0,year,price,mileage,tax,mpg,engineSize
0,2017,10550,25250,150,54.3,1.4
1,2018,8200,1264,145,67.3,1.0
2,2019,15650,6825,145,67.3,2.0
3,2015,14000,28431,165,51.4,2.0
4,2019,18350,10912,150,40.9,1.5


In [4]:
features = numeric_columns.copy()
features.remove('price')

target = ['price']

In [5]:
x = df[features]
y = df[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [6]:
forest_model = RandomForestRegressor(n_jobs=-1)

forest_model.fit(x_train, y_train.values.ravel())
y_pred = forest_model.predict(x_test)
score = r2_score(y_pred, y_test)

In [7]:
print(f'r2_score: {score}')
print('feature_importances:')
for i in sorted(zip(features, forest_model.feature_importances_), key=lambda x: x[1]):
    print(f'\t{i}')

r2_score: 0.9207526210838993
feature_importances:
	('tax', 0.026485338832962918)
	('mileage', 0.11836862971518601)
	('engineSize', 0.15896484162467592)
	('year', 0.19420303064564057)
	('mpg', 0.5019781591815345)


# Medium

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
dfs = []

for filename in filenames:
    df = pd.read_csv(f'../data/{filename}')
    df['manufacturer'] = filename
    dfs.append(df)

df = pd.concat(dfs, ignore_index=True)

Посмотрим на уникальные значения столбцов

In [10]:
columns = df.columns.values
for col in columns:
    print(col, df[col].unique()[:10])

model [' A1' ' A6' ' A4' ' A3' ' Q3' ' Q5' ' A5' ' S4' ' Q2' ' A7']
year [2017 2016 2019 2015 2014 2018 2013 2020 2004 2009]
price [12500 16500 11000 16800 17300 13900 13250 11750 10200 12000]
transmission ['Manual' 'Automatic' 'Semi-Auto' 'Other']
mileage [15735 36203 29946 25952  1998 32260 76788 75185 46112 22451]
fuelType ['Petrol' 'Diesel' 'Hybrid' 'Other' 'Electric']
tax [150.  20.  30. 145. 125. 200.   0. 205. 160. 235.]
mpg [55.4 64.2 67.3 49.6 58.9 61.4 70.6 60.1 57.6 52.3]
engineSize [1.4 2.  1.  3.  1.6 1.8 1.5 4.  2.5 1.2]
manufacturer ['audi.csv' 'bmw.csv' 'ford.csv' 'hyundi.csv' 'merc.csv' 'skoda.csv'
 'toyota.csv' 'vw.csv']
tax(£) [ nan 145. 235.  30.  20. 160. 125. 150. 135. 200.]


In [11]:
transmission_mapping = {
    'Manual': 0,
    'Semi-Auto': 1,
    'Other': 2,
    'Automatic': 3,
}
fuel_mapping = {
    'Petrol': 1,
    'Diesel': 2,
    'Hybrid': 3,
    'Other': 4,
    'Electric': 5,
}

Преобразуем признаки

In [12]:
df['transmission'] = df['transmission'].map(transmission_mapping)

df['fuelType'] = df['fuelType'].map(fuel_mapping)

label_encoder = LabelEncoder()
df['model'] = label_encoder.fit_transform(df['model'])

Посмотрим есть ли пропуски

In [13]:
for col in df.columns.values:
    print(f'{col}: Na count: {df[col].isna().sum()}')

model: Na count: 0
year: Na count: 0
price: Na count: 0
transmission: Na count: 0
mileage: Na count: 0
fuelType: Na count: 0
tax: Na count: 4860
mpg: Na count: 0
engineSize: Na count: 0
manufacturer: Na count: 0
tax(£): Na count: 80695


Выбросим столбцы с пропусками

In [14]:
df = df.drop(['tax(£)'], axis=1)
df = df.drop(['tax'], axis=1)

In [15]:
df_copy = df.copy()  # использую потом

In [16]:
df.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,mpg,engineSize,manufacturer
0,9,2017,12500,0,15735,1,55.4,1.4,audi.csv
1,14,2016,16500,3,36203,2,64.2,2.0,audi.csv
2,9,2016,11000,0,29946,1,55.4,1.4,audi.csv
3,12,2017,16800,3,25952,2,67.3,2.0,audi.csv
4,11,2019,17300,0,1998,1,49.6,1.0,audi.csv


In [17]:
features = list(df.columns.values)
features.remove('price')
features.remove('manufacturer')

target = ['price']

In [18]:
x = df[features]
y = df[target]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [19]:
forest_model = RandomForestRegressor(n_jobs=-1)

forest_model.fit(x_train, y_train.values.ravel())
y_pred = forest_model.predict(x_test)
score = r2_score(y_pred, y_test)

In [20]:
print(f'r2_score: {score}')
print('feature_importances:')
for i in sorted(zip(features, forest_model.feature_importances_), key=lambda x: x[1]):
    print(i)

r2_score: 0.9546956711395893
feature_importances:
('fuelType', 0.004636401382401251)
('mileage', 0.05680040225470808)
('model', 0.07875061827515999)
('mpg', 0.11967835131328601)
('transmission', 0.1759502146182593)
('year', 0.2665789741478749)
('engineSize', 0.29760503800831045)


Попробуем перебрать параметры

In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'n_estimators': [5, 10, 20, 100],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'max_features': [1, 2, 3]
}

rf_regressor = RandomForestRegressor(n_jobs=-1)

grid_search = RandomizedSearchCV(estimator=rf_regressor, param_distributions=param_grid, cv=5, verbose=1, n_iter=50)
grid_search.fit(x, y.values.ravel())

print("Лучшие параметры:", grid_search.best_params_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
Лучшие параметры: {'n_estimators': 100, 'min_samples_split': 3, 'min_samples_leaf': 2, 'max_features': 3}


In [22]:
best_forest_model = RandomForestRegressor(n_jobs=-1, **grid_search.best_params_)

best_forest_model.fit(x_train, y_train)
y_pred = best_forest_model.predict(x_test)
score = r2_score(y_pred, y_test)


  return fit_method(estimator, *args, **kwargs)


Получилось примерно так же

In [23]:
print(f'r2_score: {score}')
print('feature_importances:')
for i in sorted(zip(features, best_forest_model.feature_importances_), key=lambda x: x[1]):
    print(f'\t{i}')

r2_score: 0.9527091429802192
feature_importances:
	('fuelType', 0.018898964389879622)
	('model', 0.07369831189705954)
	('mileage', 0.0931858059654367)
	('transmission', 0.15741924536575488)
	('mpg', 0.18238845066046838)
	('year', 0.22290696628569068)
	('engineSize', 0.2515022554357103)


Теперь попробуем построить дерево для каждой модели

In [24]:
models = df['model'].unique()
len(models)

173

In [25]:
models_rate = [[len(df[df['model'] == model]), model] for model in models]
models_rate.sort()

models_rate[:40]

[[1, 10],
 [1, 17],
 [1, 19],
 [1, 49],
 [1, 114],
 [1, 115],
 [1, 146],
 [1, 169],
 [1, 170],
 [1, 171],
 [1, 172],
 [2, 107],
 [2, 133],
 [2, 139],
 [3, 33],
 [3, 122],
 [3, 151],
 [3, 153],
 [4, 38],
 [4, 53],
 [4, 123],
 [4, 149],
 [6, 36],
 [6, 64],
 [7, 34],
 [7, 48],
 [7, 165],
 [8, 37],
 [8, 75],
 [8, 93],
 [8, 127],
 [11, 41],
 [12, 121],
 [12, 135],
 [15, 40],
 [15, 55],
 [15, 97],
 [16, 54],
 [16, 126],
 [17, 117]]

Выкинем редкие модели, которых меньше 15 штук в выборке.

In [26]:
rare_models = [model for rate, model in models_rate if rate < 15]
rare_models

[10,
 17,
 19,
 49,
 114,
 115,
 146,
 169,
 170,
 171,
 172,
 107,
 133,
 139,
 33,
 122,
 151,
 153,
 38,
 53,
 123,
 149,
 36,
 64,
 34,
 48,
 165,
 37,
 75,
 93,
 127,
 41,
 121,
 135]

In [27]:
len(rare_models)

34

In [28]:
df = df[~df['model'].isin(rare_models)]

In [29]:
models = [model for model in models if model not in rare_models]

In [30]:
features = list(df.columns.values)
features.remove('price')
features.remove('manufacturer')

target = ['price']
print('features:', features)

features: ['model', 'year', 'transmission', 'mileage', 'fuelType', 'mpg', 'engineSize']


In [31]:
from tqdm import tqdm

random_forests = dict()

y_pred_total = np.empty(0)
y_test_total = pd.DataFrame()

for car_model in tqdm(models):
    random_forests[car_model] = RandomForestRegressor(n_jobs=-1, **grid_search.best_params_)
    random_forest = random_forests[car_model]

    df_car_model = df[df['model'] == car_model]

    x = df_car_model[features]
    y = df_car_model[target]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

    random_forest.fit(x_train, y_train.values.ravel())
    y_pred = random_forest.predict(x_test)

    y_pred_total = np.concatenate([y_pred_total, y_pred])
    y_test_total = pd.concat([y_test_total, y_test], axis=0)

100%|██████████████████████████████████████████| 139/139 [00:09<00:00, 14.25it/s]


Оценим score общей модели

In [32]:
score = r2_score(y_pred_total, y_test_total)
score

0.9649776492163962

Получилось чуть лучше

### Не по теме, а просто ради интереса

Важность фич для разных производителей (среднее для 10 обученных лесов)

In [33]:
features = ['year', 'transmission', 'mileage', 'fuelType', 'mpg', 'engineSize']
target = ['price']

dataframes = []
for _ in tqdm(range(10)):
    df_features = pd.DataFrame(columns=features)
    for filename in filenames:

        df = df_copy[df_copy['manufacturer'] == filename]
        x = df[features]
        y = df[target]

        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
        forest_model = RandomForestRegressor(n_jobs=-1)

        forest_model.fit(x_train, y_train.values.ravel())

        new_row = []
        for i in forest_model.feature_importances_:
            new_row.append(i)
        df_features.loc[len(df_features)] = new_row
    df_features.index = filenames
    dataframes.append(df_features)


100%|████████████████████████████████████████████| 10/10 [00:16<00:00,  1.65s/it]


In [34]:
df_average = sum(dataframes) / len(dataframes)

In [35]:
df_average

Unnamed: 0,year,transmission,mileage,fuelType,mpg,engineSize
audi.csv,0.220269,0.030846,0.068238,0.014807,0.472944,0.192895
bmw.csv,0.45346,0.020957,0.088455,0.021685,0.147878,0.267564
ford.csv,0.492121,0.008912,0.0947,0.002133,0.11153,0.290604
hyundi.csv,0.347031,0.013826,0.101511,0.025426,0.173972,0.338234
merc.csv,0.132377,0.016625,0.291013,0.012021,0.229068,0.318896
skoda.csv,0.199493,0.0266,0.102133,0.018007,0.506935,0.146833
toyota.csv,0.269899,0.009682,0.049427,0.005095,0.069953,0.595944
vw.csv,0.175688,0.01011,0.070874,0.030655,0.599325,0.113348


- Year (Год производства): Этот параметр является самым важным для моделей Ford, BMW и Hyundai, но менее важным для Mercedes.
- Mileage (Пробег): Для mercedes является одним из самых важных параметров, для других же марок этот параметр менее важен.
- MPG (Расход): Для VW, Skoda, Audi этот параметр оказывается самым выжным. Для toyota этот параметр наименее важен
- Engine Size (Объем двигателя): Это самый важный параметр для toyota, наименее важный для VW, Skoda, Audi