In [17]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, cross_val_score, \
                                    train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import GridSearchCV

In [29]:
# Загрузка данных для регрессии. price - прогнозируемый атрибут

PATH_2_TRAIN = "../Датасеты/Машины/prepared/train.csv"
PATH_2_TEST  = "../Датасеты/Машины/prepared/test.csv"

data = pd.read_csv(PATH_2_TRAIN)
test_data = pd.read_csv(PATH_2_TEST)

data

Unnamed: 0,symboling,gas,turbo,doornumber,enginelocation,wheelbase,carlength,carwidth,carheight,curbweight,...,ohcv,dohcv,4bbl,spfi,spdi,2bbl,mpfi,1bbl,mfi,idi
0,-1,1,0,4,0,102.4,175.6,66.5,54.9,2326,...,0,0,0,0,0,0,1,0,0,0
1,0,1,0,2,0,103.5,193.8,67.9,53.7,3380,...,0,0,0,0,0,0,1,0,0,0
2,0,1,0,4,0,100.4,184.6,66.5,56.1,3296,...,1,0,0,0,0,0,1,0,0,0
3,1,1,0,2,0,93.1,159.1,64.2,54.1,1905,...,0,0,0,0,0,1,0,0,0,0
4,1,1,0,2,0,94.5,168.7,64.0,52.6,2169,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1850,3,1,0,2,0,91.3,170.7,67.9,49.7,3071,...,1,0,0,0,0,0,1,0,0,0
1851,0,1,0,4,0,100.4,181.7,66.5,55.1,3095,...,1,0,0,0,0,0,1,0,0,0
1852,1,1,0,2,0,93.7,150.0,64.0,52.6,1837,...,0,0,0,0,0,0,0,1,0,0
1853,0,1,0,4,0,120.9,208.1,71.7,56.7,3900,...,1,0,0,0,0,0,1,0,0,0


In [31]:
# Ещё раз разделим на тестовую и обучающую выборки в соотношении 3:7, а также отмасштабируем

columns = list(filter(lambda x: x != "price", data.columns))

X, X_test, y, y_test = train_test_split(pd.concat((data[columns], test_data[columns]), axis=0).to_numpy(dtype=np.float64),
                                        pd.concat((data["price"], test_data["price"]), axis=0).to_numpy(dtype=np.float64),
                                        test_size=0.3, random_state=17)

scalerX = StandardScaler()
scalerY = StandardScaler()
scalerX.fit_transform(X)
scalerY.fit_transform(y.reshape(-1, 1))

X = scalerX.transform(X)
y = scalerY.transform(y.reshape(-1, 1)).reshape(-1)
X_test = scalerX.transform(X_test)
y_test = scalerY.transform(y_test.reshape(-1, 1)).reshape(-1)

In [32]:
# Линейная регрессия

linreg = LinearRegression()
linreg.fit(X, y)

print("Train MSE = {:.1e}".format(mean_squared_error(y, linreg.predict(X))))
print("Test  MSE = {:.1e}".format(mean_squared_error(y_test, linreg.predict(X_test))))

Train MSE = 3.8e-02
Test  MSE = 4.0e-02


In [33]:
# Сортируем атрибуты по важности

table = pd.DataFrame({"coef": linreg.coef_, "coef_abs": np.abs(linreg.coef_)},
                     index=columns)

table.sort_values(by="coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
gas,-1.011794e+12,1.011794e+12
idi,-8.324087e+11,8.324087e+11
ohcf,4.620012e+11,4.620012e+11
toyota,4.287406e+11,4.287406e+11
fwd,4.284098e+11,4.284098e+11
...,...,...
stroke,-5.165768e-02,5.165768e-02
citympg,4.232788e-02,4.232788e-02
doornumber,2.746921e-02,2.746921e-02
cylindernumber,-2.648926e-02,2.648926e-02


In [34]:
# Лассо-регрессия (уже лучше)

lasso1 = Lasso(alpha=0.01, random_state=17)
lasso1.fit(X, y)

table = pd.DataFrame({"coef": lasso1.coef_, "coef_abs": np.abs(lasso1.coef_)},
                     index=columns)

table.sort_values(by="coef_abs", ascending=False)

Unnamed: 0,coef,coef_abs
enginesize,0.321830,0.321830
bmw,0.202270,0.202270
buick,0.201900,0.201900
carwidth,0.155323,0.155323
enginelocation,0.130148,0.130148
...,...,...
cylindernumber,0.000000,0.000000
carheight,-0.000000,0.000000
carlength,0.000000,0.000000
doornumber,0.000000,0.000000


In [35]:
# Настроим гиперпараметр alpha

params = {"alpha": np.logspace(-3, 3, 40)}
grid = GridSearchCV(Lasso(random_state=17), params, cv=5)
grid.fit(X, y)

print("best alpha = {:.1e}".format(grid.best_params_["alpha"]))
print("best cross validation score = {:.1e}".format(grid.best_score_))

best alpha = 1.0e-03
best cross validation score = 9.6e-01


In [36]:
# Метрики и лучшие атрибуты

lasso = Lasso(alpha=grid.best_params_["alpha"], random_state=17)
lasso.fit(X, y)

print("Train MSE = {:.1e}".format(mean_squared_error(y, lasso.predict(X))))
print("Test  MSE = {:.1e}".format(mean_squared_error(y_test, lasso.predict(X_test))))

table = pd.DataFrame({"coef": lasso.coef_, "coef_abs": np.abs(lasso.coef_)},
                     index=columns)

table.sort_values(by="coef_abs", ascending=False)

Train MSE = 3.8e-02
Test  MSE = 4.1e-02


Unnamed: 0,coef,coef_abs
enginesize,0.580424,0.580424
curbweight,0.242402,0.242402
bmw,0.196699,0.196699
wheelbase,0.190379,0.190379
peugeot,-0.150078,0.150078
...,...,...
dohc,-0.000000,0.000000
volkswagen,0.000000,0.000000
ohcf,0.000000,0.000000
ohc,-0.000000,0.000000


In [37]:
# Случайный лес

forest = RandomForestRegressor(random_state=17)
forest.fit(X, y)

print("Train MSE = {:.1e}".format(mean_squared_error(y, forest.predict(X))))
print("Test  MSE = {:.1e}".format(mean_squared_error(y_test, forest.predict(X_test))))
print("Mean squared error (cv): {:.1e}".format(np.mean(np.abs(cross_val_score(forest, X, y, scoring="neg_mean_squared_error")))))

Train MSE = 1.1e-03
Test  MSE = 1.6e-03
Mean squared error (cv): 1.3e-03


In [38]:
# Настраиваем гиперпараметры

params = {"max_depth": list(range(10, 25)), "max_features": list(range(6, len(columns)))}

grid = GridSearchCV(RandomForestRegressor(n_jobs=-1, random_state=17),
                    params,
                    scoring="neg_mean_squared_error",
                    n_jobs=-1,
                    cv=5,
                    verbose=True)

grid.fit(X, y)

Fitting 5 folds for each of 885 candidates, totalling 4425 fits


GridSearchCV(cv=5, estimator=RandomForestRegressor(n_jobs=-1, random_state=17),
             n_jobs=-1,
             param_grid={'max_depth': [10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
                                       20, 21, 22, 23, 24],
                         'max_features': [6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
                                          16, 17, 18, 19, 20, 21, 22, 23, 24,
                                          25, 26, 27, 28, 29, 30, 31, 32, 33,
                                          34, 35, ...]},
             scoring='neg_mean_squared_error', verbose=True)

In [40]:
# Метрики

print("Train MSE = {:.1e}".format(mean_squared_error(y, grid.best_estimator_.predict(X))))
print("Test  MSE = {:.1e}".format(mean_squared_error(y_test, grid.best_estimator_.predict(X_test))))
print("Mean squared error (cv): {:.1e}".format(np.mean(np.abs(cross_val_score(grid.best_estimator_, X, y, scoring="neg_mean_squared_error")))))

Train MSE = 1.1e-03
Test  MSE = 1.6e-03
Mean squared error (cv): 1.3e-03


In [43]:
# Самые важные признаки

table = pd.DataFrame({"importance": grid.best_estimator_.feature_importances_},
                      index=columns)

table.sort_values(by="importance", ascending=False)

Unnamed: 0,importance
enginesize,0.460754
curbweight,0.191899
cylindernumber,0.093633
horsepower,0.084030
highwaympg,0.047338
...,...
mfi,0.000007
1bbl,0.000007
jaguar,0.000005
dodge,0.000004
