In [2]:
import pandas as pd
import  numpy as np
import os
import pathlib
# модули для оброботки данных
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn import preprocessing
# модули для обучения модели
import lightgbm
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
# для подбора кибер параметров
from sklearn.model_selection import GridSearchCV

In [3]:
df = pd.read_csv(os.path.join('dataset', 'diamonds.csv'), index_col=0)
df.shape

(53940, 10)

In [4]:
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
2,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
3,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
4,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
5,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [5]:
# проверка пропущеных значений
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [6]:
# перевод категориальных значений в численные
cat_columns = [cname for cname in df.columns if df[cname].dtype == 'object']

encoder = preprocessing.LabelEncoder()

for col in cat_columns:
    df[col] = encoder.fit_transform(df[col])



In [7]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1,0.23,2,1,3,61.5,55.0,326,3.95,3.98,2.43
2,0.21,3,1,2,59.8,61.0,326,3.89,3.84,2.31
3,0.23,1,1,4,56.9,65.0,327,4.05,4.07,2.31
4,0.29,3,5,5,62.4,58.0,334,4.20,4.23,2.63
5,0.31,1,6,3,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53936,0.72,2,0,2,60.8,57.0,2757,5.75,5.76,3.50
53937,0.72,1,0,2,63.1,55.0,2757,5.69,5.75,3.61
53938,0.70,4,0,2,62.8,60.0,2757,5.66,5.68,3.56
53939,0.86,3,4,3,61.0,58.0,2757,6.15,6.12,3.74


In [8]:
# разделение датасета на x и y, выделение части для валидации
x = df.drop('price', axis=1)
y = df['price']
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2, random_state=1)

In [9]:
# Linear Regression
lr = LinearRegression()
scores = cross_val_score(lr, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('Linear Regression cross validation MAE: ', - np.mean(scores))

Linear Regression cross validation MAE:  864.6521477706286


In [10]:
# Decision Tree
dt = DecisionTreeRegressor()
scores = cross_val_score(dt, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('Linear Regression cross validation MAE: ', - np.mean(scores))

Linear Regression cross validation MAE:  370.221382821603


In [11]:
# Random Forest
rf = RandomForestRegressor(random_state=0)
scores = cross_val_score(rf, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('Linear Regression cross validation MAE: ', - np.mean(scores))

Linear Regression cross validation MAE:  277.6862470233346


In [12]:
# LightGBM
lgb = lightgbm.LGBMRegressor(random_state=0)
scores = cross_val_score(lgb, x_train, y_train, cv=5, scoring='neg_mean_absolute_error')
print('Linear Regression cross validation MAE: ', - np.mean(scores))

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1269
[LightGBM] [Info] Number of data points in the train set: 34521, number of used features: 9
[LightGBM] [Info] Start training from score 3950.741664
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1260
[LightGBM] [Info] Number of data points in the train set: 34521, number of used features: 9
[LightGBM] [Info] Start training from score 3919.524434
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002672 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Tota

In [13]:
# подбор параметров и выбор модели
def cv_params(model, param_grid):
    scoring = 'neg_mean_absolute_error'

    opt_params = GridSearchCV(
        estimator=model,            # модель
        param_grid=param_grid,      # параметры
        scoring=scoring,            # стратегиия валидации
        cv=5,                       # количество слоев крос валидации
        n_jobs=-1)                  # количество потоков для обучения, -1 = все

    opt_params.fit(x_train, y_train)
    params = opt_params.best_params_
    best_score = opt_params.best_score_

    print(f'Best score: {round(-best_score, 2)}')
    print(f'Best parameters: {params}\n')

    return params

In [14]:
# LightGBM
lgb_param_grid = {
    'max_depth': [4, 10, 15, -1],
    'num_leaves': [25, 35, 45],
    'n_estimators': [41, 100, 250, 500, 600]
}

lgb_clean = lightgbm.LGBMRegressor(random_state=1)
lgb_params = cv_params(lgb_clean, lgb_param_grid)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002468 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1281
[LightGBM] [Info] Number of data points in the train set: 43152, number of used features: 9
[LightGBM] [Info] Start training from score 3931.979236
Best score: 273.85
Best parameters: {'max_depth': 10, 'n_estimators': 600, 'num_leaves': 45}



In [16]:
# Random Forest
rf_param_grid = {
    'max_depth': [20, 25],
    'n_estimators': [500, 800]
}

rf_clean = RandomForestRegressor(random_state=1)
rf_params = cv_params(rf_clean, rf_param_grid)

In [19]:
# обучаем на лучших параметрах
lightgbm.LGBMRegressor(**lgb_params)
lgb.fit(x_train, y_train)

preds = lgb.predict(x_valid)

print(f'MAPE: {round(mean_absolute_percentage_error(y_valid, preds) * 100, 2)}%')
print(f'MAE: {round(mean_absolute_error(y_valid, preds), 2)}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003929 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1281
[LightGBM] [Info] Number of data points in the train set: 43152, number of used features: 9
[LightGBM] [Info] Start training from score 3931.979236
MAPE: 8.31%
MAE: 284.61


In [20]:
results = pd.DataFrame({'Model' : np.round(preds), 'Actual': y_valid})
results = results.reset_index().drop('index', axis=1)
results.head(15)

Unnamed: 0,Model,Actual
0,557.0,564
1,5661.0,5914
2,2585.0,2562
3,609.0,537
4,6162.0,5964
5,1334.0,984
6,5002.0,5247
7,670.0,611
8,11207.0,9645
9,1067.0,1162


In [21]:
lgb.fit(x, y)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004072 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1293
[LightGBM] [Info] Number of data points in the train set: 53940, number of used features: 9
[LightGBM] [Info] Start training from score 3932.799722
