In [1]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
from sklearn.decomposition import PCA
from catboost import CatBoostRegressor, Pool
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import lightgbm as lgb

warnings.simplefilter("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2

# Data Initialization

In [9]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
#delete ids
train_data = train_data.drop(columns=["Id"])
val_ids = test_data["Id"] #remember for submission
test_data = test_data.drop(columns=["Id"])
# delete nans
for feature in train_data.columns:
    percent = train_data[feature].isnull().sum() /  train_data.shape[0]
    if (percent > 0.7):
        train_data = train_data.drop(columns = feature)
        test_data = test_data.drop(columns = feature)
# need to convert data
for feature in train_data.columns[:-1]:
    if (train_data[feature].dtype == 'object'):
        train_data[feature] = LabelEncoder().fit_transform(train_data[feature])
        test_data[feature] = LabelEncoder().fit_transform(test_data[feature])
# split
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns='SalePrice').values, np.log1p(train_data['SalePrice'].values), test_size = 0.2, random_state = 98987)
X_train = SimpleImputer(strategy='most_frequent').fit_transform(X_train)
X_test = SimpleImputer(strategy='most_frequent').fit_transform(X_test)
# This time we will use Scaler with PCA
pca = PCA(n_components = 20)
X_train = pca.fit_transform(X_train, y_train)
X_test = pca.transform(X_test)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train, y_train)
X_test = scaler.transform(X_test)

# LGBMRegressor

In [5]:
# Гридила миллион лет и недогридила(2 часа где-то) -_-... делаем попроще параметры поиска
parameters = {
    'num_leaves': [i for i in range(2,128,6)],
    'learning_rate': [ 0.001, 0.002, 0.003, 0.004, 0.005, 0.01],
    'max_depth': [2, 3, 4, 5, 6, 7, 8],
    'n_estimators': [i for i in range(100,6000,100)],
    'max_bins':[i for i in range(6,518,64)],}
model = GridSearchCV(lgb.LGBMRegressor(), parameters)
model.fit(X_train, y_train)
print("Best parameters for LGBM is: {}".format(model.best_params_))

In [4]:
parameters = {
    'num_leaves': [2, 4, 96, 128],
    'learning_rate': [ 0.001, 0.005, 0.01],
    'max_depth': [2, 4, 6, 8],
    'n_estimators': [100, 1000, 5000],
    'max_bins':[128,256,512],}
lgb_model = GridSearchCV(lgb.LGBMRegressor(), parameters)
lgb_model.fit(X_train, y_train)
print("Best parameters for LGBM is: {}".format(lgb_model.best_params_))

Best parameters: {'learning_rate': 0.01, 'max_bins': 128, 'max_depth': 2, 'n_estimators': 5000, 'num_leaves': 4}


In [6]:
lgb_model = lgb.LGBMRegressor(**lgb_model.best_params_)
lgb_model.fit(X_train, y_train)
print('AbsError train:', metrics.mean_absolute_error(lgb_model.predict(X_train), y_train))
print('AbsError test:', metrics.mean_absolute_error(lgb_model.predict(X_test), y_test))

AbsError train: 0.07261147558047464
AbsError test: 0.1384169673519346


# XGBoostRegressor

In [3]:
# Опять считаем 2 часа и результата нет
parameters = {
    "learning_rate": (0.1, 0.2, 0.3),
    "max_depth": [ 2, 3, 4, 5, 6],
    "min_child_weight": [1, 1.5, 2, 4],
    "n_estimators":[1000, 5000, 10000, 15000],
    "colsample_bytree":[ 0.2, 0.4, 0.9, 1.]} # reg_lambda??
xgb_model = GridSearchCV(xgb.XGBRegressor(), parameters)
xgb_model.fit(X_train, y_train)
print("Best parameters for XGB is: {}".format(xgb_model.best_params_))

In [3]:
# Уменьшаем кол-во параметров
# Попробать с другими значениями "n_estimators"
# добавить l1 и l2 посмотреть
parameters = {
    "learning_rate": [0.1, 0.2, 0.3],
    "max_depth": [2, 3, 4, 5, 6],
    "min_child_weight": [1, 1.5, 2, 4],
    "n_estimators":[14400],
    "colsample_bytree":[0.9]} # reg_lambda??
xgb_model = GridSearchCV(xgb.XGBRegressor(), parameters)
xgb_model.fit(X_train, y_train)
print("Best parameters for XGB is: {}".format(xgb_model.best_params_))

Best parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.2, 'max_depth': 4, 'min_child_weight': 1, 'n_estimators': 14400}


In [4]:
xgb_model = xgb.XGBRegressor(**xgb_model.best_params_)
xgb_model.fit(X_train, y_train)
print('AbsError train:', metrics.mean_absolute_error(xgb_model.predict(X_train), y_train)) # сильный оверфитинг
print('AbsError test:', metrics.mean_absolute_error(xgb_model.predict(X_test), y_test))

AbsError train: 0.0009239077080119824
AbsError test: 0.13990238390245435


# CatBoostRegressor
При использовании CatBoostRegressor нет смысла использовать GridSearch 
поскольку bpdtcnys параметры, которые подходят для нашей задачи

## Data Reinitialize

In [13]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
#delete ids
train_data = train_data.drop(columns=["Id"])
val_ids = test_data["Id"] #remember for submission
test_data = test_data.drop(columns=["Id"])
# delete nans
for feature in train_data.columns:
    percent = train_data[feature].isnull().sum() /  train_data.shape[0]
    if (percent > 0.7):
        train_data = train_data.drop(columns = feature)
        test_data = test_data.drop(columns = feature)
# fill nans(Special for cats)
train_data = train_data.fillna(train_data.median())
test_data = test_data.fillna(test_data.median())
# get categorical features list
cat_features = []
iter = 0
for feature in train_data.columns[:-1]:
    if (train_data[feature].dtype == 'object'): cat_features.append(iter)
    iter +=1
# split
X_train, X_test, y_train, y_test = train_test_split(train_data.drop(columns='SalePrice').values, np.log1p(train_data['SalePrice'].values), test_size = 0.2, random_state = 98987)
X_train = SimpleImputer(strategy='most_frequent').fit_transform(X_train)
X_test = SimpleImputer(strategy='most_frequent').fit_transform(X_test)

In [14]:
# Сделать такую же модель
# навешать регуляризацию
cat_model = CatBoostRegressor(cat_features=cat_features, depth = 2, loss_function = 'RMSE', iterations = 100000, task_type = "GPU",  devices = '0:1')
train_pool = Pool(X_train, y_train, cat_features=cat_features)
dev_pool = Pool(X_test, y_test, cat_features=cat_features)
cat_model.fit(train_pool, eval_set = dev_pool, early_stopping_rounds = 10, verbose = 0, plot = False)
print('AbsError train:', metrics.mean_absolute_error(cat_model.predict(X_train), y_train))
print('AbsError test:', metrics.mean_absolute_error(cat_model.predict(X_test), y_test))
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))


def evaluate(model, X, y):
    preds = model.predict(X)
    print("RMSLE: " + str(rmsle(preds, y)))
evaluate(cat_model, X_train, y_train)
evaluate(cat_model, X_test, y_test)

AbsError train: 0.08228153574973592
AbsError test: 0.10204139258425525
RMSLE: 0.11487674273192189
RMSLE: 0.150002504187904


# RandomTreeForest

In [10]:
parameters = {
    'criterion':['squared_error', 'absolute_error', 'poisson'], 
    'n_estimators':[10,50,75, 100],
    'max_features':['auto', 'sqrt', 'log2'],
    'min_samples_split':[2,5,9],
    'max_depth': [100,500,1000,1500],}
rfr_model = GridSearchCV(RandomForestRegressor(), parameters)
rfr_model.fit(X_train[:300], y_train[:300])
print("Best parameters for RFR is: {}".format(rfr_model.best_params_))

Best parameters: {'criterion': 'absolute_error', 'max_depth': 1500, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 75}


In [11]:
model = RandomForestRegressor(**rfr_model.best_params_)
model.fit(X_train, y_train)
print('AbsError train:', metrics.mean_absolute_error(rfr_model.predict(X_train), y_train))
print('AbsError test:', metrics.mean_absolute_error(rfr_model.predict(X_test), y_test))

AbsError train: 0.12137994057750988
AbsError test: 0.16811222302776457
