In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import lightgbm as lgb
# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

In [None]:
# Columns removed from the training frame before modelling.
drop_cols = ['pet-friendly']

# Read the training set and drop the excluded columns in one chained step.
df = pd.read_csv('../data/train_location.csv').drop(columns=drop_cols)

# Fallback hyperparameters: used by the final-model cell when the
# grid-search cell further down is skipped.
best_params = {
    'max_depth': 40,
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'min_child_samples': 30,
    'subsample': 1.0,
    'colsample_bytree': 1.0,
    'reg_alpha': 0.5,
    'reg_lambda': 1.0,
}

df.head()

In [None]:
# Target: the 'price' column, stored as float in the frame itself.
df['price'] = df['price'].astype(float)
y = df['price']

# Features: every remaining column.
X = df.drop(columns='price')

In [None]:
# Hold out 15% of the rows for validation.
# The seed is fixed so the split (and therefore every reported RMSE) is
# identical on each run; 42 matches the seed used for the final model.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42
)

# Base estimator handed to the hyperparameter search.
# random_state mirrors the final model so that scores obtained during the
# search come from the same configuration that is refit afterwards;
# verbose=-1 silences LightGBM's per-fit logging.
lgb_reg = lgb.LGBMRegressor(objective='regression',
                            metric='rmse',
                            boosting_type='gbdt',
                            random_state=42,
                            verbose=-1)

In [None]:
# Hyperparameter grid explored exhaustively by GridSearchCV below.
# 8,748 combinations x 5 folds = 43,740 fits, so the search is slow.
param_dist = {
    'max_depth': [15, 25, 40],
    'learning_rate': [0.001, 0.01, 0.05],
    'n_estimators': [100, 250, 500, 1000],
    'min_child_samples': [20, 30, 40],
    'subsample': [0.6, 0.8, 1.0],
    # LightGBM only performs row subsampling when subsample_freq
    # (bagging_freq) is non-zero. With the default of 0 the three
    # 'subsample' values above would all train the same model, so the
    # frequency is pinned to 1 here. Being part of the grid, it also ends up
    # in best_params_ and is therefore passed on to the final model.
    'subsample_freq': [1],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [0.1, 0.5, 1.0],
    'reg_lambda': [0.1, 0.5, 1.0]
}

# Stratified folds keep the distribution of target values similar per fold.
# NOTE(review): StratifiedKFold rejects continuous targets; this assumes
# 'price' only takes a few discrete values (predictions are rounded and
# clipped to 0-5 later in the notebook) -- confirm against the data.
# The seed makes the shuffled fold assignment reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Scored by negative RMSE (higher is better); n_jobs=-1 uses all cores.
grid_search = GridSearchCV(
    estimator=lgb_reg,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    param_grid=param_dist,
    verbose=10,
    n_jobs=-1
)

In [None]:
# Runs the exhaustive grid search to tune the hyperparameters; expect a long runtime.
# Skip this cell when best_params is already known.

# fit() returns the fitted search object, so the winning parameter set can
# be read straight off the call.
best_params = grid_search.fit(X_train, y_train).best_params_
print(best_params)

In [None]:
# Final regressor: fixed settings plus whichever hyperparameters are
# currently held in best_params (from the search, or the hard-coded fallback).
final_model = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    boosting_type='gbdt',
    random_state=42,
    **best_params,
)

# Train on the training split only; the validation split stays held out.
final_model.fit(X_train, y_train)

In [None]:
# Validation error of the final model.

y_predicted = final_model.predict(X_val)

# Raw outputs are rounded to the nearest integer and clamped to [0, 5]
# before scoring.
y_pred_val = np.round(y_predicted).clip(0, 5)

rmse = np.sqrt(mean_squared_error(y_val, y_pred_val))
print(f'Validation RMSE: {rmse}')

# Rows of the matrix are true values, columns are predicted values.
ax = sns.heatmap(
    confusion_matrix(y_val, y_pred_val),
    annot=True,
    fmt="d",
    cmap='Blues',
)
ax.set_ylabel('True')
ax.set_xlabel('Predicted')

plt.show()

In [None]:
# checking train error for overfitting

y_train_predicted = final_model.predict(X_train)
# Same post-processing as the validation cell: round to the nearest integer
# and clamp to [0, 5].
y_pred_train = np.clip(np.round(y_train_predicted), 0, 5)
rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print(f'Train RMSE: {rmse}')

# confusion_matrix expects (y_true, y_pred): rows are the true values and
# columns the predictions, which is what the axis labels below state.
# The arguments were previously swapped, so the plot was transposed
# relative to its labels and to the validation heatmap.
sns.heatmap(confusion_matrix(y_train, y_pred_train), annot=True, fmt="d", cmap='Blues')
plt.ylabel('True')
plt.xlabel('Predicted')

plt.show()

In [None]:
# save predictions for kaggle competition

df_test = pd.read_csv('../data/test_location.csv')

# Build the test feature matrix with the same preprocessing as training:
# 'id' is only an identifier for the submission, and the columns listed in
# drop_cols were removed from the training frame before fitting, so they
# must not be fed to the model here either.
# errors='ignore' keeps this working when the test file does not contain
# those columns in the first place.
X_test = df_test.drop(columns='id').drop(columns=drop_cols, errors='ignore')

# Round to the nearest integer and clamp to [0, 5], as done for the
# validation and train predictions.
y_test = np.clip(np.round(final_model.predict(X_test)), 0, 5)

# The submission file stores both columns as integers.
df_test['price'] = y_test
df_test['price'] = df_test['price'].astype(int)

df_test['id'] = df_test['id'].astype(int)

df_test[['id', 'price']].to_csv('../predictions/final.csv', index=False)

In [None]:
# Bar chart of the most important features, highest importance first.

importances = final_model.feature_importances_
indices = np.argsort(importances)[::-1]  # descending order of importance

# Show at most 50 features (fewer if the feature matrix is narrower).
top_n = min(50, X.shape[1])
selected = indices[:top_n]
positions = range(top_n)

fig, ax = plt.subplots(figsize=(25, 5))
ax.set_title(f"Top {top_n} Feature Importances")
ax.bar(positions, importances[selected], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[selected], rotation=90)
ax.set_xlim([-1, top_n])
plt.show()

In [None]:
# Bar chart of the least important features, lowest importance first.
print("Least important features:")

indices = np.argsort(importances)  # ascending order of importance

# Show at most 50 features (fewer if the feature matrix is narrower).
top_n = min(50, X.shape[1])
selected = indices[:top_n]
positions = range(top_n)

fig, ax = plt.subplots(figsize=(25, 5))
ax.set_title(f"Least {top_n} Feature Importances")
ax.bar(positions, importances[selected], align="center")
ax.set_xticks(positions)
ax.set_xticklabels(X.columns[selected], rotation=90)
ax.set_xlim([-1, top_n])
plt.show()