In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
import seaborn as sns  
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Never truncate wide DataFrames: show every column when a frame is displayed.
pd.options.display.max_columns = None

In [None]:
# Hard-coded hyperparameters to use when the hyperparameter-search cell is
# skipped. Running that cell overwrites `best_params` with its own result.
best_params = dict(
    bootstrap=False,
    max_depth=30,
    max_features=50,
    n_estimators=500,
    max_leaf_nodes=None,
    min_samples_leaf=5,
    ccp_alpha=0.0,
)

# Load the training table and preview its first rows as the cell output.
df = pd.read_csv('../data/train_final.csv')
df.head()

In [None]:
# Target: the 'price' column, cast to float (the cast is also stored back on df).
df['price'] = df['price'].astype(float)
y = df['price']

# Features: every column except the target.
X = df.drop(columns='price')

In [None]:
# Randomized hyperparameter search for the random forest.
#
# NOTE(review): the search is fit on all of X / y, i.e. including the rows that
# are later held out by train_test_split, so the hold-out RMSE is optimistic
# with respect to hyperparameter selection.
param_grid = {
    'n_estimators': [100, 200, 500, 1000],
    'max_depth': [20, 30, 40],
    'min_samples_leaf': [5, 15, 30],
    # 1.0 means "consider every feature at each split". That is what 'auto'
    # meant for regressors; the 'auto' string was deprecated in scikit-learn 1.1
    # and removed in 1.3, where every candidate using it fails to fit.
    'max_features': [1.0, 50, 63],
    'bootstrap': [True, False],
    'max_leaf_nodes': [None, 30, 50, 100],
    'ccp_alpha': [0.0, 0.1, 0.2]
}

# Seed the base estimator so each candidate's cross-validation score is repeatable.
rf = RandomForestRegressor(random_state=42)

# Sample 500 of the candidate combinations; the seed fixes which ones are drawn.
grid_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    cv=5,
    n_iter=500,
    verbose=2,
    n_jobs=-1,
    random_state=42,
)

grid_search.fit(X, y)

print("Best parameters found: ")
print(grid_search.best_params_)

# Keep both the refitted estimator and its parameters; `best_params` replaces
# the hard-coded fallback defined when the data was loaded.
best_rf = grid_search.best_estimator_
best_params = grid_search.best_params_

In [None]:
# train test split
# Hold out 15% of the rows for evaluation. The fixed seed keeps the split, and
# therefore the reported RMSE, identical across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [None]:
# Build the final forest from the selected hyperparameters, using all CPU cores.
# The fixed seed makes training (and the metrics computed from it) reproducible.
best_rf = RandomForestRegressor(**best_params, n_jobs=-1, random_state=42)

In [None]:
# Train the forest on the training portion only; the held-out rows stay unseen.
best_rf.fit(X=X_train, y=y_train)

In [None]:
# check the model performance in rmse on the validation set
# Predictions are rounded to the nearest integer and clipped to [0, 5] before
# scoring, so the RMSE is measured on the post-processed predictions.
y_pred = best_rf.predict(X_test)
y_pred = np.clip(np.round(y_pred).astype(int), 0, 5)

rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"RMSE: {rmse}")

conf_matrix = confusion_matrix(y_test, y_pred)
# confusion_matrix orders its rows/columns by the sorted union of the labels
# present in y_test and y_pred. Use that same list as tick labels; otherwise the
# heatmap shows positional indices, which are wrong whenever a class is absent.
class_labels = [int(label) for label in sorted(set(y_test) | set(y_pred))]

plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix (validation set)')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')

plt.show()

In [None]:
# check the train rmse to see if the model is overfitting
# Predictions get the same post-processing as on the validation set: round to
# the nearest integer, then clip to [0, 5].
y_train_pred = best_rf.predict(X_train)

y_train_pred = np.clip(np.round(y_train_pred).astype(int), 0, 5)
rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
print(f'Train RMSE: {rmse}')

conf_matrix = confusion_matrix(y_train, y_train_pred)
# confusion_matrix orders its rows/columns by the sorted union of the labels
# present in y_train and y_train_pred. Use that same list as tick labels;
# otherwise the heatmap shows positional indices, which are wrong whenever a
# class is absent.
class_labels = [int(label) for label in sorted(set(y_train) | set(y_train_pred))]

plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap='Blues',
            xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion matrix (training set)')
plt.xlabel('Predicted labels')
plt.ylabel('True labels')

plt.show()

In [None]:
# Rank features by the fitted forest's importance scores, highest first.
importances = best_rf.feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
# One line per feature: 1-based rank, column name, importance score.
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. {X.columns[feature_idx]} ({importances[feature_idx]})")

In [None]:
# Bar chart of the most important features (at most 50), in ranking order.
top_n = min(50, X.shape[1])
top_indices = indices[:top_n]
bar_positions = range(top_n)

fig, ax = plt.subplots(figsize=(25, 5))
ax.set_title(f"Top {top_n} Feature Importances")
ax.bar(bar_positions, importances[top_indices], align="center")
# Label each bar with its column name, rotated so long names stay readable.
ax.set_xticks(bar_positions)
ax.set_xticklabels(X.columns[top_indices], rotation=90)
ax.set_xlim(-1, top_n)
plt.show()

In [None]:
# make predictions
# The test CSV is read once. 'id' is kept on df_test for the submission file and
# dropped only from the copy that is passed to the model.
df_test = pd.read_csv('../data/test_final.csv')
test_features = df_test.drop(columns=['id'])

# Round to the nearest integer and clip to [0, 5].
y_pred = best_rf.predict(test_features)
y_pred_rounded = np.round(y_pred).astype(int)
y_test_pred_rounded = np.clip(y_pred_rounded, 0, 5)

# save predictions by id
# The clipped predictions are already integers, so 'price' needs no further cast.
df_test['price'] = y_test_pred_rounded
df_test['id'] = df_test['id'].astype(int)

# NOTE(review): the file name says "descision_tree" although the model is a
# RandomForestRegressor; it is left unchanged so existing paths keep working.
df_test[['id', 'price']].to_csv('../predictions/descision_tree_2.csv', index=False)