In [None]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory (relative to the notebook) holding the pre-processed CSV splits
# that the next cell reads: train, test, final_train and predictive.
in_folder = '../data/processed/6_new/'

In [None]:
# --- File locations of the four pre-processed splits -------------------------
train_data_path = in_folder + 'train.csv'
test_data_path = in_folder + 'test.csv'
final_train_data_path = in_folder + 'final_train.csv'
predictive_data_path = in_folder + 'predictive.csv'

# --- Load every split (same read order as the paths are listed above) --------
train_df, test_df, final_train_df, predictive_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        train_data_path,
        test_data_path,
        final_train_data_path,
        predictive_data_path,
    )
)

# Integer values 0..5 (not referenced by the other cells visible in this notebook).
price_bins = list(range(6))

# --- Separate features from the 'price' target -------------------------------
X_train, y_train = train_df.drop(columns='price'), train_df['price']
X_test, y_test = test_df.drop(columns='price'), test_df['price']
X_final_train, y_final_train = (
    final_train_df.drop(columns='price'),
    final_train_df['price'],
)

# The predictive split has no 'price' column to drop; its 'id' column is kept
# aside so it can be written next to the predictions later.
X_predictive, id_predictive = predictive_df.drop(columns='id'), predictive_df['id']

# --- Standardised copies of the feature matrices ------------------------------
# Each scaler is fitted on its own training split only and then applied to the
# matching hold-out / predictive split.
# NOTE(review): the model cells visible below fit on the unscaled frames, so
# these *_scaled arrays appear unused here - confirm before removing.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

final_scaler = StandardScaler().fit(X_final_train)
X_final_train_scaled = final_scaler.transform(X_final_train)
X_predictive_scaled = final_scaler.transform(X_predictive)


def plot_confusion_matrix(cm, classes, title):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Parameters
    ----------
    cm : array-like
        Matrix of integer counts (cells are formatted with ``fmt='d'``).
    classes : sequence
        Tick labels used for both the x axis ("Predicted") and the
        y axis ("Actual").
    title : str
        Title placed above the heatmap.

    Returns
    -------
    None
        The figure is displayed via ``plt.show()``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=classes,
        yticklabels=classes,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    ax.set_title(title)
    plt.show()


def evaluate_model(model, X_train, y_train, X_test, y_test, model_name):
    """Print train/test RMSE of a fitted regressor and plot its test confusion matrix.

    The regressor's continuous outputs are rounded to the nearest integer and
    clipped to the range 0..5 before any metric is computed, so the RMSE values
    describe the discretised predictions, not the raw regression output.

    Parameters
    ----------
    model : fitted estimator
        Any object exposing ``predict(X)``.
    X_train, y_train : array-like
        Training features and targets (used for the train RMSE only).
    X_test, y_test : array-like
        Hold-out features and targets (test RMSE and confusion matrix).
        NOTE(review): assumes the targets are integer class labels in 0..5 -
        confirm against the data preparation step.
    model_name : str
        Name used in the printed header and in the plot title.

    Returns
    -------
    None
        Results are printed and plotted.
    """
    class_labels = [0, 1, 2, 3, 4, 5]

    def _predict_classes(X):
        # Round to the nearest class, then clamp to the valid label range.
        rounded = np.round(model.predict(X))
        clipped = np.clip(rounded, class_labels[0], class_labels[-1])
        # Integer dtype keeps predictions comparable with the integer labels
        # handed to confusion_matrix below.
        return clipped.astype(int)

    y_train_pred = _predict_classes(X_train)
    train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))

    y_test_pred = _predict_classes(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))

    print(f"--- {model_name} Performance ---")
    print(f"Train RMSE: {train_rmse:.4f}")
    print(f"Test RMSE: {test_rmse:.4f}")

    # Pass the label set explicitly: without it the matrix only contains the
    # classes that actually occur in y_test / y_test_pred, so a missing class
    # would shrink the matrix and misalign it with the six tick labels.
    cm = confusion_matrix(y_test, y_test_pred, labels=class_labels)
    plot_confusion_matrix(cm, classes=class_labels, title=f'Confusion Matrix: {model_name}')

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold CV score of a random forest on the training split.

    Samples one hyperparameter configuration from ``trial``, builds a
    ``RandomForestRegressor`` with it and scores it on the module-level
    ``X_train`` / ``y_train`` using ``neg_root_mean_squared_error``.

    Returns
    -------
    float
        Mean of the five fold scores (negated RMSE, so larger is better).
    """
    # Keep the suggest_* calls in this order: the sampler draws parameters in
    # the order they are requested.
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    max_depth = trial.suggest_int('max_depth', 3, 100)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 5)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    bootstrap = trial.suggest_categorical('bootstrap', [True, False])

    forest = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        bootstrap=bootstrap,
        random_state=314,
        n_jobs=-1,
    )

    fold_scores = cross_val_score(
        forest,
        X_train,
        y_train,
        cv=5,
        scoring='neg_root_mean_squared_error',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Run the hyperparameter search. The objective returns a negated RMSE, hence
# direction='maximize'. The sampler is seeded so that repeated runs propose the
# same sequence of trials; the search stops after 100 trials or 600 seconds,
# whichever comes first (so the number of completed trials can still vary
# with machine speed).
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=314),
)
study.optimize(objective, n_trials=100, timeout=600)

best_params = study.best_params

In [None]:
# Hard-coded hyperparameters. This assignment replaces whatever `best_params`
# held before (e.g. the result of the Optuna study cell above).
best_params = dict(
    n_estimators=350,
    max_depth=29,
    min_samples_split=5,
    min_samples_leaf=1,
    max_features='log2',
    bootstrap=False,
)

In [None]:
# Fit a forest with the selected hyperparameters on the training split, then
# report train/test RMSE and the test confusion matrix.
best_rf = RandomForestRegressor(**best_params, random_state=314).fit(X_train, y_train)
evaluate_model(
    model=best_rf,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_name='Random Forest',
)

In [None]:
# Refit with the same hyperparameters on the final training split.
final_rf = RandomForestRegressor(**best_params, random_state=314).fit(
    X_final_train, y_final_train
)

# Discretise the regression output the same way as in evaluation:
# round to the nearest integer, then clamp to 0..5.
final_predictions = np.clip(
    np.round(final_rf.predict(X_predictive)).astype(int),
    0,
    5,
)

# Write one row per predictive sample: its id and the predicted price class.
out_folder = '../data/processed/'
submission = pd.DataFrame({'id': id_predictive}).assign(price=final_predictions)
submission.to_csv(out_folder + 'random_forest.csv', index=False)