In [1]:
# Import helpful libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import cross_val_score, train_test_split
import optuna
import matplotlib.pyplot as plt


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read both CSV splits with the same (comma) delimiter.
# `train` is the frame that carries the SalePrice target used by later cells;
# `test` is the frame predictions are written for.
csv_options = {"sep": ","}
train = pd.read_csv("path\\train.csv", **csv_options)
test = pd.read_csv("path\\test.csv", **csv_options)


In [3]:
# Target vector: the SalePrice column of the training frame.
y = train.loc[:, "SalePrice"]

# Feature frame: everything except the target, keeping only the columns
# that contain no missing values at all (column-wise dropna).
X_df = train.drop(columns=['SalePrice']).dropna(axis="columns")



In [11]:
def func(trial):
    """Build, cross-validate and fit a random-forest pipeline for one trial.

    The trial supplies one boolean flag per column of the module-level
    ``X_df`` (feature selection) plus the forest hyperparameters
    ``n_estimators``, ``max_depth`` and ``max_features``.

    Parameters
    ----------
    trial : object exposing ``suggest_categorical`` and ``suggest_int``
        Source of the feature flags and model hyperparameters.

    Returns
    -------
    tuple
        ``(mean_score, overfitting_indicator, train_mae, selected_features, pipe)``:

        * ``mean_score`` - mean absolute error averaged over 5-fold
          cross-validation on the 80 % training split (lower is better).
        * ``overfitting_indicator`` - hold-out MAE minus training MAE of the
          pipeline fitted on the training split.
        * ``train_mae`` - MAE of that fitted pipeline on its own training split.
        * ``selected_features`` - list of column names used.
        * ``pipe`` - the pipeline fitted on the training split.
    """
    # Automatic feature selection: one True/False flag per column. Every flag
    # is sampled (no short-circuiting), so each trial sees the same parameters.
    selected_features = [
        column
        for i, column in enumerate(X_df.columns)
        if trial.suggest_categorical(f'feature_{i}', [True, False])
    ]

    # A trial that switches every feature off falls back to the full feature set.
    if not selected_features:
        selected_features = X_df.columns.tolist()

    X_selected = X_df[selected_features]

    # Fixed seed: every trial is evaluated on the same train / hold-out partition.
    train_X, test_X, train_y, test_y = train_test_split(
        X_selected, y, test_size=0.2, random_state=1
    )

    non_numeric_columns = train_X.select_dtypes(include=['object']).columns
    numeric_columns = train_X.select_dtypes(include=[np.number]).columns

    # Column transformer for preprocessing: one-hot encode object columns,
    # standardise numeric ones, pass anything else through untouched.
    column_trans = make_column_transformer(
        (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), non_numeric_columns),
        (StandardScaler(), numeric_columns),
        remainder="passthrough"
    )

    # Pipeline. The forest is seeded so that the score of a trial depends only
    # on its parameters; without a seed, re-running the best trial does not
    # reproduce the value the study recorded for it.
    pipe = Pipeline(steps=[
        ('preprocessor', column_trans),
        ('model', RandomForestRegressor(
            n_estimators=trial.suggest_int("n_estimators", 10, 25),
            max_depth=trial.suggest_int("max_depth", 2, 8),
            max_features=trial.suggest_categorical("max_features", choices=["sqrt", "log2"]),
            random_state=1
        ))
    ])

    # The scorer returns negated MAE (higher is better), so flip the sign back.
    scores = cross_val_score(pipe, train_X, train_y, cv=5, scoring='neg_mean_absolute_error')
    mean_score = -scores.mean()

    # Fit once on the whole training split to compare train vs. hold-out error.
    pipe.fit(train_X, train_y)

    train_pred = pipe.predict(train_X)
    train_mae = mean_absolute_error(train_y, train_pred)

    test_pred = pipe.predict(test_X)
    test_mae = mean_absolute_error(test_y, test_pred)

    # Positive values mean the model does worse on unseen data than on its training data.
    overfitting_indicator = test_mae - train_mae

    return mean_score, overfitting_indicator, train_mae, selected_features, pipe

# Study and optimize 
def objective(trial):
    """Return the value to minimise for a trial: the first element of func's result."""
    # func returns a 5-tuple; only its leading score is used as the objective,
    # the remaining diagnostics are discarded here.
    cv_mae, *_diagnostics = func(trial)
    return cv_mae

# Seed the sampler so the sequence of suggested parameters can be repeated
# when the cell is re-run (an unseeded sampler explores differently every time).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=1),
)
study.optimize(objective, n_trials=1000)


# The optimised value is the cross-validated MAE returned by `objective`,
# not an error measured on a test set, so label it accordingly.
print(f'Best value (CV MAE): {study.best_value}')
print(f'Best parameters: {study.best_params}')

# Additional evaluation of overfitting on the best trial: run func again with
# the best trial to obtain its diagnostics alongside the cross-validated score.
best_trial = study.best_trial
mean_score, overfitting_indicator, train_mae, selected_features, pipe = func(best_trial)

# func computes overfitting_indicator as (test MAE - train MAE) but does not
# return the hold-out test MAE itself, so recover it from the two values it does return.
test_mae = train_mae + overfitting_indicator

# mean_score is the 5-fold cross-validated MAE, not the hold-out test MAE.
print(f'Best trial CV MAE: {mean_score}')
print(f'Best trial train MAE: {train_mae}')
print(f'Best trial test MAE: {test_mae}')
print(f'Overfitting indicator (test MAE - train MAE): {overfitting_indicator}')
# Report the chosen column names rather than the full hyperparameter dict.
print(f'Selected features: {selected_features}')

# Study: parameters of the best trial (feature flags + forest hyperparameters)
best_params = study.best_params

# New selected features list: keep the columns whose flag is True in the best trial
selected_features = [
    column
    for i, column in enumerate(X_df.columns)
    if best_params.get(f'feature_{i}', False)
]

# Same fallback as in func: a trial with every flag False was scored using all
# columns, so the final model must use all columns too (an empty frame cannot be fitted).
if not selected_features:
    selected_features = X_df.columns.tolist()

# Subset of the feature frame used to train the final model
X_selected = X_df[selected_features]

# Column transformer for preprocessing: one-hot encode object columns,
# standardise numeric columns, pass anything else through unchanged.
categorical_columns = X_selected.select_dtypes(include=['object']).columns
numeric_columns = X_selected.select_dtypes(include=[np.number]).columns

column_trans = make_column_transformer(
    (OneHotEncoder(sparse_output=False, handle_unknown='ignore'), categorical_columns),
    (StandardScaler(), numeric_columns),
    remainder="passthrough"
)

# Creating the pipeline with the best parameters. The forest is seeded so that
# re-running this cell yields the same fitted model and the same predictions.
pipe = Pipeline(steps=[
    ('preprocessor', column_trans),
    ('model', RandomForestRegressor(
        n_estimators=best_params['n_estimators'],
        max_depth=best_params['max_depth'],
        max_features=best_params['max_features'],
        random_state=1
    ))
])

# Fit the pipeline on the entire training data (no hold-out split this time)
pipe.fit(X_selected, y)

# Make predictions on the test set. Pass exactly the columns the pipeline was
# fitted on instead of the whole frame, so the result does not depend on how the
# preprocessor treats extra columns; a missing column fails here with a clear KeyError.
# NOTE(review): columns were filtered for missing values on the training frame only;
# the test frame may still contain NaNs in these columns - confirm the model copes.
test_features = test[X_selected.columns.tolist()]
predictions = pipe.predict(test_features)


# One row per test record: its Id and the predicted SalePrice
submission = pd.DataFrame({"Id": test.Id, "SalePrice": predictions}).reset_index(drop=True)
submission_filename = 'submission.csv'
submission.to_csv(submission_filename, index=False)

[I 2024-07-21 17:51:02,057] A new study created in memory with name: no-name-d5e3b42b-a0ad-4b53-92c6-6d972411db41
[I 2024-07-21 17:51:02,261] Trial 0 finished with value: 28678.442717268248 and parameters: {'feature_0': False, 'feature_1': False, 'feature_2': True, 'feature_3': True, 'feature_4': True, 'feature_5': True, 'feature_6': True, 'feature_7': True, 'feature_8': False, 'feature_9': True, 'feature_10': True, 'feature_11': True, 'feature_12': False, 'feature_13': False, 'feature_14': True, 'feature_15': False, 'feature_16': True, 'feature_17': True, 'feature_18': True, 'feature_19': True, 'feature_20': True, 'feature_21': False, 'feature_22': False, 'feature_23': False, 'feature_24': False, 'feature_25': True, 'feature_26': True, 'feature_27': False, 'feature_28': True, 'feature_29': True, 'feature_30': False, 'feature_31': True, 'feature_32': True, 'feature_33': False, 'feature_34': False, 'feature_35': False, 'feature_36': True, 'feature_37': False, 'feature_38': False, 'featu

Best value (test MAE): 19378.52001523339
Best parameters: {'feature_0': False, 'feature_1': False, 'feature_2': True, 'feature_3': True, 'feature_4': True, 'feature_5': True, 'feature_6': False, 'feature_7': False, 'feature_8': False, 'feature_9': True, 'feature_10': False, 'feature_11': True, 'feature_12': False, 'feature_13': True, 'feature_14': True, 'feature_15': True, 'feature_16': False, 'feature_17': True, 'feature_18': True, 'feature_19': True, 'feature_20': False, 'feature_21': True, 'feature_22': False, 'feature_23': False, 'feature_24': False, 'feature_25': False, 'feature_26': True, 'feature_27': True, 'feature_28': False, 'feature_29': True, 'feature_30': True, 'feature_31': False, 'feature_32': False, 'feature_33': True, 'feature_34': True, 'feature_35': False, 'feature_36': True, 'feature_37': False, 'feature_38': False, 'feature_39': False, 'feature_40': False, 'feature_41': False, 'feature_42': False, 'feature_43': True, 'feature_44': True, 'feature_45': True, 'feature