# Model Selection 


In [2]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Load the cleaned training data 
training_data = pd.read_csv("cleaned_training_data.csv")

# Split into training and validation sets
X = training_data.drop(columns=['outcome'])
y = training_data['outcome']

print(training_data.columns)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

Index(['outcome', 'carat', 'depth', 'table', 'a1', 'a2', 'a3', 'a4', 'a5',
       'b1', 'b2', 'b3', 'b4', 'b5', 'a6', 'a7', 'a8', 'a9', 'a10', 'b6', 'b7',
       'b8', 'b9', 'b10', 'color_E', 'color_F', 'color_G', 'color_H',
       'color_I', 'color_J', 'cut_Good', 'cut_Ideal', 'cut_Premium',
       'cut_Very Good', 'clarity_IF', 'clarity_SI1', 'clarity_SI2',
       'clarity_VS1', 'clarity_VS2', 'clarity_VVS1', 'clarity_VVS2'],
      dtype='object')


In [59]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score

# Define models
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42),
    'SVR': SVR()
}

# Train and evaluate models
results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    r2 = r2_score(y_val, y_pred)
    results[name] = {'R2': r2}

In [60]:
# Convert results to DataFrame
results_df = pd.DataFrame(results).T
results_df = results_df.sort_values(by='R2', ascending=False)
print(results_df)

                         R2
Gradient Boosting  0.473559
Random Forest      0.456636
XGBoost            0.404290
Linear Regression  0.316431
Ridge              0.316406
SVR                0.314418
Lasso              0.281225
ElasticNet         0.254423
Decision Tree     -0.113727


In [61]:
from sklearn.model_selection import cross_val_score

for name, model in models.items():
    scores = cross_val_score(model, X, y, scoring='r2', cv=5)
    print(f"{name} R2 Score: {scores.mean():.4f} ± {scores.std():.4f}")

Linear Regression R2 Score: 0.3191 ± 0.0172
Ridge R2 Score: 0.3191 ± 0.0172
Lasso R2 Score: 0.2869 ± 0.0111
ElasticNet R2 Score: 0.2594 ± 0.0085
Decision Tree R2 Score: -0.1102 ± 0.0380
Random Forest R2 Score: 0.4480 ± 0.0105
Gradient Boosting R2 Score: 0.4714 ± 0.0123
XGBoost R2 Score: 0.3860 ± 0.0227
SVR R2 Score: 0.3207 ± 0.0160


## Stacking


In [None]:
# Import necessary libraries
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import r2_score

# Load the cleaned training data
training_data = pd.read_csv("cleaned_training_data6.csv")

# Split into features (X) and target (y)
X = training_data.drop(columns=['outcome'])
y = training_data['outcome']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the stacked model
estimators = [
    ('rf', RandomForestRegressor(random_state=42)),
    ('gb', GradientBoostingRegressor(random_state=42)),
    ('xgb', XGBRegressor(random_state=42))
]
stack = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Train the stacked model on the training set
stack.fit(X_train, y_train)

# Evaluate on the validation set
y_val_pred = stack.predict(X_val)
val_r2 = r2_score(y_val, y_val_pred)
print(f"Validation R2 Score: {val_r2:.4f}")

# Perform 5-fold cross-validation on the training set
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(stack, X_train, y_train, scoring='r2', cv=kf)

# Print cross-validation results
print(f"Cross-Validation R2 Scores: {cv_scores}")
print(f"Mean R2 Score from Cross-Validation: {cv_scores.mean():.4f}")

Validation R2 Score: 0.4724
Cross-Validation R2 Scores: [0.45895287 0.48911621 0.47764884 0.4427623  0.45915226]
Mean R2 Score from Cross-Validation: 0.4655 ± 0.0162


Stacking Results 
Validation R2 Score: 0.4724
Cross-Validation R2 Scores: [0.45895287 0.48911621 0.47764884 0.4427623  0.45915226]
Mean R2 Score from Cross-Validation: 0.4655 ± 0.0162

# Hyperparameter Tuning 
Gradient Boosting has the highest R2 score 0.47
Random Forest is second with R2 score of 0.45

## Gradient Boosting

In [None]:
# Hyperparameter Tuning for Gradient Boosting 
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.8, 0.9, 1.0]
}
# Initialize the model
gbr = GradientBoostingRegressor(random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=gbr, param_grid=param_grid, 
                           scoring='r2', cv=5, n_jobs=-1, verbose=2)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

## HyperParameter tuning results
50 min 51.3s
Fitting 5 folds for each of 729 candidates, totalling 3645 fits
Best Parameters: {'learning_rate': 0.1, 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'subsample': 0.9}

In [34]:
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Best hyperparameters obtained from the hyperparameter tuning process
best_params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100,
    'subsample': 0.9
}

# Initialize the GradientBoostingRegressor with the best hyperparameters
gbr = GradientBoostingRegressor(
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    random_state=42
)

# Apply RFE to select 20 features
rfe = RFE(estimator=gbr, n_features_to_select=20)
rfe.fit(X_train, y_train)

# Transform the training and validation sets
X_train_rfe = rfe.transform(X_train)
X_val_rfe = rfe.transform(X_val)

# Train the model on the transformed training data
gbr.fit(X_train_rfe, y_train)

# Predict on the transformed validation set
y_val_pred = gbr.predict(X_val_rfe)

# Calculate the R^2 score
val_r2 = r2_score(y_val, y_val_pred)
print(f"Validation R^2 Score: {val_r2}")

# Perform 5-fold cross-validation using a pipeline
pipeline = Pipeline([
    ('rfe', RFE(estimator=gbr, n_features_to_select=20)),
    ('gbr', GradientBoostingRegressor(
        learning_rate=best_params['learning_rate'],
        max_depth=best_params['max_depth'],
        min_samples_leaf=best_params['min_samples_leaf'],
        min_samples_split=best_params['min_samples_split'],
        n_estimators=best_params['n_estimators'],
        subsample=best_params['subsample'],
        random_state=42
    ))
])

# Perform cross-validation
cv_scores = cross_val_score(pipeline, X_train, y_train, scoring='r2', cv=5)

# Print the cross-validation scores
print("Cross-Validation R^2 Scores:", cv_scores)
print("Mean CV R^2 Score:", cv_scores.mean())

Validation R^2 Score: 0.46919072692919117
Cross-Validation R^2 Scores: [0.45411405 0.46035867 0.47857845 0.45618376 0.48757568]
Mean CV R^2 Score: 0.4673621210563992


Results 
Validation R^2 Score: 0.46919072692919117
Cross-Validation R^2 Scores: [0.45411405 0.46035867 0.47857845 0.45618376 0.48757568]
Mean CV R^2 Score: 0.4673621210563992

## Random Forests 

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, KFold
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
import numpy as np

# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

# Initialize the Random Forest Regressor
rf_model = RandomForestRegressor(random_state=42)

# Use KFold for cross-validation
cv_strategy = KFold(n_splits=5, shuffle=True, random_state=42)

# Define scoring metrics
scoring = {
    'R2': make_scorer(r2_score),
    'MSE': make_scorer(mean_squared_error, greater_is_better=False)
}

# Perform GridSearchCV
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    cv=cv_strategy,
    scoring=scoring,
    refit='R2',  # Refit the model using the best R2 score
    n_jobs=-1,  # Use all available CPU cores
    verbose=2  # Display progress
)

# Fit the model
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding R2 score
print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

# Optional: Evaluate on the validation set
best_rf_model = grid_search.best_estimator_
val_predictions = best_rf_model.predict(X_val)
val_r2 = r2_score(y_val, val_predictions)
print(f"Validation R2 Score: {val_r2}")

## XGBoost 

In [8]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.8, 0.9, 1.0],
    'colsample_bytree': [0.8, 0.9, 1.0]
}

# Initialize the model
xgb = XGBRegressor(random_state=42)

# Perform Grid Search
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, 
                           scoring='r2', cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best R2 Score:", grid_search.best_score_)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best R2 Score: 0.471383488591291


Fitting 5 folds for each of 243 candidates, totalling 1215 fits
Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100, 'subsample': 1.0}
Best R2 Score: 0.471383488591291

# Evaluation Script

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import RFE
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Load data
training_data = pd.read_csv('cleaned_training_data.csv')
test_data = pd.read_csv('CW1_test.csv')

# Identify numerical features in the cleaned training data(excluding the target variable)
numerical_features = training_data.select_dtypes(include=[np.number]).columns.tolist()
numerical_features.remove('outcome') 

# Apply same scaling transformation
scaler = StandardScaler()
training_data[numerical_features] = scaler.fit_transform(training_data[numerical_features])
test_data[numerical_features] = scaler.transform(test_data[numerical_features])

training_data[numerical_features] = scaler.fit_transform(training_data[numerical_features])



# One hot encoding for test dataset
categorical_cols = ['cut', 'color', 'clarity']
test_data = pd.get_dummies(test_data, columns=categorical_cols, drop_first=True)

# Align columns
test_data = test_data.reindex(columns=training_data.columns.drop('outcome'), fill_value=0)

# Separate features and target
X_train = training_data.drop(columns=['outcome'])
y_train = training_data['outcome']
# Best hyperparameters obtained from the hyperameter tuning process 
best_params = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_samples_leaf': 1,
    'min_samples_split': 2,
    'n_estimators': 100,
    'subsample': 0.9
}

# Initialize the model with the best hyperparameters
gbr = GradientBoostingRegressor(
    learning_rate=best_params['learning_rate'],
    max_depth=best_params['max_depth'],
    min_samples_leaf=best_params['min_samples_leaf'],
    min_samples_split=best_params['min_samples_split'],
    n_estimators=best_params['n_estimators'],
    subsample=best_params['subsample'],
    random_state=42
)
# Feature Selection (RFE)
base_model = gbr
rfe = RFE(estimator=base_model, n_features_to_select=20)
rfe.fit(X_train, y_train)
X_train_rfe = rfe.transform(X_train)
test_data_rfe = rfe.transform(test_data)

# Polynomial Features
poly = PolynomialFeatures(degree=1, include_bias=False)
X_train_poly = poly.fit_transform(X_train_rfe)
test_data_poly = poly.transform(test_data_rfe)

# Model Training
model = gbr
model.fit(X_train_poly, y_train)

# Cross-validation
scores = cross_val_score(model, X_train_poly, y_train, scoring='r2', cv=5)
print("Mean R^2 with cross-validation:", scores.mean())

# Generate predictions
y_test_pred = model.predict(test_data_poly)

# Save predictions
submission = pd.DataFrame({'yhat': y_test_pred})
submission.to_csv('CW1_submission_K23004648.csv', index=False)

Mean R^2 with cross-validation: 0.4727968459320494
