In [1]:
#conda update numpy

In [2]:
#conda install -c conda-forge catboost xgboost

In [3]:
'''
try:
    from catboost import CatBoostRegressor
    from xgboost import XGBRegressor
    print("CatBoost and XGBoost imported successfully.")
except Exception as e:
    print(f"Error: {e}")
'''

'\ntry:\n    from catboost import CatBoostRegressor\n    from xgboost import XGBRegressor\n    print("CatBoost and XGBoost imported successfully.")\nexcept Exception as e:\n    print(f"Error: {e}")\n'

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
import warnings

In [5]:
df = pd.read_csv('data/StudentsPerformance.csv')
df.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [6]:
X = df.drop(columns=['math score'],axis=1)
y = df['math score']

In [7]:
# trnasform the categorical data into numerical data use onehotcoder and columntransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer

y = df['math score']  
X = df.drop('math score', axis=1)


categorical_cols = X.select_dtypes(include=['object']).columns
numeric_cols = X.select_dtypes(include=['float64', 'int64']).columns

preprocessor = make_column_transformer(
    (OneHotEncoder(sparse=False, handle_unknown='ignore'), categorical_cols),
    (StandardScaler(), numeric_cols)
)


X_transformed = preprocessor.fit_transform(X)



In [8]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((800, 19), (200, 19), (800,), (200,))

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Create a dictionary of models
models = {
    'KNeighborsRegressor': KNeighborsRegressor(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'AdaBoostRegressor': AdaBoostRegressor(),
    'SVR': SVR(),
    'LinearRegression': LinearRegression(),
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'CatBoost': CatBoostRegressor(),
    'XGBoost': XGBRegressor()
}

model_list = []

# Set parameters for each model
model_params = {
    'KNeighborsRegressor': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']},
    'DecisionTreeRegressor': {'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 3]},
    'RandomForestRegressor': {'n_estimators': [100, 200, 300], 'max_depth': [None, 5, 10], 'min_samples_split': [2, 5, 10]},
    'AdaBoostRegressor': {'n_estimators': [50, 100, 150], 'learning_rate': [0.1, 0.5, 1.0], 'loss': ['linear', 'square', 'exponential']},
    'SVR': {'kernel': ['linear', 'rbf'], 'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'], 'epsilon': [0.1, 0.01, 0.001]},
    'LinearRegression': {},
    'Ridge': {'alpha': [0.1, 1, 10], 'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'], 'tol': [0.001, 0.0001, 0.00001]},
    'Lasso': {'alpha': [0.1, 1, 10], 'selection': ['cyclic', 'random'], 'max_iter': [1000, 2000, 3000]},
    'CatBoost': {'iterations': [100, 200, 300], 'learning_rate': [0.01, 0.1, 1.0], 'depth': [4, 6, 8]},
    'XGBoost': {'n_estimators': [100, 200, 300], 'learning_rate': [0.01, 0.1, 1.0], 'max_depth': [3, 5, 7]}
}

# Loop through the models
for model_name, model in models.items():
    # Get the parameters for the current model
    params = model_params.get(model_name, {})
    
    # Create a grid search object for the current model
    grid_search = GridSearchCV(model, params, cv=5, scoring='r2')
    
    
    # Fit the grid search object on the training data
    grid_search.fit(X_train, y_train)
    best_params = grid_search.best_params_
    model.set_params(**best_params)
    model.fit(X_train, y_train)
    r2 = cross_val_score(model, X_train, y_train, cv=5, scoring='r2').mean()
    rmse = np.sqrt(-cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean())
    mae = -cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean()
    
    
    y_pred = model.predict(X_test)
    final_mse = -np.sqrt(mean_squared_error(y_test, y_pred))
    final_mae = -mean_absolute_error(y_test, y_pred)
    final_r2 = r2_score(y_test, y_pred)
    
    # Append the model and its performance metrics to the model_list
    model_list.append({'model': model_name, 'mse': rmse, 'mae': mae, 'r2': r2, 'final_mse': final_mse, 'final_mae': final_mae, 'final_r2': final_r2})

# Print the model_list
for model in model_list:
    print(model['model'])
    print(f'R2 Score: {model["r2"]}')
    print(f'Mean Absolute Error: {model["mae"]}')
    print(f'Mean Squared Error: {model["mse"]}')
    print(f'Final R2 Score: {model["final_r2"]}')
    print(f'Final Mean Absolute Error: {model["final_mae"]}')
    print(f'Final Mean Squared Error: {model["final_mse"]}')
    print('\n')

0:	learn: 14.7090249	total: 75.5ms	remaining: 7.47s
1:	learn: 14.6176246	total: 75.6ms	remaining: 3.71s
2:	learn: 14.5370834	total: 75.8ms	remaining: 2.45s
3:	learn: 14.4418913	total: 75.9ms	remaining: 1.82s
4:	learn: 14.3570162	total: 76.1ms	remaining: 1.45s
5:	learn: 14.2688180	total: 76.3ms	remaining: 1.19s
6:	learn: 14.1901406	total: 76.4ms	remaining: 1.01s
7:	learn: 14.1102388	total: 76.6ms	remaining: 880ms
8:	learn: 14.0335346	total: 76.7ms	remaining: 776ms
9:	learn: 13.9481345	total: 76.9ms	remaining: 692ms
10:	learn: 13.8589886	total: 77ms	remaining: 623ms
11:	learn: 13.7705600	total: 77.2ms	remaining: 566ms
12:	learn: 13.7005768	total: 77.3ms	remaining: 518ms
13:	learn: 13.6150737	total: 77.5ms	remaining: 476ms
14:	learn: 13.5351352	total: 77.6ms	remaining: 440ms
15:	learn: 13.4506893	total: 78.1ms	remaining: 410ms
16:	learn: 13.3700798	total: 78.3ms	remaining: 382ms
17:	learn: 13.2961414	total: 79.5ms	remaining: 362ms
18:	learn: 13.2145597	total: 79.8ms	remaining: 340ms
19:	l

In [11]:
# Create a dataframe using models' names and their scores
scores_df = pd.DataFrame(model_list)
scores_df

Unnamed: 0,model,mse,mae,r2,final_mse,final_mae,final_r2
0,KNeighborsRegressor,6.947127,5.480697,0.784723,-7.248813,-5.666926,0.784065
1,DecisionTreeRegressor,6.744722,5.415681,0.796943,-6.542721,-4.933607,0.824084
2,RandomForestRegressor,5.989592,4.867517,0.837994,-5.933985,-4.530368,0.855295
3,AdaBoostRegressor,6.156622,4.911826,0.827283,-5.883654,-4.591275,0.85774
4,SVR,5.494141,4.423283,0.865435,-5.393761,-4.246736,0.880444
5,LinearRegression,5.463262,4.379209,0.867053,-5.421247,-4.234531,0.879222
6,Ridge,5.429806,4.359188,0.868615,-5.39041,-4.211116,0.880592
7,Lasso,5.472694,4.403431,0.866631,-5.368871,-4.154769,0.881544
8,CatBoost,5.693068,4.582531,0.855401,-5.574123,-4.314105,0.872314
9,XGBoost,5.833122,4.701237,0.848737,-5.661136,-4.373194,0.868297


In [16]:
from sklearn.ensemble import StackingRegressor


# Create a list of tuples containing the model name and the corresponding model object
estimators = [
    ('SVR', SVR()),
    ('LinearRegression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso()),
    ('CatBoost', CatBoostRegressor())
]

# Create the stacking ensemble model
stacking_model = StackingRegressor(estimators=estimators, final_estimator=LinearRegression())

# Fit the stacking model on the training data
mse = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='neg_mean_squared_error').mean()
mse = np.sqrt(-mse)
mae = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error').mean()
mae = -mae
r2 = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='r2').mean()
print(f'R2 Score: {r2:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'Mean Squared Error: {mse:.2f}')


Learning rate set to 0.038155
0:	learn: 14.4355373	total: 432us	remaining: 432ms
1:	learn: 14.1347681	total: 781us	remaining: 390ms
2:	learn: 13.7912381	total: 1.12ms	remaining: 373ms
3:	learn: 13.4624652	total: 1.46ms	remaining: 363ms
4:	learn: 13.1329105	total: 1.96ms	remaining: 390ms
5:	learn: 12.8671007	total: 2.32ms	remaining: 384ms
6:	learn: 12.5787664	total: 2.66ms	remaining: 378ms
7:	learn: 12.3289514	total: 3ms	remaining: 372ms
8:	learn: 12.0816829	total: 3.34ms	remaining: 368ms
9:	learn: 11.8077460	total: 3.56ms	remaining: 352ms
10:	learn: 11.5458640	total: 3.9ms	remaining: 350ms
11:	learn: 11.3240956	total: 4.23ms	remaining: 348ms
12:	learn: 11.0908608	total: 4.56ms	remaining: 346ms
13:	learn: 10.8854420	total: 4.89ms	remaining: 345ms
14:	learn: 10.6563962	total: 5.22ms	remaining: 343ms
15:	learn: 10.4639246	total: 5.57ms	remaining: 343ms
16:	learn: 10.2660999	total: 5.91ms	remaining: 341ms
17:	learn: 10.0833808	total: 6.24ms	remaining: 341ms
18:	learn: 9.9034823	total: 6.57

In [17]:
stacking_model.fit(X_train, y_train)

y_pred = stacking_model.predict(X_test)


final_r2 = r2_score(y_test, y_pred)
final_mae = mean_absolute_error(y_test, y_pred)
final_mse = np.sqrt((mean_squared_error(y_test, y_pred)))

print(f'Final R2 Score: {final_r2:.2f}')
print(f'Final Mean Absolute Error: {final_mae:.2f}')
print(f'Final Mean Squared Error: {final_mse:.2f}')

Learning rate set to 0.039525
0:	learn: 14.5987177	total: 1.75ms	remaining: 1.75s
1:	learn: 14.2251886	total: 2.12ms	remaining: 1.06s
2:	learn: 13.8866124	total: 2.67ms	remaining: 887ms
3:	learn: 13.5235688	total: 3.11ms	remaining: 775ms
4:	learn: 13.1887021	total: 3.49ms	remaining: 695ms
5:	learn: 12.9124226	total: 3.87ms	remaining: 642ms
6:	learn: 12.6000335	total: 5.98ms	remaining: 848ms
7:	learn: 12.3299057	total: 7.5ms	remaining: 930ms
8:	learn: 12.0660619	total: 7.86ms	remaining: 866ms
9:	learn: 11.7730981	total: 8.22ms	remaining: 813ms
10:	learn: 11.4922764	total: 8.57ms	remaining: 771ms
11:	learn: 11.2626483	total: 8.93ms	remaining: 735ms
12:	learn: 11.0426039	total: 10.4ms	remaining: 788ms
13:	learn: 10.7991693	total: 11ms	remaining: 776ms
14:	learn: 10.5541002	total: 11.7ms	remaining: 765ms
15:	learn: 10.3191811	total: 12.1ms	remaining: 747ms
16:	learn: 10.1000444	total: 12.5ms	remaining: 723ms
17:	learn: 9.8945567	total: 12.9ms	remaining: 702ms
18:	learn: 9.6901741	total: 13