# Selecting the best model with best hyperparameters 

In [1]:
# import libraries 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# train test split the dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import regression algorithm 
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
# Evaluate the models 
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import grid search cv for cross validation 
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load dataset 
df= sns.load_dataset('tips')

## Regression Tasks 

In [3]:
# Select features and variables
X=df.drop('tip', axis=1)
y=df['tip']

# label encode categorical variables 
le= LabelEncoder()
X['sex']= le.fit_transform(X['sex'])
X['smoker']= le.fit_transform(X['smoker'])
X['day']= le.fit_transform(X['day'])
X['time']= le.fit_transform(X['time'])

# split the data into train and test data with 80% training dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
# create a dictionary of the list of models to evaluate the performance 
models = { 
        'Linear Regression': LinearRegression(),
        'Support Vector Regression': SVR(),
        'Decision Tree Regression': DecisionTreeRegressor(),
        'Random Forest Regression': RandomForestRegressor(),
        'Gradient Boosting Regression': GradientBoostingRegressor(),
        'KNN Regression': KNeighborsRegressor(),
        'XGBoost Regression': XGBRegressor()
        }

In [12]:
# train and predict each model with evaluation metrics as well making a for loop to iterate over the models 
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)

    # make prediction from each model 
    y_pred = model.predict(X_test)

   # MSE for each model 
    print(name, 'MSE:', mean_squared_error(y_test, y_pred)) 
    
    # MAE for each model
    print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    
    # RMSE for each model
    print(name, 'RMSE:', np.sqrt(mean_squared_error(y_test, y_pred)))

    # R2 score for each model
    print(name, 'R2:', r2_score(y_test, y_pred))

    print('\n')

Linear Regression MSE: 1825912.9915253515
Linear Regression MAE: 858.7084697710096
Linear Regression RMSE: 1351.2634796831267
Linear Regression R2: 0.885139743367963
Support Vector Regression MSE: 17791095.454944562
Support Vector Regression MAE: 2747.3256009251854
Support Vector Regression RMSE: 4217.949200137972
Support Vector Regression R2: -0.11916055102542833
Decision Tree Regression MSE: 520135.9549267705
Decision Tree Regression MAE: 351.7402206154987
Decision Tree Regression RMSE: 721.2045167126801
Decision Tree Regression R2: 0.9672805059475864
Random Forest Regression MSE: 295361.9737615915
Random Forest Regression MAE: 267.9123321933983
Random Forest Regression RMSE: 543.4721462610494
Random Forest Regression R2: 0.9814200609431776
Gradient Boosting Regression MSE: 430067.88364194834
Gradient Boosting Regression MAE: 364.9299448070813
Gradient Boosting Regression RMSE: 655.7956111792365
Gradient Boosting Regression R2: 0.9729462971600609
KNN Regression MSE: 795139.3987764182

In [None]:
models.items()

# Assignmnet: find the best model based on each metrics from above mentioned Results?
---

## Hyperparameter tuning:

In [None]:
# Create a dictionary of list of models to evaluate performance with hyperparameters
models = { 
        'Linear Regression': (LinearRegression(), {}),
        'Support Vector Regression': (SVR(), {'kernel': ['rbf','poly','sigmoid'], 'C':[0.1, 1, 10], 'gamma':[1,0.1,0.01],'epsilon':[0.1,0.01, 0.001]}),
        'Decision Tree Regression': (DecisionTreeRegressor(), {'max_depth':[None, 5, 10], 'splitter':['best', 'random']}),
        'Random Forest Regression': (RandomForestRegressor(),{'n_estimators':[10, 100, 1000], 'max_depth':[None, 5, 10]}),
        'Gradient Boosting Regression': (GradientBoostingRegressor(), {'loss':['ls', 'lad', 'hubber', 'quantile'], 'n_estimators':[10, 100, 1000]}),
        'KNN Regression': (KNeighborsRegressor(), {'n_neighbors':np.arange(3,100, 3), 'weights':['uniform', 'distance']}),
        'XGBoost Regression': (XGBRegressor(), {'n_estimators':[10, 100, 1000], 'learning_rate':[0.1, 0.01, 0.001]}),
        }

for name, (model, params) in models.items():
    
    # Create a pipeline 
    pipeline=GridSearchCV(model, params, cv=5)

    #fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model 
    y_pred = pipeline.predict(X_test)
    
    # print the performance metrics
    print(name, 'MSE:', mean_squared_error(y_test, y_pred))
    print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    print(name, 'R2:', r2_score(y_test, y_pred))
    print('\n')

## Add Preprocessor inside the pipeline

In [8]:
df.head()
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [None]:
# make a preprocessor 

preprocessor = ColumnTransformer(
    transformers=['numeric_scaling', StandardScaler(), ['total_bill', 'size']],remainder='passthrough')

models = { 
        'Linear Regression': (LinearRegression(), {}),
        'Support Vector Regression': (SVR(), {'kernel': ['rbf','poly','sigmoid'], 'C':[0.1, 1, 10], 'gamma':[1,0.1,0.01],'epsilon':[0.1,0.01, 0.001]}),
        'Decision Tree Regression': (DecisionTreeRegressor(), {'max_depth':[None, 5, 10], 'splitter':['best', 'random']}),
        'Random Forest Regression': (RandomForestRegressor(),{'n_estimators':[10, 100, 1000], 'max_depth':[None, 5, 10]}),
        'Gradient Boosting Regression': (GradientBoostingRegressor(), {'loss':['ls', 'lad', 'hubber', 'quantile'], 'n_estimators':[10, 100, 1000]}),
        'KNN Regression': (KNeighborsRegressor(), {'n_neighbors':np.arange(3,100, 3), 'weights':['uniform', 'distance']}),
        'XGBoost Regression': (XGBRegressor(), {'n_estimators':[10, 100, 1000], 'learning_rate':[0.1, 0.01, 0.001]}),
        }

for name, (model, params) in models.items():
    # create a pipeline with preprocessor
    pipeline=Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])

    # make a grid search cv to tune the hyperparameter
    grid_search = GridSearchCV(pipeline, params, cv=5)

    # fit the pipeline
    grid_search.fit(X_train, y_train)

    # make prediction from each model
    y_pred = grid_search.predict(X_test)

    # print the performance metrics
    print(name, 'MSE:', mean_squared_error(y_test, y_pred))
    print(name, 'MAE:', mean_absolute_error(y_test, y_pred))
    print(name, 'R2:', r2_score(y_test, y_pred))
    print('\n')

## Classifiers

In [10]:
import numpy as np 
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Load the iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Create a dictionary of classifiers to evaluate 
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree Classifier': DecisionTreeClassifier(),
    'Random Forest Classifier': RandomForestClassifier(),
    'KNN Classifier': KNeighborsClassifier(),
    'SVM Classifier': SVC()
    }
# perform kfold cross-validation and calculate the mean accuracy 
kfold=KFold(n_splits=5, shuffle=True, random_state=42)
for name, classifier in classifiers.items():
    scores = cross_val_score(classifier, X, y, cv=kfold)
    accuracy=np.mean(scores)
    print('Classifier:', name)
    print('Mean Accuracy:', accuracy)
    print()


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Classifier: Logistic Regression
Mean Accuracy: 0.9733333333333334

Classifier: Decision Tree Classifier
Mean Accuracy: 0.9600000000000002

Classifier: Random Forest Classifier
Mean Accuracy: 0.9600000000000002

Classifier: KNN Classifier
Mean Accuracy: 0.9733333333333334

Classifier: SVM Classifier
Mean Accuracy: 0.9666666666666668

