# Muhammad Usman

### usmanashraf4360@gmail.com

# Selection of Best Regression Model

In [1]:
# Import Liberaries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Import regression  models
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor

# train test split
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder


In [4]:
#Load dataset
df=sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [7]:
#Split data into x and Y
x = df.drop('tip', axis=1)
y = df['tip']

# label encode categorical variables
le = LabelEncoder()
x['sex'] = le.fit_transform(x['sex'])
x['smoker'] = le.fit_transform(x[
    'smoker'])
x['day'] = le.fit_transform(x['day'])
x['time'] = le.fit_transform(x['time'])

In [8]:

# Split data into training and testing, 80% training data 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

models={'LinearRegression':LinearRegression(),
        'SVR':SVR(),
        'GradientBoostingRegressor':GradientBoostingRegressor(),
        'RandomForestRegressor':RandomForestRegressor(),
        'DecisionTreeRegressor':DecisionTreeRegressor(),
        'XGBRegressor':XGBRegressor(),
        'KNeighborsRegressor':KNeighborsRegressor()}

# Train the model and predict values by providing testing data 
model_scores=[]
for name,model in models.items():
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)

    # Calculate mean absolute Error
    metric=mean_absolute_error(y_test,y_pred)
    model_scores.append((name,metric))

sorted_models=sorted(model_scores,key=lambda x:x[1],reverse=False)
for model in sorted_models:
    print('Mean Absolute error for', f"{model[0]} is {model[1]: .2f}") 


Mean Absolute error for SVR is  0.57
Mean Absolute error for LinearRegression is  0.67
Mean Absolute error for XGBRegressor is  0.67
Mean Absolute error for KNeighborsRegressor is  0.73
Mean Absolute error for GradientBoostingRegressor is  0.73
Mean Absolute error for RandomForestRegressor is  0.77
Mean Absolute error for DecisionTreeRegressor is  0.78


In [54]:
# Split data into training and testing, 80% training data 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

models={'LinearRegression':LinearRegression(),
        'SVR':SVR(),
        'GradientBoostingRegressor':GradientBoostingRegressor(),
        'RandomForestRegressor':RandomForestRegressor(),
        'DecisionTreeRegressor':DecisionTreeRegressor(),
        'XGBRegressor':XGBRegressor(),
        'KNeighborsRegressor':KNeighborsRegressor()}

# Train the model and predict values by providing testing data 
model_scores=[]
for name,model in models.items():
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)

    # Calculate mean squared Error
    metric=mean_squared_error(y_test,y_pred)
    model_scores.append((name,metric))

sorted_models=sorted(model_scores,key=lambda x:x[1],reverse=False)
for model in sorted_models:
    print('Mean Squared error for', f"{model[0]} is {model[1]: .2f}")

Mean Squared error for SVR is  0.54
Mean Squared error for LinearRegression is  0.69
Mean Squared error for XGBRegressor is  0.74
Mean Squared error for GradientBoostingRegressor is  0.80
Mean Squared error for KNeighborsRegressor is  0.84
Mean Squared error for RandomForestRegressor is  0.97
Mean Squared error for DecisionTreeRegressor is  1.45


In [10]:
# Split data into training and testing, 80% training data 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

models={'LinearRegression':LinearRegression(),
        'SVR':SVR(),
        'GradientBoostingRegressor':GradientBoostingRegressor(),
        'RandomForestRegressor':RandomForestRegressor(),
        'DecisionTreeRegressor':DecisionTreeRegressor(),
        'XGBRegressor':XGBRegressor(),
        'KNeighborsRegressor':KNeighborsRegressor()}

# Train the model and predict values by providing testing data 
model_scores=[]
for name,model in models.items():
    model.fit(x_train,y_train)
    y_pred=model.predict(x_test)

    # Calculate mean squared Error
    metric=r2_score(y_test,y_pred)
    model_scores.append((name,metric))

sorted_models=sorted(model_scores,key=lambda x:x[1],reverse=False)
for model in sorted_models:
    print('R2 Score for', f"{model[0]} is {model[1]: .2f}")

R2 Score for DecisionTreeRegressor is -0.02
R2 Score for RandomForestRegressor is  0.29
R2 Score for KNeighborsRegressor is  0.33
R2 Score for GradientBoostingRegressor is  0.36
R2 Score for XGBRegressor is  0.41
R2 Score for LinearRegression is  0.44
R2 Score for SVR is  0.57


# Hyperparameter Tuning

In [19]:
# Split data into training and testing, 80% training data 
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

# Create a dictionary to train and evaluate the model by using hyperparameters.
models={'LinearRegression':(LinearRegression(),{}),
        'SVR':(SVR(),{'kernel':['rbf','poly','sigmoid']}),
        'GradientBoostingRegressor':(GradientBoostingRegressor(),{'n_estimators':[10,100]}),
        'RandomForestRegressor':(RandomForestRegressor(),{'n_estimators':[10,100]}),
        'DecisionTreeRegressor':(DecisionTreeRegressor(),{'max_depth':[None,5,10]}),
        'XGBRegressor':(XGBRegressor(),{'n_estimators':[10,100]}),
        'KNeighborsRegressor':(KNeighborsRegressor(),{'n_neighbors':np.arange(3,100,2)})}

# Train the model and predict values by providing testing data 
model_scores=[]
for name,(model,params) in models.items():
    # Create pipline
    Pipeline=GridSearchCV(model,params,cv=5)
    Pipeline.fit(x_train,y_train)
    y_pred=Pipeline.predict(x_test)

    # Calculate mean squared Error
    print(mean_squared_error(y_test,y_pred))
    print(r2_score(y_test,y_pred))
    print(mean_absolute_error(y_test,y_pred))
    print('\n')

0.6948129686287711
0.4441368826121931
0.6703807496461157


1.460718141299992
-0.1686013018011976
0.8935334948775431


0.8106801524004932
0.35144101065487676
0.7657809818712309


0.9750007406122461
0.21998152653695058
0.7849571428571431


0.8774153020453994
0.2980516670532909
0.7189481629481629


0.6624107100882575
0.4700592836840687
0.6549163442728472


0.6640950568462677
0.4687117753876745
0.6203721488595437




# Select the best model by using iris Dataset

In [31]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

In [26]:
iris = sns.load_dataset('iris')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [28]:
# Split data into x and y then encode y variable 
x=iris.drop('species',axis=1)
y=iris['species']
le=LabelEncoder()
y=le.fit_transform(y)

In [29]:
# Split data into train and test pieces
X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [32]:

models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
    ('XGBoost', XGBClassifier(random_state=42))
]
best_model = None
best_accuracy = 0.0

for name, model in models:
    # Create a pipeline for each model
    #pipeline = Pipeline([
     #   ('imputer', SimpleImputer(strategy='most_frequent')),
      #  ('encoder', OneHotEncoder(handle_unknown='ignore')),
       # ('model', model)
    #])
    # Perform cross-validation
    scores = cross_val_score(model, X_train, y_train, cv=5)
    
    # Calculate mean accuracy
    mean_accuracy = scores.mean()
    
    # Fit the pipeline on the training data
    model.fit(X_train, y_train)
    
    # Make predictions on the test data
    y_pred = model.predict(X_test)

    # Calculate accuracy score
    accuracy = accuracy_score(y_test, y_pred)
    
    # Print the performance metrics
    print("Model:", name)
    print("Cross-validation Accuracy:", mean_accuracy)
    print("Test Accuracy:", accuracy)
    print()
    
    # Check if the current model has the best accuracy
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model = model

# Retrieve the best model
print("Best Model:", best_model)

Model: Random Forest
Cross-validation Accuracy: 0.9416666666666667
Test Accuracy: 1.0

Model: Gradient Boosting
Cross-validation Accuracy: 0.9416666666666668
Test Accuracy: 1.0

Model: XGBoost
Cross-validation Accuracy: 0.9333333333333333
Test Accuracy: 1.0

Best Model: RandomForestClassifier(random_state=42)
