In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_excel('KTMData.xlsx')

In [3]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,Male,Professional,Average,180 to 220,Complicated,Not purchased
1,2,27,Female,Self Employed,Low End,No Bike,Single,Purchased
2,3,39,Female,Unemployed,Average,180 to 220,Married,Not purchased
3,4,20,Female,Unemployed,High End,No Bike,Married,Not purchased
4,5,29,Male,Student,Average,180 to 220,Complicated,Purchased


In [4]:
le=LabelEncoder()

In [5]:
df['Gender']=le.fit_transform(df['Gender'])
df['Occupation']=le.fit_transform(df['Occupation'])
df['Phone Type']=le.fit_transform(df['Phone Type'])
df['Relationship']=le.fit_transform(df['Relationship'])
df['Current Bike']=le.fit_transform(df['Current Bike'])
df['Response']=le.fit_transform(df['Response'])

In [6]:
df.head()

Unnamed: 0,ID,Age,Gender,Occupation,Phone Type,Current Bike,Relationship,Response
0,1,53,1,0,0,1,1,0
1,2,27,0,1,2,4,3,1
2,3,39,0,3,0,1,2,0
3,4,20,0,3,1,4,2,0
4,5,29,1,2,0,1,1,1


In [7]:
X=df.drop(['ID','Response'],axis=1)

In [8]:
y=df['Response']

In [9]:
def model_builder(m_name,score,X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)
    model=m_name()
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    accuracy=score(y_test,y_pred)
    #stf=StratifiedKFold(n_splits=10,shuffle=True,random_state=1)
    cv_score=np.mean(cross_val_score(estimator=model,X=X,y=y,cv=10))
    return accuracy,cv_score

In [10]:
model_builder(GradientBoostingClassifier,accuracy_score,X,y)

(0.7697368421052632, 0.7465318926455211)

## HP Tuning on GBoost

In [11]:
param={'max_features':['auto', 'sqrt', 'log2'],'criterion':['friedman_mse', 'squared_error', 'mse'],'learning_rate':[0.1,0.01,0.2,0.3],'loss':['log_loss', 'deviance', 'exponential']}

In [12]:
gcv=GridSearchCV(estimator=GradientBoostingClassifier(),param_grid=param,cv=10)

In [13]:
gcv.fit(X,y)

GridSearchCV(cv=10, estimator=GradientBoostingClassifier(),
             param_grid={'criterion': ['friedman_mse', 'squared_error', 'mse'],
                         'learning_rate': [0.1, 0.01, 0.2, 0.3],
                         'loss': ['log_loss', 'deviance', 'exponential'],
                         'max_features': ['auto', 'sqrt', 'log2']})

In [14]:
gcv.best_params_

{'criterion': 'squared_error',
 'learning_rate': 0.1,
 'loss': 'exponential',
 'max_features': 'log2'}

In [15]:
def model_builder(m_name,score,X,y):
    X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=1)
    model=m_name
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    accuracy=score(y_test,y_pred)
    cv_score=np.mean(cross_val_score(estimator=model,X=X,y=y,cv=10))
    return accuracy,cv_score

In [16]:
model_builder(GradientBoostingClassifier(criterion='friedman_mse', loss='deviance',max_features='log2',learning_rate=0.1),accuracy_score,X,y)

(0.75, 0.7471941443011503)