In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings("ignore")

%matplotlib inline

In [4]:
df=pd.read_csv("Travel_updated.csv")
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,Designation,MonthlyIncome,TotalVisited
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3.0,Deluxe,3.0,Single,1.0,1,2,1,Manager,20993.0,3.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,Manager,20130.0,5.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,4.0,Basic,3.0,Single,7.0,1,3,0,Executive,17090.0,3.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,3.0,Basic,3.0,Divorced,2.0,1,5,1,Executive,17909.0,3.0
4,200004,0,37.622265,Self Enquiry,1,8.0,Small Business,Male,3.0,Basic,4.0,Divorced,1.0,0,5,1,Executive,18468.0,2.0


#### Train and Test split

In [19]:
X=df.drop(columns=['ProdTaken'],axis=1)
Y=df['ProdTaken']

In [21]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=.28,random_state=42)

In [23]:
classification_features=[feature for feature in X.columns if(X[feature].dtype == 'object')]
classification_features

['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [24]:
numerical_features=[feature for feature in X.columns if(X[feature].dtype != 'object')]
numerical_features

['CustomerID',
 'Age',
 'CityTier',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'MonthlyIncome',
 'TotalVisited']

#### LabourEncoding

In [25]:
from sklearn.preprocessing import LabelEncoder
for feature in classification_features:
	labour=LabelEncoder()
	labour.fit(X_train[feature])
	X_train[feature]=labour.transform(X_train[feature])
	X_test[feature]=labour.transform(X_test[feature])

#### Apply Standardization

In [28]:
from sklearn.preprocessing import StandardScaler
for feature in classification_features:
	scaler=StandardScaler()
	scaler.fit(X_train[[feature]])
	X_train[feature]=scaler.transform(X_train[[feature]])
	X_test[feature]=scaler.transform(X_test[[feature]])

#### Apply Model

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score,precision_score, recall_score, f1_score, roc_auc_score,roc_curve 

In [30]:
# Evaluate the model
def evaluate_performance(true,pred):
	accuracy=accuracy_score(true,pred) # Calculate Accuracy
	f1=f1_score(true,pred,average='weighted') # Calculate F1 score
	precision=precision_score(true,pred) # Calculate presicion
	recall= recall_score(true,pred) # Calculate recall
	rocauc_score=roc_auc_score(true,pred) #Clacualte ROU AUC score

	return accuracy,f1,precision,recall,rocauc_score

In [31]:
models={
    "Logisitic Regression":LogisticRegression(),
    "Decision Tree":DecisionTreeClassifier(),
    "Random Forest":RandomForestClassifier(),
    "Gradient Boost":GradientBoostingClassifier(),
	"Adaboost":AdaBoostClassifier(),
	"GradientBoost": GradientBoostingClassifier()
}

In [32]:

for name,model in models.items():
	model.fit(X_train,Y_train)

	# Make predictions
	Y_train_pred = model.predict(X_train)
	Y_test_pred = model.predict(X_test)

	# Training set performance
	model_train_accuracy ,model_train_f1,model_train_precision,model_train_recall ,model_train_rocauc_score = evaluate_performance(Y_train,Y_train_pred)	

    # Test set performance
	model_test_accuracy,model_test_f1,model_test_precision,model_test_recall,model_test_rocauc_score=evaluate_performance(Y_test,Y_test_pred)

	print(name)
    
	print('Model performance for Training set')
	print(f"- Accuracy: {model_train_accuracy}")
	print(f'- F1 score: {model_train_f1}')
	print(f'- Precision: {model_train_precision}')
	print(f'- Recall: {model_train_recall}')
	print(f'- Roc Auc Score: {model_train_rocauc_score}')

	print("-"*15)
	print('Model performance for Training set')
	print(f"- Accuracy: {model_test_accuracy}")
	print(f'- F1 score: {model_test_f1}')
	print(f'- Precision: {model_test_precision}')
	print(f'- Recall: {model_test_recall}')
	print(f'- Roc Auc Score: {model_test_rocauc_score}')

	print("="*15)
	print()

Logisitic Regression
Model performance for Training set
- Accuracy: 0.824381926683717
- F1 score: 0.7626157652880876
- Precision: 0.8
- Recall: 0.09049773755656108
- Roc Auc Score: 0.5426228183581124
---------------
Model performance for Training set
- Accuracy: 0.8203067932797663
- F1 score: 0.7570352447562447
- Precision: 0.6896551724137931
- Recall: 0.07782101167315175
- Roc Auc Score: 0.5348637432466478

Decision Tree
Model performance for Training set
- Accuracy: 1.0
- F1 score: 1.0
- Precision: 1.0
- Recall: 1.0
- Roc Auc Score: 1.0
---------------
Model performance for Training set
- Accuracy: 0.8970051132213295
- F1 score: 0.8961323815652226
- Precision: 0.7357723577235772
- Recall: 0.7042801556420234
- Roc Auc Score: 0.8229134591159758

Random Forest
Model performance for Training set
- Accuracy: 1.0
- F1 score: 1.0
- Precision: 1.0
- Recall: 1.0
- Roc Auc Score: 1.0
---------------
Model performance for Training set
- Accuracy: 0.9159970781592404
- F1 score: 0.907982293108904

In [33]:
#Hyperparameter Tuning
rf_params={"max_depth": [5, 8, 15, None, 10],
            "max_features": [5, 7, "auto", 8],
            "min_samples_split": [2, 8, 15, 20],
            "n_estimators": [100, 200, 500, 1000]
			}

adaboost_params={
	"n_estimators":[50,60,70,80,90],
    "algorithm":['SAMME','SAMME.R'],
	'learning_rate':[1.0,2.0,2.5]
}

gradientboost_params={
	"n_estimators":[50,60,70,80,90],
	"loss":['log_loss','exponential'],
	"criterion":['friedman_mse','squared_error','mse'],
	"min_samples_split": [2, 8, 15, 20],
	"max_depth": [5, 8, 15, None, 10]
}

In [35]:
randomcv_models = [
                   ("RF", RandomForestClassifier(), rf_params),
    				("GradientBoost", GradientBoostingClassifier(), gradientboost_params),
					("Adaboost",AdaBoostClassifier(),adaboost_params)
                   ]

In [38]:
from sklearn.model_selection import RandomizedSearchCV

model_best_params={}
for name,model,params in randomcv_models:
	random=RandomizedSearchCV(estimator=model,random_state=42,param_distributions=params)
	random.fit(X_train,Y_train)

	model_best_params[name]=random.best_params_

for model_name in model_best_params:
    print(f"---------------- Best Params for {model_name} -------------------")
    print(model_best_params[model_name])

---------------- Best Params for RF -------------------
{'n_estimators': 100, 'min_samples_split': 8, 'max_features': 8, 'max_depth': None}
---------------- Best Params for GradientBoost -------------------
{'n_estimators': 90, 'min_samples_split': 2, 'max_depth': 10, 'loss': 'log_loss', 'criterion': 'squared_error'}
---------------- Best Params for Adaboost -------------------
{'n_estimators': 50, 'learning_rate': 1.0, 'algorithm': 'SAMME.R'}


In [39]:
models={
    "Random Forest":RandomForestClassifier(n_estimators=100,min_samples_split=8,max_features=8,max_depth=None),
	"Adaboost":AdaBoostClassifier(n_estimators=50,learning_rate=1.0,algorithm='SAMME.R'),
	"GradientBoost": GradientBoostingClassifier(n_estimators=90,min_samples_split=2,max_depth=10,loss='log_loss',criterion='squared_error')
}

In [40]:

for name,model in models.items():
	model.fit(X_train,Y_train)

	# Make predictions
	Y_train_pred = model.predict(X_train)
	Y_test_pred = model.predict(X_test)

	# Training set performance
	model_train_accuracy ,model_train_f1,model_train_precision,model_train_recall ,model_train_rocauc_score = evaluate_performance(Y_train,Y_train_pred)	

    # Test set performance
	model_test_accuracy,model_test_f1,model_test_precision,model_test_recall,model_test_rocauc_score=evaluate_performance(Y_test,Y_test_pred)

	print(name)
    
	print('Model performance for Training set')
	print(f"- Accuracy: {model_train_accuracy}")
	print(f'- F1 score: {model_train_f1}')
	print(f'- Precision: {model_train_precision}')
	print(f'- Recall: {model_train_recall}')
	print(f'- Roc Auc Score: {model_train_rocauc_score}')

	print("-"*15)
	print('Model performance for Training set')
	print(f"- Accuracy: {model_test_accuracy}")
	print(f'- F1 score: {model_test_f1}')
	print(f'- Precision: {model_test_precision}')
	print(f'- Recall: {model_test_recall}')
	print(f'- Roc Auc Score: {model_test_rocauc_score}')

	print("="*15)
	print()

Random Forest
Model performance for Training set
- Accuracy: 0.9829497016197783
- F1 score: 0.9826676250157568
- Precision: 0.9950738916256158
- Recall: 0.9140271493212669
- Roc Auc Score: 0.9564883645765998
---------------
Model performance for Training set
- Accuracy: 0.9043097151205259
- F1 score: 0.8956610117167737
- Precision: 0.8841463414634146
- Recall: 0.5642023346303502
- Roc Auc Score: 0.7735580018475492

Adaboost
Model performance for Training set
- Accuracy: 0.8704177323103154
- F1 score: 0.853095637223124
- Precision: 0.8035190615835777
- Recall: 0.4132730015082956
- Roc Auc Score: 0.694906808877397
---------------
Model performance for Training set
- Accuracy: 0.8531775018261505
- F1 score: 0.8356282313955925
- Precision: 0.6971830985915493
- Recall: 0.3852140077821012
- Roc Auc Score: 0.6732724715169499

GradientBoost
Model performance for Training set
- Accuracy: 1.0
- F1 score: 1.0
- Precision: 1.0
- Recall: 1.0
- Roc Auc Score: 1.0
---------------
Model performance fo