In [83]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report,ConfusionMatrixDisplay,precision_score, recall_score, f1_score, roc_auc_score,roc_curve 
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier
%matplotlib inline

In [4]:
df = pd.read_csv("/Users/saprativasarkar/Desktop/ML/9_ML_Practicals/Datasets/Travel.csv")
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [5]:
df.dtypes

CustomerID                    int64
ProdTaken                     int64
Age                         float64
TypeofContact                object
CityTier                      int64
DurationOfPitch             float64
Occupation                   object
Gender                       object
NumberOfPersonVisiting        int64
NumberOfFollowups           float64
ProductPitched               object
PreferredPropertyStar       float64
MaritalStatus                object
NumberOfTrips               float64
Passport                      int64
PitchSatisfactionScore        int64
OwnCar                        int64
NumberOfChildrenVisiting    float64
Designation                  object
MonthlyIncome               float64
dtype: object

##### Data Cleaning

In [6]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [7]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')

In [8]:
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [9]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [10]:
df['MaritalStatus'] = df['MaritalStatus'].replace('Single', 'Unmarried')

In [11]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Unmarried    1598
Divorced      950
Name: count, dtype: int64

In [12]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [13]:
df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [14]:
df.Age.fillna(df.Age.median(), inplace=True)
#TypeofContract
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
#DurationOfPitch
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
#NumberOfFollowups
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
#PreferredPropertyStar
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
#NumberOfTrips
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
#NumberOfChildrenVisiting
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)
#MonthlyIncome
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)

In [15]:
df.isnull().sum()

CustomerID                  0
ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

In [16]:
df.drop('CustomerID', inplace=True, axis=1)

In [17]:
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns=['NumberOfPersonVisiting', 'NumberOfChildrenVisiting'], axis=1, inplace=True)

In [18]:
num_features = [feature for feature in df.columns if df[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

cat_features = [feature for feature in df.columns if df[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Numerical Features : 12
Num of Categorical Features : 6


##### Model Training

In [20]:
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
X_train.shape, X_test.shape,y_train.shape,y_test.shape

((3910, 17), (978, 17), (3910,), (978,))

In [24]:
num_features = [feature for feature in X.columns if X[feature].dtype != 'O']
print('Num of Numerical Features :', len(num_features))

cat_features = [feature for feature in X.columns if X[feature].dtype == 'O']
print('Num of Categorical Features :', len(cat_features))

Num of Numerical Features : 11
Num of Categorical Features : 6


In [25]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [27]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    [
         ("OneHotEncoder", oh_transformer, cat_features),
          ("StandardScaler", numeric_transformer, num_features)
    ]
)

In [28]:
X_train=preprocessor.fit_transform(X_train)

In [29]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.721400,-1.020350,1.284279,-0.725271,-0.127737,-0.632399,0.679690,0.782966,-0.382245,-0.774151
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,-0.721400,0.690023,0.282777,-0.725271,1.511598,-0.632399,0.679690,0.782966,-0.459799,0.643615
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.721400,-1.020350,0.282777,1.771041,0.418708,-0.632399,0.679690,0.782966,-0.245196,-0.065268
3,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,...,-0.721400,-1.020350,1.284279,-0.725271,-0.127737,-0.632399,1.408395,-1.277194,0.213475,-0.065268
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.721400,2.400396,-1.720227,-0.725271,1.511598,-0.632399,-0.049015,-1.277194,-0.024889,2.061382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3905,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,-0.721400,-0.653841,1.284279,-0.725271,-0.674182,-0.632399,-1.506426,0.782966,-0.536973,0.643615
3906,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.455047,-0.898180,-0.718725,1.771041,-1.220627,-0.632399,1.408395,0.782966,1.529609,-0.065268
3907,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.455047,1.545210,0.282777,-0.725271,2.058043,-0.632399,-0.777720,0.782966,-0.360576,0.643615
3908,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,...,1.455047,1.789549,1.284279,-0.725271,-0.127737,-0.632399,-1.506426,0.782966,-0.252799,0.643615


In [30]:
X_test=preprocessor.transform(X_test)

In [31]:
X_test

array([[ 0.        ,  0.        ,  0.        , ..., -1.2771941 ,
        -0.73751038, -0.77415132],
       [ 1.        ,  0.        ,  0.        , ..., -1.2771941 ,
        -0.6704111 , -0.06526803],
       [ 1.        ,  0.        ,  0.        , ...,  0.78296635,
        -0.4208322 , -0.77415132],
       ...,
       [ 0.        ,  1.        ,  0.        , ...,  0.78296635,
         0.69001249,  0.64361526],
       [ 1.        ,  0.        ,  0.        , ...,  0.78296635,
        -0.22827818, -0.77415132],
       [ 1.        ,  1.        ,  0.        , ...,  0.78296635,
        -0.44611323,  2.06138184]])

In [77]:
models={
    "Adaboost":AdaBoostClassifier(),
    "Random Forest":RandomForestClassifier(),
}

In [78]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) 
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    model_train_accuracy = accuracy_score(y_train, y_train_pred) 
    model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') 
    model_train_precision = precision_score(y_train, y_train_pred) 
    model_train_recall = recall_score(y_train, y_train_pred) 
    model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)

    model_test_accuracy = accuracy_score(y_test, y_test_pred) 
    model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
    model_test_precision = precision_score(y_test, y_test_pred) 
    model_test_recall = recall_score(y_test, y_test_pred) 
    model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) 

    print(list(models.keys())[i])
    
    print('===========Model performance for Training set====================')
    print("- Accuracy: {:.4f}".format(model_train_accuracy))
    print('- F1 score: {:.4f}'.format(model_train_f1))
    print('- Precision: {:.4f}'.format(model_train_precision))
    print('- Recall: {:.4f}'.format(model_train_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    
    print('===========Model performance for Test set======================')
    print('- Accuracy: {:.4f}'.format(model_test_accuracy))
    print('- F1 score: {:.4f}'.format(model_test_f1))
    print('- Precision: {:.4f}'.format(model_test_precision))
    print('- Recall: {:.4f}'.format(model_test_recall))
    print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))
    
    print('='*35)
    print('\n')

Adaboost
- Accuracy: 0.8565
- F1 score: 0.8365
- Precision: 0.7308
- Recall: 0.3649
- Roc Auc Score: 0.6670
- Accuracy: 0.8354
- F1 score: 0.8115
- Precision: 0.6630
- Recall: 0.3194
- Roc Auc Score: 0.6400


Random Forest
- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
- Accuracy: 0.9325
- F1 score: 0.9277
- Precision: 0.9699
- Recall: 0.6754
- Roc Auc Score: 0.8352




##### Hyperparam Tuning RF

In [71]:
params = {
    "max_depth": [2, 5, 10],  # Increase max depth to control tree complexity
    "min_samples_split": [5, 10, 15],  # Increase to prevent small splits
    "min_samples_leaf": [2, 5, 10],  # Add min_samples_leaf to avoid deep trees
    "n_estimators": [100, 200, 300],  # Increase estimators for stability
    "max_features": ['sqrt', 'log2'],  # Reduce features per tree to add randomness
    "bootstrap": [True, False]  # Use bootstrapping to add diversity
}

In [72]:
params

{'max_depth': [2, 5, 10],
 'min_samples_split': [5, 10, 15],
 'min_samples_leaf': [2, 5, 10],
 'n_estimators': [100, 200, 300],
 'max_features': ['sqrt', 'log2'],
 'bootstrap': [True, False]}

In [73]:
model = RandomForestClassifier()
rf_classifier = RandomizedSearchCV(
    estimator=model, 
    param_distributions=params,
    n_iter=100, 
    cv=3, 
    verbose=2, 
    n_jobs=-1
)
rf_classifier.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END bootstrap=True, max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=15, n_estimators=100; total time=   0.2s
[CV] END bootstrap=False, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=300; total time=   0.4s
[CV] END bootstrap=False, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=300; total time=   0.4s
[CV] END bootstrap=False, max_depth=2, max_features=log2, min_samples_leaf=10, min_samples_split=10, n_estimators=300; total time=   0.4s
[CV] END bootstrap=False, max_depth=5, max_features=log2, min_samples_leaf=5, min_samples_split=10, n_estimators=20

In [74]:
rf_classifier.best_params_

{'n_estimators': 100,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 10,
 'bootstrap': False}

In [75]:
model = RandomForestClassifier(n_estimators=200, min_samples_split=2, max_depth=None)
model.fit(X_train, y_train) 
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

model_train_accuracy = accuracy_score(y_train, y_train_pred) 
model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') 
model_train_precision = precision_score(y_train, y_train_pred) 
model_train_recall = recall_score(y_train, y_train_pred) 
model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)

model_test_accuracy = accuracy_score(y_test, y_test_pred) 
model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
model_test_precision = precision_score(y_test, y_test_pred) 
model_test_recall = recall_score(y_test, y_test_pred) 
model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) 

print('===========Model performance for Training set====================')
print("- Accuracy: {:.4f}".format(model_train_accuracy))
print('- F1 score: {:.4f}'.format(model_train_f1))
print('- Precision: {:.4f}'.format(model_train_precision))
print('- Recall: {:.4f}'.format(model_train_recall))
print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    
print('===========Model performance for Test set======================')
print('- Accuracy: {:.4f}'.format(model_test_accuracy))
print('- F1 score: {:.4f}'.format(model_test_f1))
print('- Precision: {:.4f}'.format(model_test_precision))
print('- Recall: {:.4f}'.format(model_test_recall))
print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))


- Accuracy: 1.0000
- F1 score: 1.0000
- Precision: 1.0000
- Recall: 1.0000
- Roc Auc Score: 1.0000
- Accuracy: 0.9294
- F1 score: 0.9241
- Precision: 0.9692
- Recall: 0.6597
- Roc Auc Score: 0.8273


##### Hyperparam Tuning Adaboost

In [80]:
adaboost_param={
    "n_estimators":[50,60,70,80,90],
    "algorithm":['SAMME','SAMME.R']
}

In [86]:
base_estimator = DecisionTreeClassifier()
adaboost_param = {
    "n_estimators": [50, 60, 70, 80, 90],
    "algorithm": ['SAMME', 'SAMME.R'],
}
model = AdaBoostClassifier()
adaboost_classifier = RandomizedSearchCV(
    estimator=model, 
    param_distributions=adaboost_param,
    n_iter=10,  # Reduce for quick debugging
    cv=3, 
    verbose=2, 
    n_jobs=-1
)
adaboost_classifier.fit(X_train, y_train)
adaboost_classifier.best_params_

Fitting 3 folds for each of 10 candidates, totalling 30 fits
[CV] END ...................algorithm=SAMME, n_estimators=50; total time=   0.1s
[CV] END ...................algorithm=SAMME, n_estimators=50; total time=   0.1s
[CV] END ...................algorithm=SAMME, n_estimators=50; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=60; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=60; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=70; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=60; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=70; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=70; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=80; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_estimators=80; total time=   0.2s
[CV] END ...................algorithm=SAMME, n_e

{'n_estimators': 80, 'algorithm': 'SAMME'}

In [89]:
model = AdaBoostClassifier(n_estimators=80, algorithm= 'SAMME')
model.fit(X_train, y_train) 
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

model_train_accuracy = accuracy_score(y_train, y_train_pred) 
model_train_f1 = f1_score(y_train, y_train_pred, average='weighted') 
model_train_precision = precision_score(y_train, y_train_pred) 
model_train_recall = recall_score(y_train, y_train_pred) 
model_train_rocauc_score = roc_auc_score(y_train, y_train_pred)

model_test_accuracy = accuracy_score(y_test, y_test_pred) 
model_test_f1 = f1_score(y_test, y_test_pred, average='weighted') 
model_test_precision = precision_score(y_test, y_test_pred) 
model_test_recall = recall_score(y_test, y_test_pred) 
model_test_rocauc_score = roc_auc_score(y_test, y_test_pred) 

print('===========Model performance for Training set====================')
print("- Accuracy: {:.4f}".format(model_train_accuracy))
print('- F1 score: {:.4f}'.format(model_train_f1))
print('- Precision: {:.4f}'.format(model_train_precision))
print('- Recall: {:.4f}'.format(model_train_recall))
print('- Roc Auc Score: {:.4f}'.format(model_train_rocauc_score))
    
print('===========Model performance for Test set======================')
print('- Accuracy: {:.4f}'.format(model_test_accuracy))
print('- F1 score: {:.4f}'.format(model_test_f1))
print('- Precision: {:.4f}'.format(model_test_precision))
print('- Recall: {:.4f}'.format(model_test_recall))
print('- Roc Auc Score: {:.4f}'.format(model_test_rocauc_score))


- Accuracy: 0.8465
- F1 score: 0.8132
- Precision: 0.7699
- Recall: 0.2524
- Roc Auc Score: 0.6176
- Accuracy: 0.8364
- F1 score: 0.7977
- Precision: 0.7818
- Recall: 0.2251
- Roc Auc Score: 0.6049


In [96]:
# AdaBoostClassifier builds a series of weak learners (in your case, decision trees with max_depth=1).
# Each weak learner may behave differently due to some random aspects (e.g., handling of ties, sample selection with weights).
# Setting a random_state ensures reproducibility—you can debug or retrain the model and get the same weak learners again.
print(model.estimators_) 

[DecisionTreeClassifier(max_depth=1, random_state=1204919214), DecisionTreeClassifier(max_depth=1, random_state=25446884), DecisionTreeClassifier(max_depth=1, random_state=1736923657), DecisionTreeClassifier(max_depth=1, random_state=1928467732), DecisionTreeClassifier(max_depth=1, random_state=1697850557), DecisionTreeClassifier(max_depth=1, random_state=671660429), DecisionTreeClassifier(max_depth=1, random_state=160665462), DecisionTreeClassifier(max_depth=1, random_state=257572384), DecisionTreeClassifier(max_depth=1, random_state=1491459762), DecisionTreeClassifier(max_depth=1, random_state=1587274176), DecisionTreeClassifier(max_depth=1, random_state=1579420281), DecisionTreeClassifier(max_depth=1, random_state=1582638877), DecisionTreeClassifier(max_depth=1, random_state=431999453), DecisionTreeClassifier(max_depth=1, random_state=252990437), DecisionTreeClassifier(max_depth=1, random_state=1640017201), DecisionTreeClassifier(max_depth=1, random_state=1842157236), DecisionTreeCl

In [110]:
print((model.estimator_weights_))

[1.47327716 0.64813332 0.53329539 0.37652105 0.31123854 0.35564458
 0.18234608 0.32414449 0.18659566 0.15885918 0.20813486 0.16268849
 0.11254182 0.14592382 0.1834799  0.15176697 0.14550397 0.11611633
 0.08893728 0.13605368 0.14183186 0.15059134 0.05594569 0.09485215
 0.1191262  0.06064844 0.05580929 0.06976223 0.133863   0.0526841
 0.06578103 0.06440268 0.01955713 0.01697949 0.01683655 0.016696
 0.01655777 0.01642181 0.11958729 0.04650426 0.08091043 0.12738856
 0.05456103 0.09961419 0.13726666 0.09426804 0.06400971 0.05572259
 0.07503972 0.0633563  0.01281332 0.0171869  0.01699606 0.01680855
 0.01662429 0.01273596 0.01265537 0.01257579 0.01249721 0.01644623
 0.10612754 0.05603252 0.03405299 0.10033849 0.05809118 0.01624574
 0.01243421 0.01235738 0.0122815  0.01220654 0.01213249 0.01205934
 0.01607638 0.0159056  0.01198885 0.01191741 0.01184682 0.01177705
 0.01574024 0.01557497]
