In [49]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

## Data Collection

In [50]:
df = pd.read_csv('../Data/Travel.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


## Data Cleaning
- Handling Missing Values
- Handling Duplicates
- Checking Data Types
- Understanding the dataset

In [51]:
df.shape

(4888, 20)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4662 non-null   float64
 3   TypeofContact             4863 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4637 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4843 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4862 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4748 non-null   float64
 14  Passport

In [53]:
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [54]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [55]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [56]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')
df['MaritalStatus'] = df['MaritalStatus'].replace('Single', 'Unmarried')

In [57]:
# Missing Values
features_with_na = [features for features in df.columns if df[features].isnull().sum() > 0]
features_with_na

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [58]:
df[features_with_na].describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


# Imputing Null values
- Median - Age, Duration of Pitch, NumberOfTrips, MonthlyIncome
- Mode - TypeOfContract, NumberOfFollowUp, PreferredPropertyStart, NumberofChildrenvISITING

In [59]:
df.Age.fillna(df.Age.median(), inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)

df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Age.fillna(df.Age.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4888 non-null   float64
 3   TypeofContact             4888 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4888 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4888 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4888 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4888 non-null   float64
 14  Passport

In [61]:
df.drop('CustomerID', axis=1, inplace=True)

In [62]:
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(['NumberOfPersonVisiting', 'NumberOfChildrenVisiting'], axis=1, inplace=True)

In [63]:
numeric_features = [features for features in df.columns if df[features].dtypes != 'O']
print('Number of numerical variables: ', len(numeric_features))
numeric_features

Number of numerical variables:  12


['ProdTaken',
 'Age',
 'CityTier',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'MonthlyIncome',
 'TotalVisiting']

In [64]:
categorical_features = [features for features in df.columns if df[features].dtypes == 'O']
print('Number of categorical variables: ', len(categorical_features))
categorical_features

Number of categorical variables:  6


['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [65]:
discrete_features = [features for features in numeric_features if len(df[features].unique()) <= 25]
print('Discrete Variables Count: ', len(discrete_features))
discrete_features

Discrete Variables Count:  9


['ProdTaken',
 'CityTier',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'TotalVisiting']

In [66]:
continuous_features = [features for features in numeric_features if features not in discrete_features]
print('Continuous feature Count: ', len(continuous_features))
continuous_features

Continuous feature Count:  3


['Age', 'DurationOfPitch', 'MonthlyIncome']

## Train Test Split

In [67]:
from sklearn.model_selection import train_test_split
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']

In [68]:
X.shape, y.shape

((4888, 17), (4888,))

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((3910, 17), (978, 17))

In [70]:
categorical_features = [features for features in X.columns if X[features].dtypes == 'O']
numeric_features = [features for features in X.columns if X[features].dtypes != 'O']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe_encoder = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    [
        ('OneHotEnckder', ohe_encoder, categorical_features),
        ('StandardScaler', numeric_transformer, numeric_features)
    ]
)

In [71]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [72]:
preprocessor

In [78]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [79]:
models = {
    'Decision Tree': DecisionTreeClassifier(),
    'RandomForestClassifier': RandomForestClassifier(),
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'GaussianNB': GaussianNB()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(list(models.keys())[i])
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred))
    print('F1 Score: ', f1_score(y_test, y_pred))
    print('ROC AUC Score: ', roc_auc_score(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))
    print('\n')

Decision Tree
Accuracy:  0.9253578732106339
Precision:  0.8128654970760234
Recall:  0.7722222222222223
F1 Score:  0.792022792022792
ROC AUC Score:  0.8660609857978279
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.96      0.95       798
           1       0.81      0.77      0.79       180

    accuracy                           0.93       978
   macro avg       0.88      0.87      0.87       978
weighted avg       0.92      0.93      0.92       978



RandomForestClassifier
Accuracy:  0.9263803680981595
Precision:  0.9821428571428571
Recall:  0.6111111111111112
F1 Score:  0.7534246575342466
ROC AUC Score:  0.8043024227234754
Classification Report: 
               precision    recall  f1-score   support

           0       0.92      1.00      0.96       798
           1       0.98      0.61      0.75       180

    accuracy                           0.93       978
   macro avg       0.95      0.80      0.86       978
weigh

In [87]:
from sklearn.model_selection import RandomizedSearchCV

In [88]:
params_grid = {
    'max_depth': [3, 5, 7, 9, 11],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'n_estimators': [100, 200, 300, 400, 500]
}

In [90]:
grid_model = RandomizedSearchCV(RandomForestClassifier(), params_grid, cv=5, n_jobs=-1, verbose=2)
grid_model.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=3, min_samples_leaf=2, min_samples_split=2, n_estimators=500; total time=   0.7s
[CV] END max_depth=9, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END max_depth=9, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END max_depth=9, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END max_depth=9, min_samples_leaf=4, min_samples_split=2, n_estimators=400; total time=   0.9s
[CV] END max_depth=9, min_samples_leaf=

In [91]:
grid_model.best_params_

{'n_estimators': 400,
 'min_samples_split': 3,
 'min_samples_leaf': 1,
 'max_depth': 11}

In [92]:
models = {
    'RandomForestClassifier': RandomForestClassifier(max_depth=11, min_samples_split=3, min_samples_leaf=1, n_estimators=400)
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(list(models.keys())[i])
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred))
    print('F1 Score: ', f1_score(y_test, y_pred))
    print('ROC AUC Score: ', roc_auc_score(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))
    print('\n')

RandomForestClassifier
Accuracy:  0.8926380368098159
Precision:  0.9310344827586207
Recall:  0.45
F1 Score:  0.6067415730337079
ROC AUC Score:  0.7212406015037593
Classification Report: 
               precision    recall  f1-score   support

           0       0.89      0.99      0.94       798
           1       0.93      0.45      0.61       180

    accuracy                           0.89       978
   macro avg       0.91      0.72      0.77       978
weighted avg       0.90      0.89      0.88       978



