In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')

In [3]:
df = pd.read_csv('Travel.csv')
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Single,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Single,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


## Data Cleaning

1. Handling missing values
2. Handling duplicates
3. Check data type
4. Understand the dataset

In [4]:
df.dtypes

CustomerID                    int64
ProdTaken                     int64
Age                         float64
TypeofContact                object
CityTier                      int64
DurationOfPitch             float64
Occupation                   object
Gender                       object
NumberOfPersonVisiting        int64
NumberOfFollowups           float64
ProductPitched               object
PreferredPropertyStar       float64
MaritalStatus                object
NumberOfTrips               float64
Passport                      int64
PitchSatisfactionScore        int64
OwnCar                        int64
NumberOfChildrenVisiting    float64
Designation                  object
MonthlyIncome               float64
dtype: object

In [5]:
# check for missing values

df.isnull().sum()

CustomerID                    0
ProdTaken                     0
Age                         226
TypeofContact                25
CityTier                      0
DurationOfPitch             251
Occupation                    0
Gender                        0
NumberOfPersonVisiting        0
NumberOfFollowups            45
ProductPitched                0
PreferredPropertyStar        26
MaritalStatus                 0
NumberOfTrips               140
Passport                      0
PitchSatisfactionScore        0
OwnCar                        0
NumberOfChildrenVisiting     66
Designation                   0
MonthlyIncome               233
dtype: int64

In [6]:
## check all the categories
df['Gender'].value_counts()

Gender
Male       2916
Female     1817
Fe Male     155
Name: count, dtype: int64

In [7]:
df['MaritalStatus'].value_counts()

MaritalStatus
Married      2340
Divorced      950
Single        916
Unmarried     682
Name: count, dtype: int64

In [8]:
df['TypeofContact'].value_counts()

TypeofContact
Self Enquiry       3444
Company Invited    1419
Name: count, dtype: int64

In [9]:
df['Designation'].value_counts()

Designation
Executive         1842
Manager           1732
Senior Manager     742
AVP                342
VP                 230
Name: count, dtype: int64

In [10]:
df['Occupation'].value_counts()

Occupation
Salaried          2368
Small Business    2084
Large Business     434
Free Lancer          2
Name: count, dtype: int64

In [11]:
df['Gender'] = df['Gender'].replace({'Fe Male':'Female'})

In [12]:
df['Gender'].value_counts()

Gender
Male      2916
Female    1972
Name: count, dtype: int64

In [13]:
df['MaritalStatus'] = df['MaritalStatus'].replace({'Single':'Unmarried'})

In [14]:
df.head()

Unnamed: 0,CustomerID,ProdTaken,Age,TypeofContact,CityTier,DurationOfPitch,Occupation,Gender,NumberOfPersonVisiting,NumberOfFollowups,ProductPitched,PreferredPropertyStar,MaritalStatus,NumberOfTrips,Passport,PitchSatisfactionScore,OwnCar,NumberOfChildrenVisiting,Designation,MonthlyIncome
0,200000,1,41.0,Self Enquiry,3,6.0,Salaried,Female,3,3.0,Deluxe,3.0,Unmarried,1.0,1,2,1,0.0,Manager,20993.0
1,200001,0,49.0,Company Invited,1,14.0,Salaried,Male,3,4.0,Deluxe,4.0,Divorced,2.0,0,3,1,2.0,Manager,20130.0
2,200002,1,37.0,Self Enquiry,1,8.0,Free Lancer,Male,3,4.0,Basic,3.0,Unmarried,7.0,1,3,0,0.0,Executive,17090.0
3,200003,0,33.0,Company Invited,1,9.0,Salaried,Female,2,3.0,Basic,3.0,Divorced,2.0,1,5,1,1.0,Executive,17909.0
4,200004,0,,Self Enquiry,1,8.0,Small Business,Male,2,3.0,Basic,4.0,Divorced,1.0,0,5,1,0.0,Executive,18468.0


In [15]:
# check for missing values
features_nan = [feature for feature in df.columns if df[feature].isna().sum()>0]
for feature in features_nan:
    print(f'{feature} : {round((df[feature].isnull().mean())*100,3)} % missimg values')

Age : 4.624 % missimg values
TypeofContact : 0.511 % missimg values
DurationOfPitch : 5.135 % missimg values
NumberOfFollowups : 0.921 % missimg values
PreferredPropertyStar : 0.532 % missimg values
NumberOfTrips : 2.864 % missimg values
NumberOfChildrenVisiting : 1.35 % missimg values
MonthlyIncome : 4.767 % missimg values


In [16]:
# statistical on numerical columns

df[features_nan].select_dtypes(exclude='O').describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


### Imputing Null Values

1. Impute Meadian value for Age column
2. Impute Mode value for TypeofContract
3. Impute Meadian value for DurationOfPitch
4. Impute Mode value for NumberofFollowup as it is discrete feature
5. Impute Mode value for PreferredPropertyStar
6. Impute Meadian value for NumberOfTrips
7. Impute Mode value for NumberOfChildren Visiting
8. Impute Meadian value for MonthlyIncome

In [17]:
#Age
df.Age.fillna(df.Age.mean(), inplace=True)

In [18]:
#TypeofContract
df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)

In [19]:
#DurationOfPitch
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)

In [20]:
#NumberOfFollowups
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)

In [21]:
#PreferredPropertyStar
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.median(), inplace=True)

In [22]:
#NumberOfTrips
df.NumberOfTrips.fillna(df.NumberOfTrips.mode()[0], inplace=True)

In [23]:
#NumberOfChildrenVisiting
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)

In [24]:
#MonthlyIncome
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)

In [25]:
df.drop(columns='CustomerID', inplace=True)

In [26]:
df.isnull().sum()

ProdTaken                   0
Age                         0
TypeofContact               0
CityTier                    0
DurationOfPitch             0
Occupation                  0
Gender                      0
NumberOfPersonVisiting      0
NumberOfFollowups           0
ProductPitched              0
PreferredPropertyStar       0
MaritalStatus               0
NumberOfTrips               0
Passport                    0
PitchSatisfactionScore      0
OwnCar                      0
NumberOfChildrenVisiting    0
Designation                 0
MonthlyIncome               0
dtype: int64

## Feature Engineering

### Feature Extraction

In [27]:
# create new column TotalVisiting
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(columns=['NumberOfPersonVisiting','NumberOfChildrenVisiting'], inplace=True)

In [28]:
# get all the numeric features

num_cols = [feature for feature in df.columns if df[feature].dtype !='O']
print('Number of numerical features:', len(num_cols))

Number of numerical features: 12


In [29]:
cat_cols = [feature for feature in df.columns if df[feature].dtype =='O']
print('Number of categorical features:', len(cat_cols))

Number of categorical features: 6


In [30]:
## discrete features

discrete = [feature for feature in num_cols if len(df[feature].unique())<=25]
print('Number of discrete features:', len(discrete))

Number of discrete features: 9


In [31]:
## continuous features

continuous = [feature for feature in num_cols if feature not in discrete]
print('Number of continuous features:', len(continuous))

Number of continuous features: 3


## Train Test Split

In [32]:
from sklearn.model_selection import train_test_split

X = df.drop(columns='ProdTaken')
y = df['ProdTaken']

In [33]:
print(X.columns)
print(num_cols)
print(cat_cols)

Index(['Age', 'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation',
       'Gender', 'NumberOfFollowups', 'ProductPitched',
       'PreferredPropertyStar', 'MaritalStatus', 'NumberOfTrips', 'Passport',
       'PitchSatisfactionScore', 'OwnCar', 'Designation', 'MonthlyIncome',
       'TotalVisiting'],
      dtype='object')
['ProdTaken', 'Age', 'CityTier', 'DurationOfPitch', 'NumberOfFollowups', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'OwnCar', 'MonthlyIncome', 'TotalVisiting']
['TypeofContact', 'Occupation', 'Gender', 'ProductPitched', 'MaritalStatus', 'Designation']


In [34]:
y.value_counts()

ProdTaken
0    3968
1     920
Name: count, dtype: int64

In [35]:
# separate dataset into train and test

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape

((3910, 17), (978, 17))

In [36]:
# Create column transformer with 3 types of transformers
cat_features = X.select_dtypes(include='O').columns
num_features = X.select_dtypes(exclude='O').columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder(drop='first')

preprocessor = ColumnTransformer([
    ('OneHotEncoder', oh_transformer, cat_features),
    ('StandardScaler', numeric_transformer, num_features)
])

In [37]:
preprocessor

In [38]:
X_train = preprocessor.fit_transform(X_train)

In [39]:
pd.set_option('display.max_columns', None)
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-0.165989,-0.721400,-1.020350,1.284279,-0.725271,-0.111835,-0.632399,0.679690,0.782966,-0.382245,-0.774151
1,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.500391,-0.721400,0.690023,0.282777,-0.725271,1.517765,-0.632399,0.679690,0.782966,-0.459799,0.643615
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.388898,-0.721400,-1.020350,0.282777,1.771041,0.431365,-0.632399,0.679690,0.782966,-0.245196,-0.065268
3,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.056138,-0.721400,-1.020350,1.284279,-0.725271,-0.111835,-0.632399,1.408395,-1.277194,0.213475,-0.065268
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.722518,-0.721400,2.400396,-1.720227,-0.725271,1.517765,-0.632399,-0.049015,-1.277194,-0.024889,2.061382
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3905,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,-1.054495,-0.721400,-0.653841,1.284279,-0.725271,-0.655035,-0.632399,-1.506426,0.782966,-0.536973,0.643615
3906,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.389328,1.455047,-0.898180,-0.718725,1.771041,-1.198235,-0.632399,1.408395,0.782966,1.529609,-0.065268
3907,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.056138,1.455047,1.545210,0.282777,-0.725271,2.060964,-0.632399,-0.777720,0.782966,-0.360576,0.643615
3908,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,-1.054495,1.455047,1.789549,1.284279,-0.725271,-0.111835,-0.632399,-1.506426,0.782966,-0.252799,0.643615


In [40]:
## applt transformation on test data using transform
X_test = preprocessor.transform(X_test)

## Random Forest classifier training

In [43]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, ConfusionMatrixDisplay, roc_auc_score

In [44]:
models = {
    'Logistic Regresion': LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    'Random Forest':RandomForestClassifier(),
    'Adaboost': AdaBoostClassifier(),
    'Gradient Boosting': GradientBoostingClassifier()
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_f1_score = f1_score(y_train, y_train_pred)
    model_train_rocauc = roc_auc_score(y_train, y_train_pred)
    
    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_f1_score = f1_score(y_test, y_test_pred)
    model_test_rocauc = roc_auc_score(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    print('----------------------------------------------')
    
    print('Model performance for Training set')
    print("-Accuracy: {:.4f}".format(model_train_accuracy))
    print("-Precision: {:.4f}".format(model_train_precision))
    print("-Recall: {:.4f}".format(model_train_recall))
    print("-f1_score: {:.4f}".format(model_train_f1_score))
    print("-ROC-AUC: {:.4f}".format(model_train_rocauc))
    
    print('----------------------------------------------')
    print('Model performance for Test set')
    print("-Accuracy: {:.4f}".format(model_test_accuracy))
    print("-Precision: {:.4f}".format(model_test_precision))
    print("-Recall: {:.4f}".format(model_test_recall))
    print("-f1_score: {:.4f}".format(model_test_f1_score))
    print("-ROC-AUC: {:.4f}".format(model_test_rocauc))
    print('=============================================')
    print()
    

Logistic Regresion
----------------------------------------------
Model performance for Training set
-Accuracy: 0.8460
-Precision: 0.7003
-Recall: 0.3045
-f1_score: 0.4245
-ROC-AUC: 0.6373
----------------------------------------------
Model performance for Test set
-Accuracy: 0.8354
-Precision: 0.6829
-Recall: 0.2932
-f1_score: 0.4103
-ROC-AUC: 0.6301

Decision Tree
----------------------------------------------
Model performance for Training set
-Accuracy: 1.0000
-Precision: 1.0000
-Recall: 1.0000
-f1_score: 1.0000
-ROC-AUC: 1.0000
----------------------------------------------
Model performance for Test set
-Accuracy: 0.9182
-Precision: 0.7968
-Recall: 0.7801
-f1_score: 0.7884
-ROC-AUC: 0.8659

Random Forest
----------------------------------------------
Model performance for Training set
-Accuracy: 1.0000
-Precision: 1.0000
-Recall: 1.0000
-f1_score: 1.0000
-ROC-AUC: 1.0000
----------------------------------------------
Model performance for Test set
-Accuracy: 0.9284
-Precision: 0

In [49]:
## Hyperparameter Tuning

rf_params = {
    'max_depth':[5, 8, 10, 15, None],
    'max_features':[5, 7, 8, None],
    'min_samples_split':[2, 8, 15, 20],
    'n_estimators':[100, 200, 500, 1000]
}

gradient_params = {
    'loss': ['log_loss','deviance','exponential'],
    'criterion':['friedman_mse','squared_error','mse'],
    'min_samples_split':[2,8,15,20],
    'n_estimators':[100,200,500,1000],
    'max_depth':[5,8,10,15,None]
}

In [50]:
# models list for hyperparameter tuning
randomcv_models = [
    ('RF', RandomForestClassifier(), rf_params),
    ('GradientBoost', GradientBoostingClassifier(), gradient_params)
]

In [51]:
randomcv_models

[('RF',
  RandomForestClassifier(),
  {'max_depth': [5, 8, 10, 15, None],
   'max_features': [5, 7, 8, None],
   'min_samples_split': [2, 8, 15, 20],
   'n_estimators': [100, 200, 500, 1000]}),
 ('GradientBoost',
  GradientBoostingClassifier(),
  {'loss': ['log_loss', 'deviance', 'exponential'],
   'criterion': ['friedman_mse', 'squared_error', 'mse'],
   'min_samples_split': [2, 8, 15, 20],
   'n_estimators': [100, 200, 500, 1000],
   'max_depth': [5, 8, 10, 15, None]})]

In [52]:
from sklearn.model_selection import RandomizedSearchCV

model_param = {}
for name, model, params in randomcv_models:
    random = RandomizedSearchCV(model, param_distributions=params,
                               n_iter = 100,
                               cv=3,
                               verbose=2,
                               n_jobs=-1)
    random.fit(X_train, y_train)
    model_param[name] = random.best_params_
    
for model_name in model_param:
    print(f"----------------------- Best Params for {model_name} -----------------------")
    print(model_param[model_name])

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
----------------------- Best Params for RF -----------------------
{'n_estimators': 1000, 'min_samples_split': 2, 'max_features': None, 'max_depth': None}
----------------------- Best Params for GradientBoost -----------------------
{'n_estimators': 200, 'min_samples_split': 15, 'max_depth': None, 'loss': 'log_loss', 'criterion': 'friedman_mse'}


In [53]:
models = {
    'Random Forest':RandomForestClassifier(n_estimators= 500, 
                                           min_samples_split= 2, 
                                           max_features= None, 
                                           max_depth= None),
    'GradientBoostclassifier':GradientBoostingClassifier(n_estimators=200, 
                                                         min_samples_split= 15, 
                                                         max_depth= None, 
                                                         loss='log_loss', 
                                                         criterion= 'friedman_mse')
}

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    
    # make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Training set performance
    model_train_accuracy = accuracy_score(y_train, y_train_pred)
    model_train_precision = precision_score(y_train, y_train_pred)
    model_train_recall = recall_score(y_train, y_train_pred)
    model_train_f1_score = f1_score(y_train, y_train_pred)
    model_train_rocauc = roc_auc_score(y_train, y_train_pred)
    
    # Test set performance
    model_test_accuracy = accuracy_score(y_test, y_test_pred)
    model_test_precision = precision_score(y_test, y_test_pred)
    model_test_recall = recall_score(y_test, y_test_pred)
    model_test_f1_score = f1_score(y_test, y_test_pred)
    model_test_rocauc = roc_auc_score(y_test, y_test_pred)
    
    print(list(models.keys())[i])
    print('----------------------------------------------')
    
    print('Model performance for Training set')
    print("-Accuracy: {:.4f}".format(model_train_accuracy))
    print("-Precision: {:.4f}".format(model_train_precision))
    print("-Recall: {:.4f}".format(model_train_recall))
    print("-f1_score: {:.4f}".format(model_train_f1_score))
    print("-ROC-AUC: {:.4f}".format(model_train_rocauc))
    
    print('----------------------------------------------')
    print('Model performance for Test set')
    print("-Accuracy: {:.4f}".format(model_test_accuracy))
    print("-Precision: {:.4f}".format(model_test_precision))
    print("-Recall: {:.4f}".format(model_test_recall))
    print("-f1_score: {:.4f}".format(model_test_f1_score))
    print("-ROC-AUC: {:.4f}".format(model_test_rocauc))
    print('=============================================')
    print()

Random Forest
----------------------------------------------
Model performance for Training set
-Accuracy: 1.0000
-Precision: 1.0000
-Recall: 1.0000
-f1_score: 1.0000
-ROC-AUC: 1.0000
----------------------------------------------
Model performance for Test set
-Accuracy: 0.9479
-Precision: 0.9545
-Recall: 0.7696
-f1_score: 0.8522
-ROC-AUC: 0.8804

GradientBoostclassifier
----------------------------------------------
Model performance for Training set
-Accuracy: 1.0000
-Precision: 1.0000
-Recall: 1.0000
-f1_score: 1.0000
-ROC-AUC: 1.0000
----------------------------------------------
Model performance for Test set
-Accuracy: 0.9581
-Precision: 0.9688
-Recall: 0.8115
-f1_score: 0.8832
-ROC-AUC: 0.9026



In [None]:
auc_models = [{
    'label': 'Random Forest Classifier',
    'model':RandomForestClassifier(n_estimators= 500, 
                                           min_samples_split= 2, 
                                           max_features= None, 
                                           max_depth= None),
    auc = roc_auc_score(y_test, y_test_pred)
}]

#create loop through all model
for algo in auc_models:
    model = algo['model']
    model.fit(X_train, y_train)
#calculate TPR and FNR