In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('../Data/Travel.csv')

In [3]:
df.shape

(4888, 20)

In [4]:
df['Gender'] = df['Gender'].replace('Fe Male', 'Female')
df['MaritalStatus'] = df['MaritalStatus'].replace('Single', 'Unmarried')

In [5]:
features_with_na = [features for features in df.columns if df[features].isnull().sum() > 0]
features_with_na

['Age',
 'TypeofContact',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'NumberOfChildrenVisiting',
 'MonthlyIncome']

In [6]:
df[features_with_na].describe()

Unnamed: 0,Age,DurationOfPitch,NumberOfFollowups,PreferredPropertyStar,NumberOfTrips,NumberOfChildrenVisiting,MonthlyIncome
count,4662.0,4637.0,4843.0,4862.0,4748.0,4822.0,4655.0
mean,37.622265,15.490835,3.708445,3.581037,3.236521,1.187267,23619.853491
std,9.316387,8.519643,1.002509,0.798009,1.849019,0.857861,5380.698361
min,18.0,5.0,1.0,3.0,1.0,0.0,1000.0
25%,31.0,9.0,3.0,3.0,2.0,1.0,20346.0
50%,36.0,13.0,4.0,3.0,3.0,1.0,22347.0
75%,44.0,20.0,4.0,4.0,4.0,2.0,25571.0
max,61.0,127.0,6.0,5.0,22.0,3.0,98678.0


In [7]:
df.Age.fillna(df.Age.median(), inplace=True)
df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
df.NumberOfTrips.fillna(df.NumberOfTrips.median(), inplace=True)
df.MonthlyIncome.fillna(df.MonthlyIncome.median(), inplace=True)

df.TypeofContact.fillna(df.TypeofContact.mode()[0], inplace=True)
df.NumberOfFollowups.fillna(df.NumberOfFollowups.mode()[0], inplace=True)
df.PreferredPropertyStar.fillna(df.PreferredPropertyStar.mode()[0], inplace=True)
df.NumberOfChildrenVisiting.fillna(df.NumberOfChildrenVisiting.mode()[0], inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.Age.fillna(df.Age.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df.DurationOfPitch.fillna(df.DurationOfPitch.median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which w

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4888 entries, 0 to 4887
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   CustomerID                4888 non-null   int64  
 1   ProdTaken                 4888 non-null   int64  
 2   Age                       4888 non-null   float64
 3   TypeofContact             4888 non-null   object 
 4   CityTier                  4888 non-null   int64  
 5   DurationOfPitch           4888 non-null   float64
 6   Occupation                4888 non-null   object 
 7   Gender                    4888 non-null   object 
 8   NumberOfPersonVisiting    4888 non-null   int64  
 9   NumberOfFollowups         4888 non-null   float64
 10  ProductPitched            4888 non-null   object 
 11  PreferredPropertyStar     4888 non-null   float64
 12  MaritalStatus             4888 non-null   object 
 13  NumberOfTrips             4888 non-null   float64
 14  Passport

In [9]:
df.drop('CustomerID', axis=1, inplace=True)

In [10]:
df['TotalVisiting'] = df['NumberOfPersonVisiting'] + df['NumberOfChildrenVisiting']
df.drop(['NumberOfPersonVisiting', 'NumberOfChildrenVisiting'], axis=1, inplace=True)

In [11]:
numeric_features = [features for features in df.columns if df[features].dtypes != 'O']
print('Number of numerical variables: ', len(numeric_features))
numeric_features

Number of numerical variables:  12


['ProdTaken',
 'Age',
 'CityTier',
 'DurationOfPitch',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'MonthlyIncome',
 'TotalVisiting']

In [12]:
categorical_features = [features for features in df.columns if df[features].dtypes == 'O']
print('Number of categorical variables: ', len(categorical_features))
categorical_features

Number of categorical variables:  6


['TypeofContact',
 'Occupation',
 'Gender',
 'ProductPitched',
 'MaritalStatus',
 'Designation']

In [13]:
discrete_features = [features for features in numeric_features if len(df[features].unique()) <= 25]
print('Discrete Variables Count: ', len(discrete_features))
discrete_features

Discrete Variables Count:  9


['ProdTaken',
 'CityTier',
 'NumberOfFollowups',
 'PreferredPropertyStar',
 'NumberOfTrips',
 'Passport',
 'PitchSatisfactionScore',
 'OwnCar',
 'TotalVisiting']

In [14]:
continuous_features = [features for features in numeric_features if features not in discrete_features]
print('Continuous feature Count: ', len(continuous_features))
continuous_features

Continuous feature Count:  3


['Age', 'DurationOfPitch', 'MonthlyIncome']

In [15]:
from sklearn.model_selection import train_test_split
X = df.drop(['ProdTaken'], axis=1)
y = df['ProdTaken']

In [16]:
X.shape, y.shape

((4888, 17), (4888,))

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train.shape, X_test.shape

((3910, 17), (978, 17))

In [18]:
categorical_features = [features for features in X.columns if X[features].dtypes == 'O']
numeric_features = [features for features in X.columns if X[features].dtypes != 'O']

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
ohe_encoder = OneHotEncoder(drop="first")

preprocessor = ColumnTransformer(
    [
        ('OneHotEnckder', ohe_encoder, categorical_features),
        ('StandardScaler', numeric_transformer, numeric_features)
    ]
)

In [19]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [20]:
preprocessor

In [21]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_auc_score, roc_curve

In [22]:
models = {
    'DecisionTree': DecisionTreeClassifier(),
    'RandomForest': RandomForestClassifier(),
    'AdaBoost': AdaBoostClassifier(),
    'GradientBoosting': GradientBoostingClassifier()
}

In [23]:
for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(list(models.keys())[i])
    print('Accuracy: ', accuracy_score(y_test, y_pred))
    print('Precision: ', precision_score(y_test, y_pred))
    print('Recall: ', recall_score(y_test, y_pred))
    print('F1 Score: ', f1_score(y_test, y_pred))
    print('ROC AUC Score: ', roc_auc_score(y_test, y_pred))
    print('Classification Report: \n', classification_report(y_test, y_pred))
    print('\n')

DecisionTree
Accuracy:  0.9335378323108384
Precision:  0.844311377245509
Recall:  0.7833333333333333
F1 Score:  0.8126801152737753
ROC AUC Score:  0.8753759398496241
Classification Report: 
               precision    recall  f1-score   support

           0       0.95      0.97      0.96       798
           1       0.84      0.78      0.81       180

    accuracy                           0.93       978
   macro avg       0.90      0.88      0.89       978
weighted avg       0.93      0.93      0.93       978



RandomForest
Accuracy:  0.9284253578732107
Precision:  0.9508196721311475
Recall:  0.6444444444444445
F1 Score:  0.7682119205298014
ROC AUC Score:  0.8184628237259816
Classification Report: 
               precision    recall  f1-score   support

           0       0.93      0.99      0.96       798
           1       0.95      0.64      0.77       180

    accuracy                           0.93       978
   macro avg       0.94      0.82      0.86       978
weighted avg    

In [24]:
from sklearn.model_selection import RandomizedSearchCV

gradient_classifier = GradientBoostingClassifier()

param_dist = {
    'n_estimators': [50, 100, 150, 200, 250, 300],
    'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5],
    'max_features': ['auto', 'sqrt', 'log2'],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

random_search = RandomizedSearchCV(gradient_classifier, param_distributions=param_dist, n_iter=10, cv=5, n_jobs=-1, verbose=3)

In [25]:
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5] END learning_rate=0.001, max_depth=3, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=250, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.001, max_depth=3, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=250, subsample=0.8;, score=nan total time=   0.0s
[CV 4/5] END learning_rate=0.3, max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=150, subsample=0.7;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.001, max_depth=3, max_features=auto, min_samples_leaf=3, min_samples_split=5, n_estimators=250, subsample=0.8;, score=nan total time=   0.0s
[CV 5/5] END learning_rate=0.3, max_depth=8, max_features=auto, min_samples_leaf=5, min_samples_split=4, n_estimators=150, subsample=0.7;, score=nan total time=   0.0s
[CV 2/5] END learning_rate=0.3, max_depth=8, max_features=auto, min_samples_leaf=5, min_sampl

25 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
18 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 1382, in wrapper
    estimator._validate_params()
  File "/Users/satviksawhney/Downloads/AIML/Machine Learning/.venv/lib/python3.12/site-packages/sklearn/base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "/Users/satviks

In [26]:
random_search.best_params_

{'subsample': 1.0,
 'n_estimators': 150,
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_features': 'sqrt',
 'max_depth': 8,
 'learning_rate': 0.1}

In [27]:
random_search.best_score_

np.float64(0.9332480818414322)