In [24]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv("data1.csv", sep = ";")

In [5]:
df.head(5)

Unnamed: 0,account length,location code,user id,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,avg order value,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn
0,128,415,3824657,no,yes,25,265,45,17,110,197,87,2447,91,1101,10,3,27,1,0
1,107,415,3717191,no,yes,26,162,27,17,123,196,103,2544,103,1145,137,3,37,1,0
2,137,415,3581921,no,no,0,243,41,10,114,121,110,1626,104,732,122,5,329,0,0
3,84,408,3759999,yes,no,0,299,51,5,71,62,88,1969,89,886,66,7,178,2,0
4,75,415,3306626,yes,no,0,167,28,13,113,148,122,1869,121,841,101,3,273,3,0


In [6]:
df.shape

(3333, 20)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3333 entries, 0 to 3332
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   account length                       3333 non-null   int64 
 1   location code                        3333 non-null   int64 
 2   user id                              3333 non-null   int64 
 3   credit card info save                3333 non-null   object
 4   push status                          3333 non-null   object
 5   add to wishlist                      3333 non-null   int64 
 6   desktop sessions                     3333 non-null   int64 
 7   app sessions                         3333 non-null   int64 
 8   desktop transactions                 3333 non-null   int64 
 9   total product detail views           3333 non-null   int64 
 10  session duration                     3333 non-null   int64 
 11  promotion clicks                     3333 n

In [8]:
 df["location code"] = df["location code"].astype(str)

In [9]:
df['credit card info save'] = df["credit card info save"].replace({"yes":1,"no":0})
df['push status'] = df["push status"].replace({"yes":1,"no":0})

In [10]:
df["credit card info save"].unique()

array([0, 1])

In [11]:
df["avg order value"]= df["avg order value"].str.replace(',','.').astype(float)
df["discount rate per visited products"]= df["discount rate per visited products"].str.replace(',','.').astype(float)
df["product detail view per app session"]= df["product detail view per app session"].str.replace(',','.').astype(float)
df["add to cart per session"]= df["add to cart per session"].str.replace(',','.').astype(float)

In [12]:
df = pd.get_dummies(df, columns=["location code"])

In [13]:
df = df.drop("user id",axis = 1)

In [14]:
df.head(5)

Unnamed: 0,account length,credit card info save,push status,add to wishlist,desktop sessions,app sessions,desktop transactions,total product detail views,session duration,promotion clicks,...,sale product views,discount rate per visited products,product detail view per app session,app transactions,add to cart per session,customer service calls,churn,location code_408,location code_415,location code_510
0,128,0,1,25,265,45,17,110,197,87,...,91,11.01,10.0,3,2.7,1,0,0,1,0
1,107,0,1,26,162,27,17,123,196,103,...,103,11.45,13.7,3,3.7,1,0,0,1,0
2,137,0,0,0,243,41,10,114,121,110,...,104,7.32,12.2,5,3.29,0,0,0,1,0
3,84,1,0,0,299,51,5,71,62,88,...,89,8.86,6.6,7,1.78,2,0,1,0,0
4,75,1,0,0,167,28,13,113,148,122,...,121,8.41,10.1,3,2.73,3,0,0,1,0


In [15]:
df.columns

Index(['account length', 'credit card info save', 'push status',
       'add to wishlist', 'desktop sessions', 'app sessions',
       'desktop transactions', 'total product detail views',
       'session duration', 'promotion clicks', 'avg order value',
       'sale product views', 'discount rate per visited products',
       'product detail view per app session', 'app transactions',
       'add to cart per session', 'customer service calls', 'churn',
       'location code_408', 'location code_415', 'location code_510'],
      dtype='object')

In [18]:
cals_to_scale = ['account length',
       'add to wishlist', 'desktop sessions', 'app sessions',
       'desktop transactions', 'total product detail views',
       'session duration', 'promotion clicks', 'avg order value',
       'sale product views', 'discount rate per visited products',
       'product detail view per app session', 'app transactions',
       'add to cart per session', 'customer service calls']
scaler = Normalizer()
scaled_data = scaler.fit_transform(df[cals_to_scale])
scaled_df = pd.DataFrame(scaled_data, index=df.index, columns = cals_to_scale)

In [19]:
df = df.drop(cals_to_scale,axis=1)
df = pd.merge(df,scaled_df,left_index=True, right_index=True)

In [20]:
x = df.drop("churn",axis=1)
y = df["churn"]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.33, random_state=42)

In [21]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(2233, 20) (1100, 20) (2233,) (1100,)


In [22]:
#building model
xgb_cl = xgb.XGBClassifier()
xgb_cl.fit(x_train,y_train)

In [23]:
preds = xgb_cl.predict(x_test)

In [25]:
acc = accuracy_score(y_test, preds)
sent = "model accuracy on the test dataset {}"
print(sent.format(acc))

model accuracy on the test dataset 0.92


In [26]:
param_grid = {
    "max_depth": [5],
    "learning_rate": [0, 0.01, 0.05, 0.1],
    "gamma": [1, 5, 10],
    "scale_ps_weight": [2, 5, 10, 20],
    "subsample": [1],
    "colsample_bytree": [1]
}
xgb_cl2 = xgb.XGBClassifier(objective="binary:logistic")
grid_cv = GridSearchCV(xgb_cl2, param_grid, n_jobs = -1, cv = 3, scoring = "roc_auc")
_  = grid_cv.fit(x_train, y_train)
print("The best score:", grid_cv.best_score_)
print("The Best Params:", grid_cv.best_params_)

The best score: 0.871807782261238
The Best Params: {'colsample_bytree': 1, 'gamma': 1, 'learning_rate': 0.1, 'max_depth': 5, 'scale_ps_weight': 2, 'subsample': 1}


In [29]:
final_cl = xgb.XGBClassifier(
    **grid_cv.best_params_, objective = "binary:logistic"
)
grid_final = final_cl.fit(x_train,y_train)
preds = grid_final.predict(x_test)
acc = accuracy_score(y_test,preds)
print("Accuracy of the Final Model:",acc)

Accuracy of the Final Model: 0.9136363636363637
