In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sn


# Data Cleaning and Processing

In [2]:
# Load the customer-churn dataset into F1.
# The dataset folder can be overridden with the CHURN_DATA_DIR environment
# variable so the notebook is not tied to one machine; the original local
# folder is kept as the default so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get('CHURN_DATA_DIR', 'C:/Users/Rahul/Desktop/PROJECT 1/datasets'))
F1 = pd.read_csv(DATA_DIR / 'customer_churn_dataset-testing-master.csv')

In [3]:
# Preview the first rows of the loaded frame to sanity-check columns and values.
F1.head()

Unnamed: 0,CustomerID,Age,Gender,Tenure,Usage Frequency,Support Calls,Payment Delay,Subscription Type,Contract Length,Total Spend,Last Interaction,Churn
0,1,22,Female,25,14,4,27,Basic,Monthly,598,9,1
1,2,41,Female,28,28,7,13,Standard,Monthly,584,20,0
2,3,47,Male,27,10,2,29,Premium,Annual,757,21,0
3,4,35,Male,9,12,5,17,Premium,Quarterly,232,18,0
4,5,53,Female,58,24,9,2,Standard,Annual,533,18,0


In [4]:
# Summarize column dtypes, non-null counts, and memory usage of the loaded frame.
F1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64374 entries, 0 to 64373
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   CustomerID         64374 non-null  int64 
 1   Age                64374 non-null  int64 
 2   Gender             64374 non-null  object
 3   Tenure             64374 non-null  int64 
 4   Usage Frequency    64374 non-null  int64 
 5   Support Calls      64374 non-null  int64 
 6   Payment Delay      64374 non-null  int64 
 7   Subscription Type  64374 non-null  object
 8   Contract Length    64374 non-null  object
 9   Total Spend        64374 non-null  int64 
 10  Last Interaction   64374 non-null  int64 
 11  Churn              64374 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 5.9+ MB


In [15]:
# Normalize the multi-word column names to snake-style identifiers (spaces -> underscores).
# Reassign instead of using inplace=True: the resulting columns are the same, but the
# frame shown by earlier cells is not mutated behind the reader's back and the call
# stays chainable.
F1 = F1.rename(columns={
    'Usage Frequency': 'Usage_Frequency',
    'Support Calls': 'Support_Calls',
    'Payment Delay': 'Payment_Delay',
    'Subscription Type': 'Subscription_Type',
    'Contract Length': 'Contract_Length',
    'Total Spend': 'Total_Spend',
    'Last Interaction': 'Last_Interaction',
})
# Display the resulting column index to confirm the new names.
F1.columns

Index(['CustomerID', 'Age', 'Gender', 'Tenure', 'Usage_Frequency',
       'Support_Calls', 'Payment_Delay', 'Subscription_Type',
       'Contract_Length', 'Total_Spend', 'Last_Interaction', 'Churn'],
      dtype='object')

# Pipeline method for Model Deployment

In [7]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
# Hold out 20% of the rows for testing. CustomerID is dropped from the features
# (alongside the Churn target itself) and Churn is used as the label.
X_train, X_test, y_train, y_test = train_test_split(
    F1.drop(columns=['CustomerID', 'Churn']),
    F1['Churn'],
    test_size=0.2,
    random_state=42,
)

# Partition the feature columns by dtype: object/category columns are treated
# as categorical, every other column as numerical.
categorical_features = X_train.select_dtypes(include=['object', 'category']).columns
numerical_features = X_train.select_dtypes(exclude=['object', 'category']).columns

# One-hot encode the categorical columns (categories unseen during fit are
# ignored at transform time) and standardize the numerical columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('num', StandardScaler(), numerical_features),
    ]
)


In [8]:
from sklearn.pipeline import Pipeline

# Chain preprocessing and the classifier into a single estimator so the same
# transformations are applied at fit time and at predict time.
pipline = Pipeline(
    [
        ('preprocessing', preprocessor),
        (
            'model',
            XGBClassifier(
                n_estimators=100,
                learning_rate=0.1,
                max_depth=5,
                random_state=42,
            ),
        ),
    ]
)

In [9]:
# Confirm the training features and labels contain the same number of rows.
X_train.shape,y_train.shape

((51499, 10), (51499,))

In [10]:
# Check dtypes and non-null counts of the training features after the split.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 51499 entries, 13468 to 56422
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Age                51499 non-null  int64 
 1   Gender             51499 non-null  object
 2   Tenure             51499 non-null  int64 
 3   Usage_Frequency    51499 non-null  int64 
 4   Support_Calls      51499 non-null  int64 
 5   Payment_Delay      51499 non-null  int64 
 6   Subscription_Type  51499 non-null  object
 7   Contract_Length    51499 non-null  object
 8   Total_Spend        51499 non-null  int64 
 9   Last_Interaction   51499 non-null  int64 
dtypes: int64(7), object(3)
memory usage: 4.3+ MB


In [11]:
# Show which columns were selected as categorical (these go to the one-hot encoder).
print(categorical_features)

Index(['Gender', 'Subscription_Type', 'Contract_Length'], dtype='object')


In [12]:
# Fit the preprocessing steps and the XGBoost classifier on the training split.
pipline.fit(X_train,y_train)

0,1,2
,steps,"[('preprocessing', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [13]:
# Predict churn labels for the held-out test split using the fitted pipeline.
pipline.predict(X_test)

array([0, 0, 1, ..., 1, 0, 0], shape=(12875,))

In [14]:
import pickle
# Serialize the fitted pipeline (preprocessing + model) to disk for deployment.
# NOTE: pickle files must only be loaded from trusted sources.
with open("model_pipeline.pkl", "wb") as model_file:
    pickle.dump(pipline, model_file)