In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/customer_churn.csv')

In [3]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
df.isnull().sum()

Unnamed: 0,0
customerID,0
gender,0
SeniorCitizen,0
Partner,0
Dependents,0
tenure,0
PhoneService,0
MultipleLines,0
InternetService,0
OnlineSecurity,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df.columns

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [7]:
#Covert object datatype into numeric (float/int) by using pandas.

df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors = 'coerce')

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [9]:
df.dropna(inplace = True)

In [10]:
df.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
count,7032.0,7032.0,7032.0,7032.0
mean,0.1624,32.421786,64.798208,2283.300441
std,0.368844,24.54526,30.085974,2266.771362
min,0.0,1.0,18.25,18.8
25%,0.0,9.0,35.5875,401.45
50%,0.0,29.0,70.35,1397.475
75%,0.0,55.0,89.8625,3794.7375
max,1.0,72.0,118.75,8684.8


In [11]:
#Drop the column 'customerID' as it does not add value to our analysis.

df.drop(['customerID'], axis = 1, inplace = True) #axis = 1 --> column, axis = 0 --> row

**Performing Chi Square Test**

In [12]:
#Chi Square Test

from scipy.stats import chisquare
import scipy.stats
from scipy.stats import chi2_contingency

#Chi Square for independence for all object fields
col_list = list(df.columns)
col_list.remove('Churn')

#Create a DataFrame for the features p_values output
data = pd.DataFrame(columns = ['Features', 'P-values'])

for col in col_list:
  if(df[[col]][col].dtypes == 'object'):
    #Chi square test independence
    dataset_table = pd.crosstab(df[col], df['Churn'])

    #Observed Values
    observed_values = dataset_table.values
    val = chi2_contingency(dataset_table)

    #Expected Value
    expected_values = val[3]

    chi_square = sum([(o-e)**2/e for o,e in zip(observed_values, expected_values)])
    chi_square_statistics = chi_square[0] + chi_square[1]

    no_of_rows = len(dataset_table.iloc[0:2,0])
    no_of_columns = len(dataset_table.iloc[0,0:2])
    ddof = (no_of_rows -1) * (no_of_columns - 1)

    alpha = 0.5
    critical_value = scipy.stats.chi2.ppf(q = 1 - alpha , df = ddof)

    p_value = 1 - scipy.stats.chi2.sf(x = chi_square_statistics, df = ddof)

    data = pd.concat([data, pd.DataFrame([[col, p_value]],columns = ['Features', 'P-values'])],ignore_index = True)


In [13]:
data

Unnamed: 0,Features,P-values
0,gender,0.526335
1,Partner,1.0
2,Dependents,1.0
3,PhoneService,0.673114
4,MultipleLines,0.999213
5,InternetService,1.0
6,OnlineSecurity,1.0
7,OnlineBackup,1.0
8,DeviceProtection,1.0
9,TechSupport,1.0


In [14]:
df.drop(columns = ['gender', 'PhoneService'], inplace = True)

In [15]:
df.columns

Index(['SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges',
       'Churn'],
      dtype='object')

In [16]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7032 non-null   int64  
 1   Partner           7032 non-null   object 
 2   Dependents        7032 non-null   object 
 3   tenure            7032 non-null   int64  
 4   MultipleLines     7032 non-null   object 
 5   InternetService   7032 non-null   object 
 6   OnlineSecurity    7032 non-null   object 
 7   OnlineBackup      7032 non-null   object 
 8   DeviceProtection  7032 non-null   object 
 9   TechSupport       7032 non-null   object 
 10  StreamingTV       7032 non-null   object 
 11  StreamingMovies   7032 non-null   object 
 12  Contract          7032 non-null   object 
 13  PaperlessBilling  7032 non-null   object 
 14  PaymentMethod     7032 non-null   object 
 15  MonthlyCharges    7032 non-null   float64
 16  TotalCharges      7032 non-null   float64
 17  

In [17]:
#Label Encoding
df['Partner'] = df['Partner'].map({'Yes':1, 'No':0})

In [18]:
df['Dependents'] = df['Dependents'].map({'Yes':1, 'No':0})

In [19]:
df['MultipleLines'] = df['MultipleLines'].map({'Yes':1, 'No':0, 'No phone service': 2})

In [20]:
df['InternetService'] = df['InternetService'].map({'Fiber optic':1, 'No':0, 'DSL':2})

In [21]:
df['OnlineSecurity'] = df['OnlineSecurity'].map({'Yes':1, 'No':0, 'No internet service':2})

In [22]:
df['OnlineBackup'] = df['OnlineBackup'].map({'Yes':1, 'No':0, 'No internet service':2})

In [23]:
df['DeviceProtection'] = df['DeviceProtection'].map({'Yes':1, 'No':0, 'No internet service':2})

In [24]:
df['TechSupport'] = df['TechSupport'].map({'Yes':1, 'No':0, 'No internet service':2})

In [25]:
df['StreamingTV'] = df['StreamingTV'].map({'Yes':1, 'No':0, 'No internet service':2})

In [26]:
df['StreamingMovies'] = df['StreamingMovies'].map({'Yes':1, 'No':0,'No internet service':2})

In [27]:
df['Contract'] = df['Contract'].map({'Month-to-month':1, 'Two year':0, 'One year':2})

In [28]:
df['PaperlessBilling'] = df['PaperlessBilling'].map({'Yes':1, 'No':0})

In [29]:
df['PaymentMethod'] = df['PaymentMethod'].map({'Electronic check':1, 'Mailed check':0, 'Bank transfer (automatic)':2,'Credit card (automatic)':3})

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7032 non-null   int64  
 1   Partner           7032 non-null   int64  
 2   Dependents        7032 non-null   int64  
 3   tenure            7032 non-null   int64  
 4   MultipleLines     7032 non-null   int64  
 5   InternetService   7032 non-null   int64  
 6   OnlineSecurity    7032 non-null   int64  
 7   OnlineBackup      7032 non-null   int64  
 8   DeviceProtection  7032 non-null   int64  
 9   TechSupport       7032 non-null   int64  
 10  StreamingTV       7032 non-null   int64  
 11  StreamingMovies   7032 non-null   int64  
 12  Contract          7032 non-null   int64  
 13  PaperlessBilling  7032 non-null   int64  
 14  PaymentMethod     7032 non-null   int64  
 15  MonthlyCharges    7032 non-null   float64
 16  TotalCharges      7032 non-null   float64
 17  

In [31]:
df['Dependents'].value_counts()

Unnamed: 0_level_0,count
Dependents,Unnamed: 1_level_1
0,4933
1,2099


In [32]:
df['Churn'] = df['Churn'].map({'Yes':1, 'No':0})

In [33]:
df.head(2)

Unnamed: 0,SeniorCitizen,Partner,Dependents,tenure,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,1,0,1,2,2,0,1,0,0,0,0,1,1,1,29.85,29.85,0
1,0,0,0,34,0,2,1,0,1,0,0,0,2,0,0,56.95,1889.5,0


In [34]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   SeniorCitizen     7032 non-null   int64  
 1   Partner           7032 non-null   int64  
 2   Dependents        7032 non-null   int64  
 3   tenure            7032 non-null   int64  
 4   MultipleLines     7032 non-null   int64  
 5   InternetService   7032 non-null   int64  
 6   OnlineSecurity    7032 non-null   int64  
 7   OnlineBackup      7032 non-null   int64  
 8   DeviceProtection  7032 non-null   int64  
 9   TechSupport       7032 non-null   int64  
 10  StreamingTV       7032 non-null   int64  
 11  StreamingMovies   7032 non-null   int64  
 12  Contract          7032 non-null   int64  
 13  PaperlessBilling  7032 non-null   int64  
 14  PaymentMethod     7032 non-null   int64  
 15  MonthlyCharges    7032 non-null   float64
 16  TotalCharges      7032 non-null   float64
 17  

In [35]:
#Split the value
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [36]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size = 0.2, random_state = 42)

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
rf = RandomForestClassifier(random_state = 42) #Create randomforest object.
rf1 = rf.fit(x_train, y_train)


In [39]:
y_pred = rf1.predict(x_test)

In [40]:
from sklearn.metrics import accuracy_score
accuracy_score(y_pred, y_test)

0.7889125799573561

In [41]:
#Hyper Parameter Tuning

In [42]:
from sklearn.model_selection import RandomizedSearchCV

In [43]:
#number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
n_estimators

[200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]

In [44]:
#number of features (max num of inputs)
max_features = ['auto', 'sqrt'] #Check both auto and sqrt of number of features
max_features

['auto', 'sqrt']

In [45]:
#maximum levels/depth
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)

In [46]:
max_depth

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None]

In [47]:
#minimum number of samples required to split
min_samples_split = [2,5,10]


In [48]:
#Minimum sample at leaf node
min_samples_leaf = [1,2,4]

In [49]:
#Create a random grid
random_grid = {'n_estimators':n_estimators,
               'max_features':max_features,
               'max_depth':max_depth,
               'min_samples_split':min_samples_split,
               'min_samples_leaf':min_samples_leaf}

random_grid

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000],
 'max_features': ['auto', 'sqrt'],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_split': [2, 5, 10],
 'min_samples_leaf': [1, 2, 4]}

In [50]:
rf = RandomForestClassifier(random_state = 42)

In [51]:
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,
                         n_iter = 5, scoring = 'neg_mean_absolute_error', random_state = 42,
                               cv = 3, return_train_score = True)

#estimators --> algo using
#param_distribution --> Hyperparameters
#n_iter ---> Combination of values
#cv --> Cross-validation (result based on majority and generalization): Changes the subset of data and checks whether accuracy will remain the same or not.
#scoring ---> 'neg_mean_absolute_error' (Why neg? Because sklearn multiply with -1 internally and i have to make it positive thats why forcefully applying negative sign to it.)
#return_train_score --> Measure Training Accuracy (How: Error = Actual - Predicted)

In [52]:
rf2 = rf_random.fit(x_train, y_train)

In [53]:
#Giving the best paramenters in all.

rf2.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 50}

In [55]:
#Giving the best score.

rf2.best_score_

-0.19964444444444443

In [56]:
rf2.cv_results_

{'mean_fit_time': array([1.04246775e+00, 6.40153885e-04, 5.00440598e-04, 4.55220540e-04,
        4.47114309e-04]),
 'std_fit_time': array([1.33965012e-01, 3.95693445e-05, 2.39378269e-05, 1.82161087e-05,
        2.27972161e-05]),
 'mean_score_time': array([0.06197453, 0.        , 0.        , 0.        , 0.        ]),
 'std_score_time': array([0.00365697, 0.        , 0.        , 0.        , 0.        ]),
 'param_n_estimators': masked_array(data=[200, 1000, 200, 1200, 1800],
              mask=[False, False, False, False, False],
        fill_value=999999),
 'param_min_samples_split': masked_array(data=[10, 2, 10, 5, 5],
              mask=[False, False, False, False, False],
        fill_value=999999),
 'param_min_samples_leaf': masked_array(data=[2, 2, 2, 1, 1],
              mask=[False, False, False, False, False],
        fill_value=999999),
 'param_max_features': masked_array(data=['sqrt', 'auto', 'auto', 'auto', 'auto'],
              mask=[False, False, False, False, False],
     

In [57]:
from sklearn.metrics import accuracy_score

In [61]:
y_train_pred = rf2.predict(x_train)
y_test_pred = rf2.predict(x_test)

from sklearn.metrics import accuracy_score
print('Training Accuracy:',accuracy_score(y_train_pred, y_train))
print('Testing Accuracy:',accuracy_score(y_test_pred, y_test))

Training Accuracy: 0.8970666666666667
Testing Accuracy: 0.7995735607675906
