### Importing all the required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import numpy as np 

seed=7 #To generate same sequence of random numbers

#### Preprocessing Libraries

In [2]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split


#### Classifiers

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import svm

#### Evaluating Libraries

In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

### To read Data File

In [5]:
#To read csv file
churn = pd.read_csv('/kaggle/input/churn-in-telecoms-dataset/bigml_59c28831336c6604c800002a.csv', sep=',')


### Basic Data Analysis

In [6]:
data_size=churn.shape
print(data_size)

churn_col_names=list(churn.columns)
print(churn_col_names)

print(churn.describe())
print(churn.head(2))

#Identifying the outcome/target variable.
#Churn = True. means customer will churn. 
churn_target=churn['churn'] 
print(churn_target.unique())


(3333, 21)
['state', 'account length', 'area code', 'phone number', 'international plan', 'voice mail plan', 'number vmail messages', 'total day minutes', 'total day calls', 'total day charge', 'total eve minutes', 'total eve calls', 'total eve charge', 'total night minutes', 'total night calls', 'total night charge', 'total intl minutes', 'total intl calls', 'total intl charge', 'customer service calls', 'churn']
       account length    area code  number vmail messages  total day minutes  \
count     3333.000000  3333.000000            3333.000000        3333.000000   
mean       101.064806   437.182418               8.099010         179.775098   
std         39.822106    42.371290              13.688365          54.467389   
min          1.000000   408.000000               0.000000           0.000000   
25%         74.000000   408.000000               0.000000         143.700000   
50%        101.000000   415.000000               0.000000         179.400000   
75%        127.000000 

### Dropping the target Column

In [7]:
#Phone number : unique number (might not influence prediction)
#churn : target variable (not required in feature set)
cols_to_drop = ['phone number','churn']
#axis=1 depicts drop along columns
churn_feature = churn.drop(cols_to_drop,axis=1)
print(churn_feature.head(2))


  state  account length  area code international plan voice mail plan  \
0    KS             128        415                 no             yes   
1    OH             107        415                 no             yes   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   

   total night minutes  total night calls  total night charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   

   total intl minutes  total intl calls  total intl charge  \
0                10.0                 3                2.7   
1                13.7              

## Data Preprocessing

#### Coverting Yes/No to Boolean values

In [8]:
#Changing the 'yes or no' values to boolean
yes_no_cols = ["international plan","voice mail plan"]
churn_feature[yes_no_cols] = churn_feature[yes_no_cols] == 'yes'
print(churn_feature.head(2))


  state  account length  area code  international plan  voice mail plan  \
0    KS             128        415               False             True   
1    OH             107        415               False             True   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   

   total night minutes  total night calls  total night charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   

   total intl minutes  total intl calls  total intl charge  \
0                10.0                 3                2.7   
1                13.7        

#### Label Encoding

In [9]:
label_encoder = preprocessing.LabelEncoder()
churn_feature['area code'] = label_encoder.fit_transform(churn_feature['area code'])
print(churn_feature.head(2))


  state  account length  area code  international plan  voice mail plan  \
0    KS             128          1               False             True   
1    OH             107          1               False             True   

   number vmail messages  total day minutes  total day calls  \
0                     25              265.1              110   
1                     26              161.6              123   

   total day charge  total eve minutes  total eve calls  total eve charge  \
0             45.07              197.4               99             16.78   
1             27.47              195.5              103             16.62   

   total night minutes  total night calls  total night charge  \
0                244.7                 91               11.01   
1                254.4                103               11.45   

   total intl minutes  total intl calls  total intl charge  \
0                10.0                 3                2.7   
1                13.7        

#### One-hot Encoding

In [10]:
print('Churn data size before one hot encoding',churn_feature.shape)
print('No of unique states',len(churn_feature['state'].unique()))
#Give the feature and columns to one hot encode in 'columns' and column rename prefix in 'prefix'
churn_dumm=pd.get_dummies(churn_feature, columns=["state"], prefix=["state"])
print('Churn data size after one hot encoding',churn_dumm.shape)
#converting to numpy matrix
churn_matrix = churn_dumm.values.astype(np.float64)


Churn data size before one hot encoding (3333, 19)
No of unique states 51
Churn data size after one hot encoding (3333, 69)


#### Handling missing values

In [11]:
#Missing values replaced by mean
imp=SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=None,verbose=0,copy=True)
#Fit to data, then transform it.
churn_matrix=imp.fit_transform(churn_matrix)

#### Scaling the Data

In [12]:
#Standardize the data by removing the mean and scaling to unit variance
scaler = StandardScaler()
#Fit to data, then transform it.
churn_matrix = scaler.fit_transform(churn_matrix)


#### Splitting the data for training and testing(90% train,10% test)

In [13]:

train_data,test_data, train_label, test_label = train_test_split(churn_matrix, churn_target, test_size=.1,random_state=seed)
target_names = ['False.', 'True.']


## Performing Classification

#### Decision Tree Classifier

In [14]:
classifier=DecisionTreeClassifier(random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target=classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Decision Tree Classifier : ',score)
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


Decision Tree Classifier :  0.9101796407185628
Confusion Matrix [[265  18]
 [ 12  39]]
              precision    recall  f1-score   support

      False.       0.96      0.94      0.95       283
       True.       0.68      0.76      0.72        51

    accuracy                           0.91       334
   macro avg       0.82      0.85      0.83       334
weighted avg       0.92      0.91      0.91       334



#### Naive Bayes Classifier

In [15]:

classifier=GaussianNB()
classifier = classifier.fit(train_data, train_label)
churn_predicted_target=classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Naive Bayes : ',score)
print('Accuracy Score',accuracy_score(test_label,churn_predicted_target))  
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


Naive Bayes :  0.5748502994011976
Accuracy Score 0.5748502994011976
Confusion Matrix [[169 114]
 [ 28  23]]
              precision    recall  f1-score   support

      False.       0.86      0.60      0.70       283
       True.       0.17      0.45      0.24        51

    accuracy                           0.57       334
   macro avg       0.51      0.52      0.47       334
weighted avg       0.75      0.57      0.63       334



#### Stochastic Gradient Descent Classifier

In [16]:

classifier =  SGDClassifier(loss='modified_huber', shuffle=True,random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target=classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SGD classifier : ',score)
print('Accuracy Score',accuracy_score(test_label,churn_predicted_target))  
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


SGD classifier :  0.781437125748503
Accuracy Score 0.781437125748503
Confusion Matrix [[243  40]
 [ 33  18]]
              precision    recall  f1-score   support

      False.       0.88      0.86      0.87       283
       True.       0.31      0.35      0.33        51

    accuracy                           0.78       334
   macro avg       0.60      0.61      0.60       334
weighted avg       0.79      0.78      0.79       334



#### Support Vector Machine Classifier

In [17]:
classifier = SVC(kernel="linear", C=0.025,random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target=classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('SVM Classifier : ',score)
print('Accuracy Score',accuracy_score(test_label,churn_predicted_target))  
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


SVM Classifier :  0.8473053892215568
Accuracy Score 0.8473053892215568
Confusion Matrix [[283   0]
 [ 51   0]]
              precision    recall  f1-score   support

      False.       0.85      1.00      0.92       283
       True.       0.00      0.00      0.00        51

    accuracy                           0.85       334
   macro avg       0.42      0.50      0.46       334
weighted avg       0.72      0.85      0.78       334



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Random Forest Classifier

In [18]:
classifier = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10,random_state=seed)
classifier = classifier.fit(train_data, train_label)
churn_predicted_target=classifier.predict(test_data)
score = classifier.score(test_data, test_label)
print('Random Forest Classifier : ',score)
print('Accuracy Score',accuracy_score(test_label,churn_predicted_target))  
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


Random Forest Classifier :  0.8682634730538922
Accuracy Score 0.8682634730538922
Confusion Matrix [[282   1]
 [ 43   8]]
              precision    recall  f1-score   support

      False.       0.87      1.00      0.93       283
       True.       0.89      0.16      0.27        51

    accuracy                           0.87       334
   macro avg       0.88      0.58      0.60       334
weighted avg       0.87      0.87      0.83       334



#### Further tunning the parameters of Random Forest Classifier

In [19]:
classifier = RandomForestClassifier(max_depth=5, n_estimators=15, max_features=60,random_state=seed)
classifier = classifier.fit(train_data, train_label)
score=classifier.score(test_data, test_label)
print('Random Forest classification after model tuning',score)
print('Accuracy Score',accuracy_score(test_label,churn_predicted_target))  
print('Confusion Matrix',confusion_matrix(test_label,churn_predicted_target))
print(classification_report(test_label, churn_predicted_target, target_names=target_names))


Random Forest classification after model tuning 0.937125748502994
Accuracy Score 0.8682634730538922
Confusion Matrix [[282   1]
 [ 43   8]]
              precision    recall  f1-score   support

      False.       0.87      1.00      0.93       283
       True.       0.89      0.16      0.27        51

    accuracy                           0.87       334
   macro avg       0.88      0.58      0.60       334
weighted avg       0.87      0.87      0.83       334



### Using StratifiedShuffleSplit would suit our case study as the dataset has a class imbalance


In [20]:
sss = StratifiedShuffleSplit(n_splits=1,test_size=0.1, random_state=7)
sss.get_n_splits(churn_matrix,churn_target)
print(sss)


StratifiedShuffleSplit(n_splits=1, random_state=7, test_size=0.1,
            train_size=None)


In [21]:
classifiers = [
    DecisionTreeClassifier(),
    GaussianNB(),
    SGDClassifier(loss='modified_huber', shuffle=True),
    SVC(kernel="linear", C=0.025),
    KNeighborsClassifier(),
    OneVsRestClassifier(svm.LinearSVC()),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10),
    AdaBoostClassifier(),
   ]
for clf in classifiers:
    score=0
    for train_index, test_index in sss.split(churn_matrix,churn_target):
        X_train, X_test = churn_matrix[train_index], churn_matrix[test_index]
        y_train, y_test = churn_target[train_index], churn_target[test_index]
        clf.fit(X_train, y_train)
        score=score+clf.score(X_test, y_test)
    print(score)


0.9011976047904192
0.6137724550898204
0.8173652694610778
0.8562874251497006
0.8652694610778443




0.8652694610778443
0.8892215568862275
0.8832335329341318
