In [1]:
import numpy as np
import pickle
import pandas as pd
from scipy.stats import randint
from pandas.plotting import scatter_matrix
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
from pylab import rcParams
import seaborn as sb
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.regularizers import L2
from tensorflow.keras.optimizers import Adam
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter

In [3]:
df = pd.read_csv('Telco-New.csv')

In [4]:
df['MonthlyCharges'] = pd.to_numeric(df['MonthlyCharges'], errors='coerce')
df['TotalCharges'] = pd.to_numeric(df['TotalCharges'], errors='coerce')

In [5]:
df = df.drop(df.columns[0], axis=1)

In [6]:
#We want to replace these to just a "No" So that we can reduce all "No" to change them to 0
df.replace('No internet service','No',inplace=True)
df.replace('No phone service','No',inplace=True)
df['gender'].replace({'Female':1,'Male':0},inplace=True)

In [7]:
#converting all the yes and no to a one or zero
def convert_cat(df):
    cols = ['Partner','Dependents','PhoneService','MultipleLines','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','PaperlessBilling','Churn']
    for col in cols:
        df[col].replace({'Yes': 1,'No': 0},inplace=True)
        
convert_cat(df)
    

In [8]:
#Pandas does not have the spread function so it has the pivot and one-hot-encoding which is easy to use
df = pd.get_dummies(data=df, columns=['InternetService','Contract','PaymentMethod'])


In [9]:
df.head(10)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,OnlineSecurity,OnlineBackup,DeviceProtection,...,InternetService_DSL,InternetService_Fiber optic,InternetService_No,Contract_Month-to-month,Contract_One year,Contract_Two year,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,1,0,1,0,1,0,0,0,1,0,...,1,0,0,1,0,0,0,0,1,0
1,0,0,0,0,34,1,0,1,0,1,...,1,0,0,0,1,0,0,0,0,1
2,0,0,0,0,2,1,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,0,0,0,0,45,0,0,1,0,1,...,1,0,0,0,1,0,1,0,0,0
4,1,0,0,0,2,1,0,0,0,0,...,0,1,0,1,0,0,0,0,1,0
5,1,0,0,0,8,1,1,0,0,1,...,0,1,0,1,0,0,0,0,1,0
6,0,0,0,1,22,1,1,0,1,0,...,0,1,0,1,0,0,0,1,0,0
7,1,0,0,0,10,0,0,1,0,0,...,1,0,0,1,0,0,0,0,0,1
8,1,0,1,0,28,1,1,0,0,1,...,0,1,0,1,0,0,0,0,1,0
9,0,0,0,1,62,1,0,1,1,0,...,1,0,0,0,1,0,1,0,0,0


In [11]:
df.isnull().sum()

gender                                      0
SeniorCitizen                               0
Partner                                     0
Dependents                                  0
tenure                                      0
PhoneService                                0
MultipleLines                               0
OnlineSecurity                              0
OnlineBackup                                0
DeviceProtection                            0
TechSupport                                 0
StreamingTV                                 0
StreamingMovies                             0
PaperlessBilling                            0
MonthlyCharges                              0
TotalCharges                               11
Churn                                       0
InternetService_DSL                         0
InternetService_Fiber optic                 0
InternetService_No                          0
Contract_Month-to-month                     0
Contract_One year                 

In [12]:
df.dropna(inplace=True)

In [13]:
#setting the data into independent and dependent variables
#independent variable - > all the features except the churn column
x = df.drop(['Churn'], axis=1)

#dependent variable - > churn column our target variable
y = df['Churn']


In [14]:
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2, random_state = 42)

In [15]:
smt = SMOTE(random_state=42)
x_train_sm, y_train_sm = smt.fit_resample(X_train, y_train)

print('Train data shape: {}'.format(x_train_sm.shape))
print('Test data shape: {}'.format(X_test.shape))
print('Original dataset shape %s' % Counter(y))

Train data shape: (8260, 26)
Test data shape: (1407, 26)
Original dataset shape Counter({0: 5163, 1: 1869})


In [26]:
rf_clf = RandomForestClassifier(criterion = 'gini', n_estimators=130, max_depth = 8, max_features="auto",random_state=0, min_samples_leaf = 4, min_samples_split = 5, bootstrap = True )
rf_clf.fit(x_train_sm, y_train_sm)
y_pred = rf_clf.predict(X_test)

In [27]:
y_pred[:10]

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0], dtype=int64)

In [19]:
y_test[:10]

2481    0
6784    0
6125    1
3052    0
4099    0
3223    0
3774    0
3469    0
3420    0
1196    0
Name: Churn, dtype: int64

In [28]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.78      0.88      0.82       912
           1       0.70      0.53      0.61       495

    accuracy                           0.76      1407
   macro avg       0.74      0.70      0.71      1407
weighted avg       0.75      0.76      0.75      1407



In [29]:
pickle.dump(rf_clf, open('rf_clf_telco.pkl', 'wb'))
rf_columns_telco = list(x.columns)
pickle.dump(rf_columns_telco, open('rf_columns_telco.pkl', 'wb'))