### importing packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

###  Exploring dataset

In [2]:
df = pd.read_csv("churn.csv")

In [3]:
df.shape

(3333, 21)

### columns

In [4]:
df.columns

Index(['State', 'Account Length', 'Area Code', 'Phone', 'Int'l Plan',
       'VMail Plan', 'VMail Message', 'Day Mins', 'Day Calls', 'Day Charge',
       'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 'Night Calls',
       'Night Charge', 'Intl Mins', 'Intl Calls', 'Intl Charge',
       'CustServ Calls', 'Churn?'],
      dtype='object')

### Describe

In [5]:
df.describe()

Unnamed: 0,Account Length,Area Code,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls
count,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0,3333.0
mean,101.064806,437.182418,8.09901,179.775098,100.435644,30.562307,200.980348,100.114311,17.08354,200.872037,100.107711,9.039325,10.237294,4.479448,2.764581,1.562856
std,39.822106,42.37129,13.688365,54.467389,20.069084,9.259435,50.713844,19.922625,4.310668,50.573847,19.568609,2.275873,2.79184,2.461214,0.753773,1.315491
min,1.0,408.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.2,33.0,1.04,0.0,0.0,0.0,0.0
25%,74.0,408.0,0.0,143.7,87.0,24.43,166.6,87.0,14.16,167.0,87.0,7.52,8.5,3.0,2.3,1.0
50%,101.0,415.0,0.0,179.4,101.0,30.5,201.4,100.0,17.12,201.2,100.0,9.05,10.3,4.0,2.78,1.0
75%,127.0,510.0,20.0,216.4,114.0,36.79,235.3,114.0,20.0,235.3,113.0,10.59,12.1,6.0,3.27,2.0
max,243.0,510.0,51.0,350.8,165.0,59.64,363.7,170.0,30.91,395.0,175.0,17.77,20.0,20.0,5.4,9.0


### first three rows

In [6]:
df.head(3)

Unnamed: 0,State,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,...,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?
0,KS,128,415,382-4657,no,yes,25,265.1,110,45.07,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,False.
1,OH,107,415,371-7191,no,yes,26,161.6,123,27.47,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,False.
2,NJ,137,415,358-1921,no,no,0,243.4,114,41.38,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,False.


In [7]:
# target column is "Churn?"
df["Churn?"]

0       False.
1       False.
2       False.
3       False.
4       False.
         ...  
3328    False.
3329    False.
3330    False.
3331    False.
3332    False.
Name: Churn?, Length: 3333, dtype: object

In [8]:
# categorical variables are 
#     ['State', 'Int'l Plan', 'VMail Plan', 'Churn?']

### Handling categorical variables

In [9]:
# One hot encoding for "state"
states = list(df["State"])
dummie_state = pd.get_dummies(df.State)
dummie_state

Unnamed: 0,AK,AL,AR,AZ,CA,CO,CT,DC,DE,FL,...,SD,TN,TX,UT,VA,VT,WA,WI,WV,WY
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3329,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3331,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
df = pd.concat([df, dummie_state], axis='columns')
df.drop(["State", states[-1]], axis=1, inplace=True)

In [11]:
df.head()

Unnamed: 0,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,...,SC,SD,TX,UT,VA,VT,WA,WI,WV,WY
0,128,415,382-4657,no,yes,25,265.1,110,45.07,197.4,...,0,0,0,0,0,0,0,0,0,0
1,107,415,371-7191,no,yes,26,161.6,123,27.47,195.5,...,0,0,0,0,0,0,0,0,0,0
2,137,415,358-1921,no,no,0,243.4,114,41.38,121.2,...,0,0,0,0,0,0,0,0,0,0
3,84,408,375-9999,yes,no,0,299.4,71,50.9,61.9,...,0,0,0,0,0,0,0,0,0,0
4,75,415,330-6626,yes,no,0,166.7,113,28.34,148.3,...,0,0,0,0,0,0,0,0,0,0


### Boolean conversion to 0 and 1 (int)

In [12]:
df["Int'l Plan"] = df["Int'l Plan"]=='yes'
df['VMail Plan'] = df['VMail Plan']=='yes' 
df['Churn?'] = df['Churn?']=='True.'

In [13]:
df["Int'l Plan"] = df["Int'l Plan"].astype(int)
df['VMail Plan'] = df['VMail Plan'].astype(int)
df['Churn?'] = df['Churn?'].astype(int)

In [14]:
df.iloc[:,:21]

Unnamed: 0,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,...,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?,AK
0,128,415,382-4657,0,1,25,265.1,110,45.07,197.4,...,16.78,244.7,91,11.01,10.0,3,2.70,1,0,0
1,107,415,371-7191,0,1,26,161.6,123,27.47,195.5,...,16.62,254.4,103,11.45,13.7,3,3.70,1,0,0
2,137,415,358-1921,0,0,0,243.4,114,41.38,121.2,...,10.30,162.6,104,7.32,12.2,5,3.29,0,0,0
3,84,408,375-9999,1,0,0,299.4,71,50.90,61.9,...,5.26,196.9,89,8.86,6.6,7,1.78,2,0,0
4,75,415,330-6626,1,0,0,166.7,113,28.34,148.3,...,12.61,186.9,121,8.41,10.1,3,2.73,3,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,192,415,414-4276,0,1,36,156.2,77,26.55,215.5,...,18.32,279.1,83,12.56,9.9,6,2.67,2,0,0
3329,68,415,370-3271,0,0,0,231.1,57,39.29,153.4,...,13.04,191.3,123,8.61,9.6,4,2.59,3,0,0
3330,28,510,328-8230,0,0,0,180.8,109,30.74,288.8,...,24.55,191.9,91,8.64,14.1,6,3.81,2,0,0
3331,184,510,364-6381,1,0,0,213.8,105,36.35,159.6,...,13.57,139.2,137,6.26,5.0,10,1.35,2,0,0


###  missing values

In [15]:
df.isna()

Unnamed: 0,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,...,SC,SD,TX,UT,VA,VT,WA,WI,WV,WY
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3328,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3329,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3330,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3331,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### column wise missing values

In [16]:
df.isna().sum(axis=0)

Account Length    0
Area Code         0
Phone             0
Int'l Plan        0
VMail Plan        0
                 ..
VT                0
WA                0
WI                0
WV                0
WY                0
Length: 70, dtype: int64

### overall missing values

In [17]:
df.isna().sum().sum()

0

We don't have any missing values in our dataset

### Standardization

In [18]:
# Standardization required by columns

cols = ['VMail Message', 'Day Mins', 'Day Calls', 'Day Charge', 
        'Eve Mins', 'Eve Calls', 'Eve Charge', 'Night Mins', 
        'Night Calls', 'Night Charge', 'Intl Mins', 'Intl Calls',
        'Intl Charge']

In [19]:
for col in cols:
    max_ = df[col].max()
    df[col] = df[col] / max_

In [20]:
df.iloc[:5,:21]

Unnamed: 0,Account Length,Area Code,Phone,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,...,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn?,AK
0,128,415,382-4657,0,1,0.490196,0.755701,0.666667,0.755701,0.542755,...,0.542866,0.619494,0.52,0.619584,0.5,0.15,0.5,1,0,0
1,107,415,371-7191,0,1,0.509804,0.460661,0.745455,0.460597,0.537531,...,0.53769,0.644051,0.588571,0.644344,0.685,0.15,0.685185,1,0,0
2,137,415,358-1921,0,0,0.0,0.693843,0.690909,0.69383,0.333242,...,0.333225,0.411646,0.594286,0.41193,0.61,0.25,0.609259,0,0,0
3,84,408,375-9999,1,0,0.0,0.853478,0.430303,0.853454,0.170195,...,0.170171,0.498481,0.508571,0.498593,0.33,0.35,0.32963,2,0,0
4,75,415,330-6626,1,0,0.0,0.4752,0.684848,0.475184,0.407754,...,0.407959,0.473165,0.691429,0.47327,0.505,0.15,0.505556,3,0,0


In [21]:
# columns to be dropped before training
dcols = ['Account Length', 'Phone', 'Churn?']
X = df.drop(dcols, axis=1)
X.head()

Unnamed: 0,Area Code,Int'l Plan,VMail Plan,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,...,SC,SD,TX,UT,VA,VT,WA,WI,WV,WY
0,415,0,1,0.490196,0.755701,0.666667,0.755701,0.542755,0.582353,0.542866,...,0,0,0,0,0,0,0,0,0,0
1,415,0,1,0.509804,0.460661,0.745455,0.460597,0.537531,0.605882,0.53769,...,0,0,0,0,0,0,0,0,0,0
2,415,0,0,0.0,0.693843,0.690909,0.69383,0.333242,0.647059,0.333225,...,0,0,0,0,0,0,0,0,0,0
3,408,1,0,0.0,0.853478,0.430303,0.853454,0.170195,0.517647,0.170171,...,0,0,0,0,0,0,0,0,0,0
4,415,1,0,0.0,0.4752,0.684848,0.475184,0.407754,0.717647,0.407959,...,0,0,0,0,0,0,0,0,0,0


In [22]:
y = df['Churn?']

### train_test split

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### SVM model

In [24]:
seed = 7

In [25]:
svc_model = SVC(kernel='linear', C=0.025, random_state=seed) 

In [26]:
svc_model.fit(X_train, y_train)

SVC(C=0.025, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=-1, probability=False, random_state=7, shrinking=True, tol=0.001,
    verbose=False)

In [27]:
svc_y_predicted = svc_model.predict(X_test)

In [28]:
svc_model.score(X_test, y_test)

0.8537170263788969

### Decision Tree Classifier

In [29]:
dtc_model = DecisionTreeClassifier(random_state=seed)
dtc_model.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=7, splitter='best')

In [30]:
y_predicted_dtc = dtc_model.predict(X_test)

In [31]:
dtc_model.score(X_test, y_test)

0.9232613908872902

### Random Forest Classifier

In [32]:
RFC = RandomForestClassifier(max_depth=5, n_estimators=10, max_features=10, random_state=seed)

In [33]:
RFC.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features=10,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=7, verbose=0,
                       warm_start=False)

In [34]:
y_predicted_rfc = RFC.predict(X_test)

In [35]:
RFC.score(X_test, y_test)

0.8836930455635491