In [1]:
import numpy as np
from sklearn import model_selection
from sklearn import tree
from sklearn import metrics
from sklearn import ensemble
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
import pandas as pd

In [2]:
df = pd.read_csv("churn.csv")
df.head()


Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,No,128,no,yes,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,No,107,no,yes,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,No,137,no,no,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,No,84,yes,no,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,No,75,yes,no,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [3]:
df.shape

(5000, 18)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 18 columns):
churn                         5000 non-null object
accountlength                 5000 non-null int64
internationalplan             5000 non-null object
voicemailplan                 5000 non-null object
numbervmailmessages           5000 non-null int64
totaldayminutes               5000 non-null float64
totaldaycalls                 5000 non-null int64
totaldaycharge                5000 non-null float64
totaleveminutes               5000 non-null float64
totalevecalls                 5000 non-null int64
totalevecharge                5000 non-null float64
totalnightminutes             5000 non-null float64
totalnightcalls               5000 non-null int64
totalnightcharge              5000 non-null float64
totalintlminutes              5000 non-null float64
totalintlcalls                5000 non-null int64
totalintlcharge               5000 non-null float64
numbercustomerservicecal

In [5]:
# for decision trees we can use label encodig (for categories)

In [6]:
columnsToEncode = list(df.select_dtypes(include=['category','object']))
columnsToEncode

['churn', 'internationalplan', 'voicemailplan']

In [7]:
columnsToEncode = list(df.select_dtypes(include=['category','object']))
le = LabelEncoder()
for feature in columnsToEncode:
    try:
        df[feature] = le.fit_transform(df[feature])
    except:
        print('Error encoding ' + feature)
df.head()

Unnamed: 0,churn,accountlength,internationalplan,voicemailplan,numbervmailmessages,totaldayminutes,totaldaycalls,totaldaycharge,totaleveminutes,totalevecalls,totalevecharge,totalnightminutes,totalnightcalls,totalnightcharge,totalintlminutes,totalintlcalls,totalintlcharge,numbercustomerservicecalls
0,0,128,0,1,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1
1,0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
2,0,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
3,0,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
4,0,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3


In [8]:
X = df.iloc[:,1:]
# y = 2*df.iloc[:,0]-1
y = df.iloc[:,0]


In [9]:
y

0       0
1       0
2       0
3       0
4       0
       ..
4995    0
4996    1
4997    0
4998    0
4999    0
Name: churn, Length: 5000, dtype: int32

In [10]:
X=X.values

In [11]:
y=y.values

In [12]:
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X, y)
scores = cross_val_score(clf, X, y)
scores.mean()



0.917998633166524

In [13]:
########################################
# is the "fit" necessary?
# examine the scores
# calculate cv score for the recall and precision
# calculation classification report

In [14]:
scores

array([0.92021596, 0.92261548, 0.91116447])

In [15]:
scores.std()

0.004930774584741113

In [16]:
cross_val_score(clf, X, y, scoring='recall')



array([0.74152542, 0.74576271, 0.69787234])

In [17]:
cross_val_score(clf, X, y, scoring='precision')



array([0.70850202, 0.71836735, 0.68049793])

In [18]:
cross_val_score(clf, X, y, scoring='f1')



array([0.72463768, 0.73180873, 0.68907563])

In [19]:
from sklearn.metrics import classification_report

In [20]:
print(classification_report(y, clf.predict(X)))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      4293
           1       1.00      1.00      1.00       707

    accuracy                           1.00      5000
   macro avg       1.00      1.00      1.00      5000
weighted avg       1.00      1.00      1.00      5000



In [21]:
######################################################

In [22]:
clf = AdaBoostClassifier(tree.DecisionTreeClassifier(random_state=0))
clf.fit(X, y)
scores = cross_val_score(clf, X, y)
scores.mean()



0.9169988331265321

In [23]:
##############################################################
# DO SAME PROCESS BUT WITH SPLITTING X,Y TO TRAIN/TEST - for DT classifier
# cv - will be used for on-going development, parameters tuning, etc.. - play with DT params
# test set - will be used for final evaluation
#

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [26]:
clf = tree.DecisionTreeClassifier(random_state=0)
clf.fit(X_train, y_train)
scores = cross_val_score(clf, X_train, y_train)
scores.mean()



0.9128377662902745

In [27]:
print(classification_report(y_test, clf.predict(X_test)))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1423
           1       0.70      0.71      0.70       227

    accuracy                           0.92      1650
   macro avg       0.83      0.83      0.83      1650
weighted avg       0.92      0.92      0.92      1650



In [28]:
scores

array([0.92121755, 0.89794091, 0.91935484])