In [1]:
import pandas as pd

In [2]:
ch = pd.read_csv('Churn.csv')

In [3]:
ch.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [4]:
ch.isnull().sum()[ch.isnull().sum()>0]

Series([], dtype: int64)

In [5]:
ch = ch.drop(['customerID'],axis=1)

In [6]:
ch.select_dtypes(include='object').columns

Index(['gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines',
       'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'PaperlessBilling', 'PaymentMethod', 'TotalCharges', 'Churn'],
      dtype='object')

### Label Encode

In [7]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [8]:
ch[ch.select_dtypes(include='object').columns] = ch[ch.select_dtypes(include='object').columns].apply(le.fit_transform)

In [9]:
ch.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,0,0,1,0,1,0,1,0,0,2,0,0,0,0,0,1,2,29.85,2505,0
1,1,0,0,0,34,1,0,0,2,0,2,0,0,0,1,0,3,56.95,1466,0
2,1,0,0,0,2,1,0,0,2,2,0,0,0,0,0,1,3,53.85,157,1
3,1,0,0,0,45,0,1,0,2,0,2,2,0,0,1,0,0,42.3,1400,0
4,0,0,0,0,2,1,0,1,0,0,0,0,0,0,0,1,2,70.7,925,1


### Sampling

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
train_ch,test_ch = train_test_split(ch,test_size=0.2)

In [12]:
print(ch.shape,"  ",train_ch.shape,"  ",test_ch.shape)

# data distributed in train and test...original_size...train_size...test_size

(7043, 20)    (5634, 20)    (1409, 20)


In [13]:
train_ch_x = train_ch.iloc[ : , : -1]
train_ch_y = train_ch.iloc[ : ,-1]

In [14]:
test_ch_x = test_ch.iloc[ : , :-1]
test_ch_y = test_ch.iloc[ : , -1]

### Model Building

In [23]:
from sklearn.tree import DecisionTreeClassifier

In [39]:
dt = DecisionTreeClassifier()

In [40]:
dt.fit(train_ch_x,train_ch_y)

DecisionTreeClassifier()

### Prediction

In [41]:
dt_pred = dt.predict(test_ch_x)
dt_pred

array([0, 0, 0, ..., 0, 1, 0])

### Confusion Matrix

In [42]:
from sklearn.metrics import confusion_matrix

In [43]:
tab = confusion_matrix(test_ch_y,dt_pred)
tab

array([[821, 208],
       [186, 194]], dtype=int64)

### Accuracy

In [35]:
Accuracy = tab.diagonal().sum()*100/tab.sum()
Accuracy

71.46912704045423

### Important Features

In [44]:
dt.feature_importances_

array([0.02001528, 0.01751342, 0.02427362, 0.02131704, 0.13104888,
       0.0035786 , 0.01440336, 0.02715303, 0.04712257, 0.01231017,
       0.01826797, 0.01330752, 0.01373897, 0.00905476, 0.16751839,
       0.02396553, 0.04602   , 0.17726051, 0.21213037])

In [45]:
train_ch_x.columns

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'MonthlyCharges', 'TotalCharges'],
      dtype='object')

In [46]:
feat_imp_df = pd.DataFrame()

In [47]:
feat_imp_df['Features'] = train_ch_x.columns
feat_imp_df['Imp'] = dt.feature_importances_

In [49]:
feat_imp_df.sort_values(['Imp'],ascending=False)

Unnamed: 0,Features,Imp
18,TotalCharges,0.21213
17,MonthlyCharges,0.177261
14,Contract,0.167518
4,tenure,0.131049
8,OnlineSecurity,0.047123
16,PaymentMethod,0.04602
7,InternetService,0.027153
2,Partner,0.024274
15,PaperlessBilling,0.023966
3,Dependents,0.021317
