In [6]:
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,roc_curve

In [3]:
from matplotlib import pyplot as plt
%matplotlib inline

In [57]:
churnData = pd.read_csv('http://www.dataminingconsultant.com/data/churn.txt')

In [58]:
churnData['Churn'] = churnData['Churn?'].map(lambda x: 1 if x=='True.' else 0)

In [59]:
# remove columns that we won't use 
churnData1 = churnData.drop(['Area Code','Phone','State','Churn?'],axis=1)

In [60]:
# find out which columns are object type and convert them to dummy vars
list_ = list(churnData1.select_dtypes(include=['object']).columns)
churnData2 = pd.get_dummies(churnData1, prefix=list_)

In [61]:
churnData2.head(4)

Unnamed: 0,Account Length,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn,Int'l Plan_no,Int'l Plan_yes,VMail Plan_no,VMail Plan_yes
0,128,25,265.1,110,45.07,197.4,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0,1,0,0,1
1,107,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0,1,0,0,1
2,137,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0,1,0,1,0
3,84,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0,0,1,1,0


In [62]:
# merge with the other columns
list_nonObj = list(churnData.select_dtypes(exclude=['object']).columns)
churnData3 = churnData[list_nonObj]

In [63]:
inputData = pd.concat([churnData2,churnData3], axis = 1)

In [64]:
inputData.head()

Unnamed: 0,Account Length,VMail Message,Day Mins,Day Calls,Day Charge,Eve Mins,Eve Calls,Eve Charge,Night Mins,Night Calls,...,Eve Calls.1,Eve Charge.1,Night Mins.1,Night Calls.1,Night Charge,Intl Mins,Intl Calls,Intl Charge,CustServ Calls,Churn
0,128,25,265.1,110,45.07,197.4,99,16.78,244.7,91,...,99,16.78,244.7,91,11.01,10.0,3,2.7,1,0
1,107,26,161.6,123,27.47,195.5,103,16.62,254.4,103,...,103,16.62,254.4,103,11.45,13.7,3,3.7,1,0
2,137,0,243.4,114,41.38,121.2,110,10.3,162.6,104,...,110,10.3,162.6,104,7.32,12.2,5,3.29,0,0
3,84,0,299.4,71,50.9,61.9,88,5.26,196.9,89,...,88,5.26,196.9,89,8.86,6.6,7,1.78,2,0
4,75,0,166.7,113,28.34,148.3,122,12.61,186.9,121,...,122,12.61,186.9,121,8.41,10.1,3,2.73,3,0


In [65]:
# split the input data set into train and test 
churnData_train, churnData_test = train_test_split(inputData, test_size =0.3)

In [66]:
features = inputData.drop(['Churn'], axis=1).columns

In [67]:
rcClassifier = RandomForestClassifier(n_estimators=40)
rcClassifier.fit(churnData_train[features], churnData_train['Churn'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [69]:
# Make Predictions
predictions = rcClassifier.predict(churnData_test[features])
probabilities = rcClassifier.predict_proba(churnData_test[features])
display(predictions)

array([[ 1.,  1.],
       [ 0.,  0.],
       [ 0.,  0.],
       ..., 
       [ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])

In [70]:
score = rcClassifier.score(churnData_test[features],churnData_test['Churn'])
print "Accuracy : ", score

Accuracy :  0.958


In [76]:
featImport = zip(features, rcClassifier.feature_importances_)

In [81]:
feature_sorted = sorted(featImport, key=lambda x: x[1], reverse=True)

In [82]:
print feature_sorted

[('Night Calls', 0.049067698625923861), ('Intl Mins', 0.041630420853698183), ('Day Charge', 0.040237423402506342), ('Night Charge', 0.0395028813321937), ('Night Calls', 0.034099509088253786), ('Eve Mins', 0.031955059970856306), ('Night Mins', 0.029584587284064746), ("Int'l Plan_no", 0.024332283953951855), ('Night Mins', 0.021363757164939417), ('Intl Mins', 0.02107878773288805), ('Day Charge', 0.017982460218914368), ('Intl Charge', 0.015981229121046017), ('Eve Mins', 0.015835557940729033), ('Night Charge', 0.015308431785946795), ('Day Mins', 0.014372365801680911), ('Day Calls', 0.013879004522473487), ('CustServ Calls', 0.012534780624976155), ('Day Calls', 0.011747201134626741), ('Eve Calls', 0.011691039436195641), ('Day Mins', 0.01065633208296991), ('Eve Charge', 0.01024835338347015), ('Area Code', 0.0100411278064064), ('Intl Calls', 0.009410029587741401), ('CustServ Calls', 0.0092806856636186066), ('Eve Charge', 0.0085737466022840596), ('VMail Plan_no', 0.0083341548121900321), ('VMail 

In [121]:
# function that takes a input dataset without the churn column and returns predictions, probabilities
def predictCustChurn(x):
    x = pd.read_json(x,orient='split')
    print x
    pred = rcClassifier.predict(x)
    probs = rcClassifier.predict_proba(x)
    return {'pred':pred,'probs':probs}
    
    

In [122]:
retVal = predictCustChurn(churnData_test[features][:10].to_json(orient='split'))

      Account Length  Account Length  VMail Message  VMail Message  Day Mins  \
2376              42              42              0              0     303.9   
829              120             120              0              0     198.8   
1166              72              72              0              0     118.2   
3118             131             131             33             33     177.1   
225               65              65              0              0     213.4   
1478             118             118              0              0     253.2   
1697             132             132              0              0     169.9   
3182             109             109              0              0     180.0   
620              163             163              0              0     191.3   
20               147             147              0              0     155.1   

      Day Mins  Day Calls  Day Calls  Day Charge  Day Charge       ...        \
2376     303.9        106        106   

'{"columns":["Account Length","Account Length","VMail Message","VMail Message","Day Mins","Day Mins","Day Calls","Day Calls","Day Charge","Day Charge","Eve Mins","Eve Mins","Eve Calls","Eve Calls","Eve Charge","Eve Charge","Night Mins","Night Mins","Night Calls","Night Calls","Night Charge","Night Charge","Intl Mins","Intl Mins","Intl Calls","Intl Calls","Intl Charge","Intl Charge","CustServ Calls","CustServ Calls","Int\'l Plan_no","Int\'l Plan_yes","VMail Plan_no","VMail Plan_yes","Account Length","Account Length","Area Code","VMail Message","VMail Message","Day Mins","Day Mins","Day Calls","Day Calls","Day Charge","Day Charge","Eve Mins","Eve Mins","Eve Calls","Eve Calls","Eve Charge","Eve Charge","Night Mins","Night Mins","Night Calls","Night Calls","Night Charge","Night Charge","Intl Mins","Intl Mins","Intl Calls","Intl Calls","Intl Charge","Intl Charge","CustServ Calls","CustServ Calls"],"index":[2376,829,1166,3118,225,1478,1697,3182,620,20],"data":[[42,42,0,0,303.9,303.9,106,106,

AttributeError: 'list' object has no attribute 'tolist'