In [18]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import tree
from inspect import getmembers
#from StringIO import StringIO
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.stats.stats import pearsonr  
import seaborn as sns
import pylab as pyl
from sklearn.externals.six import StringIO
import os
from IPython.display import Image


getDataset = pd.read_csv("D:\\Advanced Data Science\\Assignment1\\Churn.csv")
# print (getDataset.head())
# print (getDataset.describe())

D = {'very_unsat': 0, 'unsat': 1, 'avg': 2, 'sat': 3, 'very_sat': 4, 'very_little': 0, 'little': 1, 'high': 3, 'very_high': 4, 'actively_looking_into_it': 0, 
     'considering': 1, 'never_thought': 2, 'no': 3, 'perhaps': 4,  'zero': 0, 'one': 1,  'STAY': 0, 'LEAVE': 1}

# Getting the values from input dataset and assigning the numbers to string values
S = getDataset.applymap(lambda x:D.get(x) if x in D else x)

features = S[["COLLEGE","INCOME","OVERAGE","LEFTOVER","HOUSE","HANDSET_PRICE","OVER_15MINS_CALLS_PER_MONTH","AVERAGE_CALL_DURATION",
                      "REPORTED_SATISFACTION","REPORTED_USAGE_LEVEL","CONSIDERING_CHANGE_OF_PLAN"]]
# Setting the target variable
targetVarible = S.LEAVE

# Initiating Dataframe
df = pd.DataFrame(S)

# Calculating the Correlation Coefficient
corrList = []
for i in range(0,11):
    corr = np.corrcoef(df.iloc[:,i],targetVarible)
    corrList.append(corr.item(1))
print("Correlation Coefficient are:",corrList[0:10])

# Plotting bar graph for Correlation Coefficient values
listX = [1,2,3,4,5,6,7,8,9,10,11]
plt.barh(listX,corrList[:11],align='center')
plt.yticks(listX, list(df.columns.values)[0:11])
plt.tight_layout()
plt.show()

# Separating 70% of the data for training and remaining 30% for testing
featureTrain, featureTest, targetTrain, targetTest = train_test_split(features, targetVarible, test_size=.30)

# Creating Decision tree with entropy as criteria.
model = tree.DecisionTreeClassifier(criterion="entropy", max_depth=5, min_samples_leaf=5)
fittedModel = model.fit(featureTrain, targetTrain)   #This will have 70 percent of training data fed with all columns other than target fed as first parameter and target fed as second parameter

# Getting the decision tree
dot_data = StringIO()  
with open("tree.dot", 'w') as dt:
  dt = tree.export_graphviz(fittedModel, out_file=dt, feature_names=list(df.columns.values)) 

# Getting the predictions for training data and finding accuracy of them
predictions_for_train = fittedModel.predict(featureTrain) 
print ("Accuracy of Training data is:",accuracy_score(targetTrain, predictions_for_train))

# Getting the predictions for testing data and finding accuracy of them
predictions = fittedModel.predict(featureTest)  #This will predict the test data that has column other than target

# Getting the confusion matrix and accuracy of training data
print ("Confusion Matrix for Testing data is:",confusion_matrix(targetTest, predictions))
print ("Accuracy of Testing data is:",accuracy_score(targetTest, predictions))



Correlation Coefficient are: [0.01527699095730025, 0.0883277396640967, 0.2333335724846086, 0.06352555428621241, -0.2123675943089142, 0.08809820019342364, 0.20347804628946436, -0.014822814034809808, -0.013532107820768103, -0.008221253246693117]
Accuracy of Training data is: 0.704591836735
Confusion Matrix for Testing data is: [[1488  607]
 [ 648 1457]]
Accuracy of Testing data is: 0.70119047619


In [22]:
# Information gain calculation
def entropy(p_):
  p = p_.copy()
  p[p != 0] = - p[p != 0] * np.log2(p[p != 0] )
  return p.sum()

def information_gain(data):
    entropy_smaller_val = entropy(data[0]/data[0].sum())
    entropy_bigger_val = entropy(data[1]/data[1].sum())
    entropy_before_split = entropy(data.sum(axis=0)/data.sum(axis=0).sum())
    weights = data.sum(axis=1)/data.sum()
    entropy_after_split = weights[0] * entropy_smaller_val + weights[1] * entropy_bigger_val
    return entropy_before_split - entropy_after_split

gainList = []
for i in range(0,11):
    inputdata = np.array([df.iloc[:,i],targetVarible])
    IG = information_gain(inputdata)
    #print(IG)
    #print ("Information Gain:", information_gain(data))
    gainList.append(IG)

# Plotting the graph for Information Gain
listXAxis = [1,2,3,4,5,6,7,8,9,10,11]

plt.barh(listX,gainList[:11],align='center')
plt.yticks(listX, list(df.columns.values)[0:11])
plt.tight_layout()
plt.show()
