In [505]:
import pandas as pd
import numpy as np
from collections import Counter
from math import sqrt
import random
import warnings


In [506]:
# Column names for the header-less training file; 'Label' (last column) is the class.
hd = [
    'Type', 'LifeStyle', 'Vacation', 'eCredit',
    'Salary', 'Property', 'Label',
]

# Load the training data into the frame `df`, attaching the names above.
df = pd.read_csv('train.txt', names=hd, header=None)


# Preview the first five rows.
df.head()



Unnamed: 0,Type,LifeStyle,Vacation,eCredit,Salary,Property,Label
0,student,spend>saving,6,40,13.62,3.2804,C1
1,student,spend>saving,11,21,15.32,2.0232,C1
2,student,spend>saving,7,64,16.55,3.1202,C1
3,student,spend>saving,3,47,15.71,3.4022,C1
4,student,spend>saving,15,10,16.96,2.2825,C1


In [507]:
# Fraction of the training data held out for validation (20%).
test_size=0.2 


# Convert the frame to a list of rows, the representation the tree-building code works on.
dataframesplit= df.values.tolist()
for row in dataframesplit:
    # Columns 2 and 3 ('Vacation', 'eCredit') are cast to float so that every
    # numeric feature is split with ">=" rather than an equality test.
    row[2]=float(row[2])
    row[3]=float(row[3])


# Shuffle in place so the hold-out rows differ between runs.
# NOTE: the shuffle is unseeded, so the reported accuracies are not reproducible run to run.
random.shuffle(dataframesplit)

# Compute the split point explicitly. Slicing with a negative count breaks when
# the count is 0: rows[:-0] is empty and rows[-0:] is the whole list.
n_holdout = int(test_size*len(dataframesplit))
split_at = len(dataframesplit) - n_holdout

# First 80% of the shuffled rows: used to build the model.
dataframelist= dataframesplit[:split_at]

# Remaining 20% of the shuffled rows: held out to validate the model.
dataframecrossvalidate=dataframesplit[split_at:]



In [508]:
def class_counts(rows):
    """Count how many rows carry each class label.

    The label is taken from the last element of every row.

    Returns a dict mapping label -> number of rows with that label
    (empty dict for empty input).
    """
    tally = {}
    for record in rows:
        tag = record[-1]
        tally[tag] = tally.get(tag, 0) + 1
    return tally

In [509]:
class Question:
    """A yes/no test asked at a tree node: compare one column against a fixed value.

    Values whose type is exactly int or float are tested with ">=";
    everything else is tested with "==".
    """

    def __init__(self, column, value):
        # column: index of the feature inside a row
        # value:  the reference value rows are compared against
        self.column = column
        self.value = value

    def match(self, example):
        """Return True when `example` (a row) satisfies this question."""
        candidate = example[self.column]
        if type(candidate) in (int, float):
            return float(candidate) >= float(self.value)
        return candidate == self.value

    def __repr__(self):
        """Render the question in readable form, using the column names in `hd`."""
        operator = ">=" if type(self.value) in (int, float) else "=="
        return "Is %s %s %s?" % (
            hd[self.column], operator, str(self.value))

In [510]:
def results_partition(rows, question):
    """Split `rows` into two lists according to `question`.

    Each row is passed to `question.match`; rows for which it is truthy go to
    the first list, the others to the second. Row order is preserved.

    Returns the tuple (true_instances, false_instances).
    """
    matched, unmatched = [], []
    for record in rows:
        bucket = matched if question.match(record) else unmatched
        bucket.append(record)
    return matched, unmatched

In [511]:
def gini(rows):
    """Return the Gini impurity of `rows`: 1 minus the sum of squared label shares.

    0 means every row has the same label; larger values mean the labels are
    more mixed. The whole training list is passed for the root node, and the
    true/false partitions are passed separately further down the tree.
    An empty list yields 1 because there are no label shares to subtract.
    """
    n_rows = float(len(rows))
    impurity = 1
    for n_label in class_counts(rows).values():
        impurity -= (n_label / n_rows) ** 2
    return impurity

In [512]:
def info_gain(left, right, current_uncertainty):
    """Return how much a split reduces impurity.

    The gain is `current_uncertainty` (impurity before the split) minus the
    Gini impurities of the two partitions, each weighted by its share of rows.
    """
    n_left, n_right = len(left), len(right)
    share_left = float(n_left) / (n_left + n_right)
    weighted_left = share_left * gini(left)
    weighted_right = (1 - share_left) * gini(right)
    return current_uncertainty - weighted_left - weighted_right

In [513]:
def find_best_split(rows):
    """Search every (column, value) pair for the question with the highest gain.

    Every distinct value seen in each feature column (all columns except the
    last, which holds the label) is tried as a question. Questions that leave
    one side empty are ignored.

    Returns (best_gain, best_question); best_question is None when no question
    separates the rows at all.
    """
    top_gain, top_question = 0, None
    baseline = gini(rows)
    feature_columns = range(len(rows[0]) - 1)
    for col in feature_columns:
        for candidate in set(row[col] for row in rows):
            probe = Question(col, candidate)
            matched, unmatched = results_partition(rows, probe)
            # A split that sends every row to the same side tells us nothing.
            if not matched or not unmatched:
                continue
            gain = info_gain(matched, unmatched, baseline)
            # ">=" means that among equally good questions the last one tried wins.
            if gain >= top_gain:
                top_gain, top_question = gain, probe

    return top_gain, top_question

In [514]:
class Leaf:
    """Terminal node of the tree.

    Stores, for the training rows that reached this node, how many rows
    carry each class label.
    """

    def __init__(self, rows):
        # predictions: dict of label -> row count, as produced by class_counts
        self.predictions = class_counts(rows)

In [515]:
class Decision_Node:
    """Internal node of the tree: a question plus the two subtrees it leads to.

    `true_branch` is followed by rows that satisfy the question,
    `false_branch` by rows that do not.
    """

    def __init__(self,question,true_branch,false_branch):
        self.question, self.true_branch, self.false_branch = (
            question, true_branch, false_branch)

In [516]:
def build_tree(rows):
    """Recursively grow a decision tree from `rows` and return its root node.

    When the best available split has zero gain the rows become a Leaf;
    otherwise the rows are partitioned by the best question and a subtree is
    built for each side.
    """
    gain, question = find_best_split(rows)
    if gain == 0:
        # Nothing left to separate: stop here.
        return Leaf(rows)
    matched, unmatched = results_partition(rows, question)
    return Decision_Node(question, build_tree(matched), build_tree(unmatched))

In [517]:
# Grow two trees: `my_tree` from the 80% training portion only, and
# `my_complete_tree` from all of the (shuffled) training data.
my_tree, my_complete_tree = build_tree(dataframelist), build_tree(dataframesplit)

In [518]:
def classify(row, node):
    """Walk the tree from `node` down to a leaf and return that leaf's predictions.

    At every decision node the row is tested against the node's question;
    the true branch is taken on a match, the false branch otherwise.
    The result is the leaf's label -> count dict.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

In [519]:
def print_leaf(counts):
    """Return the predicted label for a leaf: the label with the highest count.

    `counts` is a label -> count dict (a leaf's predictions). Previously the
    first key of the dict was returned regardless of its count, so a mixed leaf
    could predict a minority label. Ties are resolved in favour of the label
    that comes first in the dict's iteration order.

    Returns None when `counts` is empty. Despite the name, nothing is printed.
    """
    if not counts:
        return None
    return max(counts, key=counts.get)

In [520]:

# Read the test set into the frame `dft`. The file is comma separated and has no
# header row, so it is loaded exactly like the training file: read_csv with the
# shared column-name list `hd`, instead of a second hand-typed copy of the names.
dft = pd.read_csv('test.csv', header=None, names=hd)
dft.head()


Unnamed: 0,Type,LifeStyle,Vacation,eCredit,Salary,Property,Label
0,student,spend<saving,12,19,14.79,3.7697,C1
1,student,spend>>saving,29,10,16.19,2.4839,C1
2,student,spend<<saving,28,60,15.46,1.1885,C1
3,engineer,spend>saving,15,41,21.26,1.4379,C1
4,librarian,spend<saving,2,9,19.7207,0.6913,C1


In [521]:

# Turn the test frame into a list of rows and cast columns 2 and 3
# ('Vacation', 'eCredit') to float, mirroring the training-data preparation.
dataframetest= dft.values.tolist()
for row in dataframetest:
    for position in (2, 3):
        row[position] = float(row[position])



In [522]:
count=0
total=0
for row in dataframecrossvalidate:
    if(row[-1]==print_leaf(classify(row, my_tree))):
        count+=1
    total+=1
    
print 'Accuarcy for cross validation :',(count*100)/total


Accuarcy for cross validation : 89


In [523]:
count=0
total=0
for row in dataframetest:
    if(row[-1]==print_leaf(classify(row, my_tree))):
        count+=1
    total+=1
    
print 'Accuarcy for testing data set  using 80% of data :',(count*100)/total


Accuarcy for testing data set  using 80% of data : 14


In [524]:
count=0
total=0
for row in dataframetest:
    if(row[-1]==print_leaf(classify(row, my_complete_tree))):
        count+=1
    total+=1
    
print 'Accuarcy for testing data set using 100% of data :',(count*100)/total


Accuarcy for testing data set using 100% of data : 14


In [525]:
# To use the stock sklearn classifier the frames must not contain strings,
# so every string-valued column is replaced by an integer code.
# NOTE(review): this overwrites columns of `df` and `dft` in place; re-running the
# cell would presumably map the already-encoded values to NaN - confirm before re-running.
type_codes = {'student':1,'engineer':2,'librarian':3,'professor':4,'doctor':5 }
lifestyle_codes = {'spend<<saving':1, 'spend<saving':2, 'spend>saving':3, 'spend>>saving':4}
label_codes = {'C1':1, 'C2':2 ,'C3':3 ,'C4':4 ,'C5':5}

# Apply the same encoding to the training frame first, then the test frame.
for frame in (df, dft):
    frame['Type'] = frame['Type'].map(type_codes)
    frame['LifeStyle'] = frame['LifeStyle'].map(lifestyle_codes)
    frame['Label'] = frame['Label'].map(label_codes)



In [526]:
from sklearn import tree

# Separate each frame into features (every column except 'Label') and target ('Label').
x_train, y_train = df.drop('Label',axis=1), df['Label']
x_test, y_test = dft.drop('Label',axis=1), dft['Label']


In [527]:
# Fit sklearn's decision tree, limited to depth 3, on the encoded training data.
# fit() returns the estimator itself, so construction and fitting are chained;
# the bare `clf` on the last line keeps the fitted estimator as the cell output.
clf = tree.DecisionTreeClassifier(max_depth=3).fit(x_train, y_train)
clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [528]:
# predicitng the clusters for our testing data
y_pred = clf.predict(x_test)
#print y_pred
count=0.0
for i in y_pred:
    if(i==1):
        count+=1
print 'Accuracy of testing data by using default decision tree classifier:', (count*100)/len(y_pred)



Accuracy of testing data by using default decision tree classifier: 23.8095238095
