In [3]:
import pandas as pd
import math
import numpy as np

In [4]:
# reading from csv file

data = pd.read_csv("../Datasets/playtennis.csv")
print(data)

features = [feat for feat in data]
features.remove("play")
print(features)

     outlook  temp humidity  windy play
0      sunny   hot     high  False   no
1      sunny   hot     high   True   no
2   overcast   hot     high  False  yes
3      rainy  mild     high  False  yes
4      rainy  cool   normal  False  yes
5      rainy  cool   normal   True   no
6   overcast  cool   normal   True  yes
7      sunny  mild     high  False   no
8      sunny  cool   normal  False  yes
9      rainy  mild   normal  False  yes
10     sunny  mild   normal   True  yes
11  overcast  mild     high   True  yes
12  overcast   hot   normal  False  yes
13     rainy  mild     high   True   no
['outlook', 'temp', 'humidity', 'windy']


In [5]:
class Node:
    def __init__(self):
        self.children = []
        self.value = "" 
        self.isLeaf = False
        self.pred = "" 

In [6]:
def entropy(examples):
    pos = 0.0
    neg = 0.0
    for _, row in examples.iterrows():
        if row["play"] == "yes":
            pos += 1
        else:
            neg += 1
    if pos == 0.0 or neg == 0.0:
        return 0.0
    else:
        p = pos / (pos + neg)
        n = neg / (pos + neg)
    return -(p * math.log(p, 2) + n * math.log(n, 2))

In [7]:
def info_gain(examples, attr):
    uniq = np.unique(examples[attr])
    #print ("\n",uniq)
    gain = entropy(examples)
    #print ("\n",gain)
    for u in uniq:
        subdata = examples[examples[attr] == u]
        #print ("\n",subdata)
        sub_e = entropy(subdata)
        gain -= (float(len(subdata)) / float(len(examples))) * sub_e
        #print ("\n",gain)
    return gain

In [8]:
def ID3(examples, attrs):
    root = Node()

    max_gain = 0
    max_feat = ""
    for feature in attrs:
    #print ("\n",examples)
        gain = info_gain(examples, feature)
        if gain > max_gain:
            max_gain = gain
            max_feat = feature
    root.value = max_feat
    #print ("\nMax feature attr",max_feat)
    uniq = np.unique(examples[max_feat])
    #print ("\n",uniq)
    for u in uniq:
        #print ("\n",u)
        subdata = examples[examples[max_feat] == u]
        #print ("\n",subdata)
        if entropy(subdata) == 0.0:
            newNode = Node()
            newNode.isLeaf = True
            newNode.value = u
            newNode.pred = np.unique(subdata["play"])
            root.children.append(newNode)
        else:
            dummyNode = Node()
            dummyNode.value = u
            new_attrs = attrs.copy()
            new_attrs.remove(max_feat)
            child = ID3(subdata, new_attrs)
            dummyNode.children.append(child)
            root.children.append(dummyNode)

    return root

In [9]:
 def printTree(root:Node, depth=0):
    for i in range(depth):
        print("\t", end="")
    print(root.value, end="")
    if root.isLeaf:
        print(" -> ", root.pred)
    print()
    for child in root.children:
        printTree(child, depth + 1)

In [10]:
def classify(root, new):
    for child in root.children:
        if child.value == new[root.value]:
            if child.isLeaf:
                print ("Predicted Label for new example", new," is:", child.pred)
                exit
            else:
                classify (child.children[0], new)

In [11]:
root = ID3(data, features)
print("Decision Tree is:")
printTree(root)
print ("------------------")

new = {"outlook":"sunny", "temperature":"hot", "humidity":"normal", "wind":"strong"}
classify (root, new)

Decision Tree is:
outlook
	overcast ->  ['yes']

	rainy
		windy
			False ->  ['yes']

			True ->  ['no']

	sunny
		humidity
			high ->  ['no']

			normal ->  ['yes']

------------------
Predicted Label for new example {'outlook': 'sunny', 'temperature': 'hot', 'humidity': 'normal', 'wind': 'strong'}  is: ['yes']


# With Package

In [119]:
from sklearn import tree
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [120]:
# Load data from CSV
data = pd.read_csv("../Datasets/playtennis.csv")
print("The first 5 values of data is \n",data.head())

The first 5 values of data is 
     outlook  temp humidity  windy play
0     sunny   hot     high  False   no
1     sunny   hot     high   True   no
2  overcast   hot     high  False  yes
3     rainy  mild     high  False  yes
4     rainy  cool   normal  False  yes


In [121]:
# Obtain Train data and Train output
X = data.iloc[:,:-1]
print("\nThe first 5 values of Train data is \n",X.head())


The first 5 values of Train data is 
     outlook  temp humidity  windy
0     sunny   hot     high  False
1     sunny   hot     high   True
2  overcast   hot     high  False
3     rainy  mild     high  False
4     rainy  cool   normal  False


In [122]:
y = data.iloc[:,-1]
print("\nThe first 5 values of Train output is \n",y.head())


The first 5 values of Train output is 
 0     no
1     no
2    yes
3    yes
4    yes
Name: play, dtype: object


In [123]:
# Convert them in numbers
le_outlook = LabelEncoder()
X.outlook =  le_outlook.fit_transform(X.outlook)

le_Temperature = LabelEncoder()
X.temp =  le_Temperature.fit_transform(X.temp)

le_Humidity = LabelEncoder()
X.humidity =  le_Humidity.fit_transform(X.humidity)

le_Windy = LabelEncoder()
X.windy =  le_Windy.fit_transform(X.windy)
print("\nNow the Train data is\n",X.head())


Now the Train data is
    outlook  temp  humidity  windy
0        2     1         0      0
1        2     1         0      1
2        0     1         0      0
3        1     2         0      0
4        1     0         1      0


In [124]:
le_PlayTennis = LabelEncoder()
y =  le_PlayTennis.fit_transform(y)
print("\nNow the Train data is\n",y)


Now the Train data is
 [0 0 1 1 1 0 1 0 1 1 1 1 1 0]


In [125]:
## Train model
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(X,y)


In [127]:
# Test model
def labelEncoderForInput(list1):
    list1[0] =  le_outlook.transform([list1[0]])[0]
    list1[1] =  le_Temperature.transform([list1[1]])[0]
    list1[2] =  le_Humidity.transform([list1[2]])[0]
    list1[3] =  le_Windy.transform([list1[3]])[0]
    return [list1]

## predict for an input

inp1=["sunny","hot","normal","False"]
inp=inp1.copy()
pred1 = labelEncoderForInput(inp1)

y_pred = classifier.predict(pred1)
print(y_pred)
print("The prediction for {0} is {1}".format(inp, le_PlayTennis.inverse_transform(y_pred)))

[1]
The prediction for ['sunny', 'hot', 'normal', 'False'] is ['yes']


