**Aim:** Write a program to exhibit the decision tree based C4.5 Algorithm <br>
**Theory:** C4.5 is an algorithm used to generate a decision tree developed by Ross Quinlan.C4.5 is an extension of Quinlan's earlier ID3 algorithm. The decision trees generated by C4.5 can be used for classification, and for this reason, C4.5 is often referred to as a statistical classifier. In 2011, authors of the Weka machine learning software described the C4.5 algorithm as "a landmark decision tree program that is probably the machine learning workhorse most widely used in practice to date".<br>




**Code:**<br>
Dataset used is **Iris Dataset**


In [1]:
#importing libraires
import pandas as pd
import numpy as np

In [28]:
df=pd.read_csv('datasets_19_420_Iris.csv')
df=df.drop('Id',axis=1)

df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [29]:
from sklearn.preprocessing import LabelEncoder 

In [30]:
le=LabelEncoder()
df['Species']=le.fit_transform(df['Species'])
df['Species'].value_counts()

2    50
1    50
0    50
Name: Species, dtype: int64

In [31]:
from sklearn.model_selection import train_test_split

In [32]:
X=df.copy()
y=df.copy()

In [33]:
X=X.drop(columns=['Species'])
X.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [34]:
attribute=['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm']

In [35]:
y=y.drop(columns=attribute)
y.head()

Unnamed: 0,Species
0,0
1,0
2,0
3,0
4,0


In [36]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [40]:
x_final_df= pd.concat([X_train,y_train],axis=1)

In [53]:
x_final_df=x_final_df.rename(columns={'Species':'label'})
x_final_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,label
128,6.4,2.8,5.6,2.1,2
131,7.9,3.8,6.4,2.0,2
145,6.7,3.0,5.2,2.3,2
108,6.7,2.5,5.8,1.8,2
143,6.8,3.2,5.9,2.3,2


In [44]:
eps=np.finfo(float).eps


# C4.5 Implementation


In [13]:
def totalEntropy(df) :
    label=df.keys()[-1]
    entropy=0
    items=df[label].unique()
    for value in items :
        fraction = df[label].value_counts() [value]/len(df[label])
        entropy+= -fraction*np.log2(fraction)
    return entropy



In [15]:
def entropyAttribute(df,attribute) :
    label=df.keys()[-1]
    targetVariables=df[label].unique()
    variables=df[attribute].unique()
    finalEntropy=0
    
    total_examples = len(df)*float(1.0)
    split_info =float (0.0)
    
    for var in variables:
        entropy=0;
        for tar in targetVariables:
            num=len(df[attribute][df[attribute]==var][df[label]==tar])
            den=len(df[attribute][df[attribute]==var])
            fraction=num/den
            entropy+=-fraction*np.log2(fraction+eps)
        fraction2=den/len(df)
        entropy2=-fraction2*entropy
        
        frequency=len(df[attribute][df[attribute]==var])
        split_info += (frequency/total_examples)*np.log2((frequency/total_examples))*float(-1.0)
        
    return 0 if split_info == 0 else abs(entropy2 / split_info)
            

In [17]:
def highestInfoGain(df):
    attributes=df.keys()[:-1]
    infoGain=[]
    entropy=totalEntropy(df)
    
    for key in attributes :
        ratio = entropyAttribute(df,key)
        infoGain.append(0 if ratio == 0 else (entropy-ratio)/ratio)
        
    return df.keys()[:-1][np.argmax(infoGain)]


In [18]:
def subTable(df, node, value):
    '''
        node: Column name
        value: Unique value of the column
    '''
    return df[df[node] == value].reset_index(drop=True)

In [16]:
def createTree(df, tree=None) :
    # Store Labels in a list
    label=(df.keys()[-1])
    
    #Find node which gives highest info gain
    node =highestInfoGain(df)
    
    #Find all the different attributes of that column
    attValue=np.unique(df[node])
    
    #If tree doesn't exist then create a dictionary to store nodes of tree 
    if tree is None:
        tree={}
        tree[node]={}
        
    #We make loop to construct a tree by calling this function recursively. 
    #In this we check if the subset is pure and stops if it is pure. 
    
    for value in attValue:
        divideTable=subTable(df,node,value)
        colVal,counts= np.unique(divideTable['label'], return_counts=True)
        
        if len(counts)==1:
            tree[node][value]=colVal[0]
        else :
            tree[node][value]=createTree(divideTable)
    return tree
    

In [54]:
import pprint
tree= createTree(x_final_df)
pprint.pprint(tree)

{'PetalLengthCm': {1.0: 0,
                   1.1: 0,
                   1.2: 0,
                   1.3: 0,
                   1.4: 0,
                   1.5: 0,
                   1.6: 0,
                   1.7: 0,
                   1.9: 0,
                   3.0: 1,
                   3.3: 1,
                   3.5: 1,
                   3.7: 1,
                   3.8: 1,
                   3.9: 1,
                   4.0: 1,
                   4.1: 1,
                   4.2: 1,
                   4.3: 1,
                   4.4: 1,
                   4.5: {'SepalLengthCm': {4.9: 2,
                                           5.4: 1,
                                           5.6: 1,
                                           6.0: 1,
                                           6.4: 1}},
                   4.6: 1,
                   4.7: 1,
                   4.8: {'SepalLengthCm': {5.9: 1, 6.0: 2, 6.2: 2}},
                   4.9: {'SepalWidthCm': {2.5: 1, 2.7: 2, 2.8: 2, 3.1: 1}},
    

In [55]:
def predict(test,tree):
    #Recursively we go through the tree that we built earlier
    for nodes in tree.keys():        
        
        value = test[nodes]
        if value in tree[nodes].keys():
            tree = tree[nodes][value]
        else:
            m = 10000
            ans = 0
            for i in tree[nodes].keys():
                  if(abs(value-i)<m):
                        m=abs(value-i)
                        ans = i
            tree = tree[nodes][ans]
        prediction = 0
            
        if type(tree) is dict:
            prediction = predict(test, tree)
        else:
            prediction = tree
            break 

    return prediction

In [56]:
X_test.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
73,6.1,2.8,4.7,1.2
18,5.7,3.8,1.7,0.3
118,7.7,2.6,6.9,2.3
78,6.0,2.9,4.5,1.5
76,6.8,2.8,4.8,1.4


In [59]:
prediction=[]

for val in range(len(X_test)):
    prediction.append(predict(X_test.iloc[val],tree))
pprint.pprint(prediction)

[1, 0, 2, 1, 2, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 1, 0]


In [82]:
def score(labels, y_test):
        return (labels == y_test['Species']).sum() / len(y_test)

In [83]:
score(prediction,y_test)

0.9130434782608695