# Decision Tree ID3 Classification


## 1) Import Data

In [1]:
# Download the sample CSV from Google Drive and save it locally as car_sample.csv.
# Earlier source, kept for reference: https://dataset-ppm.s3.amazonaws.com/car_sample.csv
# Drive page for the same file: https://drive.google.com/file/d/1zSZwCH9N1XgcNHDk35nTgYlLQ2HKxU_7/view?usp=drive_link
# NOTE(review): --no-check-certificate turns off TLS certificate verification; confirm it is really needed.
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1zSZwCH9N1XgcNHDk35nTgYlLQ2HKxU_7' -O car_sample.csv

--2024-02-06 23:09:57--  https://docs.google.com/uc?export=download&id=1zSZwCH9N1XgcNHDk35nTgYlLQ2HKxU_7
Resolving docs.google.com (docs.google.com)... 172.217.164.14, 2607:f8b0:4025:803::200e
Connecting to docs.google.com (docs.google.com)|172.217.164.14|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1zSZwCH9N1XgcNHDk35nTgYlLQ2HKxU_7&export=download [following]
--2024-02-06 23:09:57--  https://drive.usercontent.google.com/download?id=1zSZwCH9N1XgcNHDk35nTgYlLQ2HKxU_7&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 172.217.0.65, 2607:f8b0:4025:810::2001
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|172.217.0.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 53645 (52K) [application/octet-stream]
Saving to: ‘car_sample.csv’


2024-02-06 23:09:58 (8.14 MB/s) - ‘car_sample.csv’ saved [53645/53645]



In [2]:
import pandas as pd
import numpy as np
# Load the downloaded CSV into a DataFrame.
data = pd.read_csv('car_sample.csv')

In [3]:
# Preview the first rows to check the columns and the label column.
data.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


## 2) Splitting Training and Testing Data


In [4]:
from sklearn.model_selection import train_test_split
# 70/30 split; random_state is fixed so the split is reproducible.
data_latih,data_uji = train_test_split(data,test_size=0.3,random_state=101)
# reset_index returns a new frame, so the result has to be assigned back;
# otherwise both splits keep the shuffled index of the original data.
data_latih = data_latih.reset_index(drop=True)
data_uji = data_uji.reset_index(drop=True)
# Display the test split, as the cell did before.
data_uji

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,high,4,more,med,med,unacc
1,low,med,2,more,small,high,unacc
2,vhigh,low,5more,2,big,med,unacc
3,low,vhigh,3,more,small,med,unacc
4,med,low,3,more,small,low,unacc
...,...,...,...,...,...,...,...
514,high,high,2,2,med,med,unacc
515,high,vhigh,3,4,big,med,unacc
516,med,low,5more,2,big,high,unacc
517,med,med,5more,2,big,med,unacc


Tampilkan jumlah data pada **data_latih** dan **data_uji**. Dengan `test_size=0.3`, **data_latih** seharusnya terdiri dari 1209 data, dan **data_uji** terdiri dari 519 data.

In [5]:
# Row counts of the two splits: test set first, then training set.
print(len(data_uji))
print(len(data_latih))

519
1209


## 3) Determining GINI Score

In [6]:
def hitung_gini(kolom_kelas):
  """Return the Gini impurity of a collection of class labels.

  Computed as 1 minus the sum of squared class proportions, where the
  proportions come from the label counts reported by np.unique.
  """
  _, frekuensi = np.unique(kolom_kelas, return_counts=True)
  proporsi = frekuensi / np.sum(frekuensi)
  return 1 - np.sum(proporsi ** 2)

In [7]:
def gini_split(data,nama_fitur_split,nama_fitur_kelas):
  """Return the weighted Gini impurity of `data` when split on one feature.

  For every distinct value of `nama_fitur_split`, the Gini impurity of the
  class labels in that partition is weighted by the share of rows carrying
  that value, and the weighted impurities are summed.
  """
  kolom_split = data[nama_fitur_split]
  nilai_unik, frekuensi = np.unique(kolom_split, return_counts=True)
  total = np.sum(frekuensi)
  kontribusi = []
  for nilai, banyak in zip(nilai_unik, frekuensi):
    # Rows of this partition; rows with any missing value are dropped.
    kelas_partisi = data.where(kolom_split == nilai).dropna()[nama_fitur_kelas]
    kontribusi.append((banyak / total) * hitung_gini(kelas_partisi))
  return np.sum(kontribusi)

In [8]:
# Weighted Gini of the training split when it is partitioned on 'buying'.
gini_split(data_latih,"buying","class")

0.45137292092994846

## 4) Training

In [9]:
def buat_tree(data,data_awal,daftar_fitur,nama_fitur_kelas,kelas_parent_node=None):
  """Recursively build a decision tree as nested dicts, splitting on the lowest weighted Gini.

  Args:
    data: DataFrame with the rows that reach the current node.
    data_awal: the full training DataFrame; its majority class is the
      fallback label when `data` is empty.
    daftar_fitur: sequence of column names still available for splitting.
    nama_fitur_kelas: name of the class-label column.
    kelas_parent_node: majority class of the parent node, returned when no
      feature is left to split on.

  Returns:
    A class label (leaf) or a dict {feature: {value: subtree_or_label}}.
  """
  # The empty check has to come first: an empty frame has zero distinct
  # classes, so testing "at most one class" first would index into an
  # empty array and raise IndexError.
  if len(data)==0:
    kelas_awal,jumlah_awal = np.unique(data_awal[nama_fitur_kelas],return_counts=True)
    return kelas_awal[np.argmax(jumlah_awal)]

  kelas,jumlah = np.unique(data[nama_fitur_kelas],return_counts=True)
  if len(kelas)<=1:
    # Pure node: every remaining row carries the same label.
    return kelas[0]
  if len(daftar_fitur)==0:
    return kelas_parent_node

  # Majority class of this node, handed down as the children's fallback.
  kelas_parent_node = kelas[np.argmax(jumlah)]
  nilai_split = [gini_split(data,fitur,nama_fitur_kelas) for fitur in daftar_fitur]
  fitur_terbaik = daftar_fitur[np.argmin(nilai_split)]
  tree = {fitur_terbaik:{}}
  daftar_fitur = [i for i in daftar_fitur if i !=fitur_terbaik]
  for nilai in np.unique(data[fitur_terbaik]):
    # Same rows as where(...).dropna(), without upcasting column dtypes.
    sub_data = data[data[fitur_terbaik]==nilai].dropna()
    tree[fitur_terbaik][nilai] = buat_tree(sub_data,data_awal,daftar_fitur,nama_fitur_kelas,kelas_parent_node)
  return tree

In [10]:
# Build the Gini-based tree from the training split: every column except the
# last is a candidate feature and 'class' is the label column.
tree = buat_tree(data_latih,data_latih,data_latih.columns[:-1],'class')

In [11]:
# Pretty-print the nested-dict tree so the branch structure is readable.
from pprint import pprint
pprint(tree)

{'safety': {'high': {'persons': {'2': 'unacc',
                                 '4': {'buying': {'high': {'maint': {'high': 'acc',
                                                                     'low': 'acc',
                                                                     'med': 'acc',
                                                                     'vhigh': 'unacc'}},
                                                  'low': {'maint': {'high': {'lug_boot': {'big': 'vgood',
                                                                                          'med': {'doors': {'2': 'acc',
                                                                                                            '3': 'acc',
                                                                                                            '4': 'vgood'}},
                                                                                          'small': 'acc'}},
                                    

## 5) Testing

In [12]:
def prediksi(data_uji,tree,default=1):
  """Classify one record by walking a nested-dict decision tree.

  Args:
    data_uji: mapping of feature name -> value for a single record.
    tree: nested dict of the form {feature: {value: subtree_or_label}}.
    default: value returned when the record's value for the split feature
      has no branch in the tree. Defaults to 1, the fallback this function
      has always used.

  Returns:
    The label of the leaf that was reached, `default` when a branch is
    missing, or None when none of the record's keys is a split feature of
    `tree`.
  """
  for key in data_uji:
    if key in tree:
      try:
        hasil = tree[key][data_uji[key]]
      except (KeyError, TypeError):
        # Unseen (or unhashable) feature value: no branch to follow.
        # Only lookup failures are caught, so interrupts and unrelated
        # errors are no longer swallowed.
        return default
      if isinstance(hasil,dict):
        return prediksi(data_uji,hasil,default)
      return hasil
  return None

In [13]:
# One dict per test row (all columns except the last), the record format prediksi() takes.
data_uji_dict = data_uji.iloc[:,:-1].to_dict(orient="records")

In [14]:
# Predict every test record with the Gini-based tree.
hasil_prediksi_total = [prediksi(baris,tree) for baris in data_uji_dict]

In [15]:
# Count the predictions that match the true labels of the test split.
print("Total prediksi benar: ",(hasil_prediksi_total==data_uji['class']).sum())

Total prediksi benar:  457


In [16]:
# Number of test records, the denominator of the accuracy figure.
len(data_uji)

519

Accuracy: 457/519 x 100% = 88%

## Extras
Testing out the Information Gain value instead of GINI

In [17]:
def hitung_entropy(kolom_kelas):
  """Return the Shannon entropy (base 2) of a collection of class labels.

  Computed from the class proportions derived from the label counts
  reported by np.unique.
  """
  _, frekuensi = np.unique(kolom_kelas, return_counts=True)
  proporsi = frekuensi / np.sum(frekuensi)
  return np.sum(-proporsi * np.log2(proporsi))

In [18]:
def information_gain(data, nama_fitur_split, nama_fitur_kelas):
  """Return the information gain of splitting `data` on one feature.

  The gain is the entropy of the class column minus the weighted sum of
  the class entropies of the partitions induced by each distinct value of
  `nama_fitur_split`.
  """
  kolom_split = data[nama_fitur_split]
  nilai_unik, frekuensi = np.unique(kolom_split, return_counts=True)
  total = np.sum(frekuensi)
  entropy_awal = hitung_entropy(data[nama_fitur_kelas])
  entropy_partisi = []
  for nilai, banyak in zip(nilai_unik, frekuensi):
    # Rows of this partition; rows with any missing value are dropped.
    kelas_partisi = data.where(kolom_split == nilai).dropna()[nama_fitur_kelas]
    entropy_partisi.append((banyak / total) * hitung_entropy(kelas_partisi))
  return entropy_awal - np.sum(entropy_partisi)

In [19]:
def buat_tree_ig(data,data_awal, daftar_fitur, nama_fitur_kelas,kelas_parent_node=None):
  """Recursively build a decision tree as nested dicts, splitting on the highest information gain.

  Args:
    data: DataFrame with the rows that reach the current node.
    data_awal: the full training DataFrame; its majority class is the
      fallback label when `data` is empty.
    daftar_fitur: sequence of column names still available for splitting.
    nama_fitur_kelas: name of the class-label column.
    kelas_parent_node: majority class of the parent node, returned when no
      feature is left to split on.

  Returns:
    A class label (leaf) or a dict {feature: {value: subtree_or_label}}.
  """
  # The empty check has to come first: an empty frame has zero distinct
  # classes, so testing "at most one class" first would index into an
  # empty array and raise IndexError.
  if len(data)==0:
    kelas_awal,jumlah_awal = np.unique(data_awal[nama_fitur_kelas],return_counts=True)
    return kelas_awal[np.argmax(jumlah_awal)]

  kelas,jumlah = np.unique(data[nama_fitur_kelas],return_counts=True)
  if len(kelas)<=1:
    # Pure node: every remaining row carries the same label.
    return kelas[0]
  if len(daftar_fitur)==0:
    return kelas_parent_node

  # Majority class of this node, handed down as the children's fallback.
  kelas_parent_node = kelas[np.argmax(jumlah)]
  nilai_gain = [information_gain(data,fitur,nama_fitur_kelas) for fitur in daftar_fitur]
  fitur_terbaik = daftar_fitur[np.argmax(nilai_gain)]
  tree = {fitur_terbaik:{}}
  daftar_fitur = [i for i in daftar_fitur if i !=fitur_terbaik]
  for nilai in np.unique(data[fitur_terbaik]):
    # Same rows as where(...).dropna(), without upcasting column dtypes.
    sub_data = data[data[fitur_terbaik]==nilai].dropna()
    # Recurse with the information-gain builder itself, so every level of
    # the tree (not only the root) is chosen by information gain.
    tree[fitur_terbaik][nilai] = buat_tree_ig(sub_data,data_awal,daftar_fitur,nama_fitur_kelas,kelas_parent_node)
  return tree

In [20]:
# Build the information-gain tree from the same training split and candidate features.
tree_ig = buat_tree_ig(data_latih,data_latih,data_latih.columns[:-1],'class')

In [21]:
# Pretty-print the information-gain tree for comparison with the Gini-based one.
pprint(tree_ig)

{'safety': {'high': {'persons': {'2': 'unacc',
                                 '4': {'buying': {'high': {'maint': {'high': 'acc',
                                                                     'low': 'acc',
                                                                     'med': 'acc',
                                                                     'vhigh': 'unacc'}},
                                                  'low': {'maint': {'high': {'lug_boot': {'big': 'vgood',
                                                                                          'med': {'doors': {'2': 'acc',
                                                                                                            '3': 'acc',
                                                                                                            '4': 'vgood'}},
                                                                                          'small': 'acc'}},
                                    

In [22]:
# Predict every test record with the information-gain tree, then count the matches.
hasil_prediksi_total_ig = [prediksi(baris,tree_ig) for baris in data_uji_dict]
print("Total prediksi benar: ",(hasil_prediksi_total_ig==data_uji['class']).sum())

Total prediksi benar:  457


### No Difference :)
### Naive Bayes is higher, with 89% accuracy