In [None]:
# Mount Google Drive at /content/drive (needs the google.colab package, i.e. a Colab runtime)
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
import pandas as pd
# The file appears to have no header row: with the default read, the column
# labels are data values ('vhigh', '2', 'small', 'unacc', ...) and one record
# is lost. header=None keeps that first record as data; dtype=str keeps every
# cell as its category string.
df = pd.read_csv('/content/drive/MyDrive/Dataset/car.csv', header=None, dtype=str)
df.shape

(1727, 7)

In [None]:
# Preview the loaded frame (the notebook display truncates the middle rows)
df

Unnamed: 0,vhigh,vhigh.1,2,2.1,small,low,unacc
0,vhigh,vhigh,2,2,small,med,unacc
1,vhigh,vhigh,2,2,small,high,unacc
2,vhigh,vhigh,2,2,med,low,unacc
3,vhigh,vhigh,2,2,med,med,unacc
4,vhigh,vhigh,2,2,med,high,unacc
...,...,...,...,...,...,...,...
1722,low,low,5more,more,med,med,good
1723,low,low,5more,more,med,high,vgood
1724,low,low,5more,more,big,low,unacc
1725,low,low,5more,more,big,med,good


In [None]:
import numpy as np
# Distinct values pooled over every column of the frame
test = np.unique(df)
test

array(['2', '3', '4', '5more', 'acc', 'big', 'good', 'high', 'low', 'med',
       'more', 'small', 'unacc', 'vgood', 'vhigh'], dtype=object)

In [None]:
# Call head(): the bare attribute only displays the bound method, not the rows
df.head()

In [None]:
# Show only the first rows as nested lists; dumping every row floods the cell output
df.head().values.tolist()

In [None]:
"""
Hồ Thái Đạt: N20DCCN014
Đổ Xuân Minh: N20DCCN041
Nguyễn Văn Thuận: N20DCCN077
"""


import pandas as pd
import numpy as np
import math
from sklearn import datasets
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Tree node: an internal decision node, or a leaf that stores a prediction
class Node:
    """One node of the decision tree.

    Attributes:
        feature: column index tested at this node (internal nodes).
        threshold: samples with ``x[feature] <= threshold`` go to ``left``,
            all others go to ``right``.
        left, right: child nodes.
        value: predicted label; set only on leaves.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # `value` is keyword-only, so a positional call always builds an
        # internal node and a leaf has to be created as Node(value=...).
        self.feature, self.threshold = feature, threshold
        self.left, self.right = left, right
        self.value = value

    def is_leaf_node(self):
        """Return True when the node carries a prediction, i.e. it is a leaf."""
        return not (self.value is None)

# Decision tree classifier written from scratch (entropy / information gain)
class DecisionTree:
    """Binary decision tree classifier using ``feature <= threshold`` splits.

    Args:
        min_samples_split: a node with fewer samples than this becomes a leaf.
        max_depth: maximum depth of the tree; nodes at this depth become leaves.
        n_features: number of feature columns sampled (without replacement) as
            split candidates at every node. ``None`` means all columns.
    """

    def __init__(self, min_samples_split=2, max_depth=100, n_features=None):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.n_features = n_features
        self.root = None

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: 2-D array-like of shape (n_samples, n_columns).
            y: 1-D array-like with one label per row of ``X``.

        Raises:
            ValueError: if ``X`` is not 2-D, is empty, or its row count
                differs from ``len(y)``.

        Note:
            ``self.n_features`` is overwritten with the number of columns
            actually used (capped at the width of ``X``).
        """
        # Accept lists / DataFrames / Series as well as ndarrays.
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if X.shape[0] == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.n_features = X.shape[1] if not self.n_features else min(X.shape[1], self.n_features)
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples ``(X, y)``."""
        n_samples, n_feats = X.shape
        n_labels = len(np.unique(y))

        # Stopping criteria: depth limit, pure node, or too few samples.
        if depth >= self.max_depth or n_labels == 1 or n_samples < self.min_samples_split:
            return Node(value=self._most_common_label(y))

        # Random subset of columns considered at this node (global NumPy RNG).
        feat_idxs = np.random.choice(n_feats, self.n_features, replace=False)

        best_feature, best_thresh = self._best_split(X, y, feat_idxs)
        if best_feature is None:
            # No candidate separates the samples (every sampled column is
            # constant here), so recursing could only produce an empty child.
            return Node(value=self._most_common_label(y))

        left_idxs, right_idxs = self._split(X[:, best_feature], best_thresh)
        left = self._grow_tree(X[left_idxs, :], y[left_idxs], depth + 1)
        right = self._grow_tree(X[right_idxs, :], y[right_idxs], depth + 1)
        return Node(best_feature, best_thresh, left, right)

    def _best_split(self, X, y, feat_idxs):
        """Return ``(feature_index, threshold)`` with the highest information gain.

        Returns ``(None, None)`` when none of the candidate columns has at
        least two distinct values.
        """
        best_gain = -1
        split_idx, split_threshold = None, None

        for feat_idx in feat_idxs:
            X_column = X[:, feat_idx]

            # The largest value is not a usable threshold: `<= max` sends every
            # sample left and leaves the right child empty. Every remaining
            # threshold yields two non-empty children.
            thresholds = np.unique(X_column)[:-1]

            for thr in thresholds:
                gain = self._information_gain(y, X_column, thr)

                if gain > best_gain:
                    best_gain = gain
                    split_idx = feat_idx
                    split_threshold = thr

        return split_idx, split_threshold

    def _information_gain(self, y, X_column, threshold):
        """Parent entropy minus the size-weighted entropy of the two children."""
        parent_entropy = self._entropy(y)

        left_idxs, right_idxs = self._split(X_column, threshold)

        # A split that leaves one side empty carries no information.
        if len(left_idxs) == 0 or len(right_idxs) == 0:
            return 0

        n = len(y)
        n_l, n_r = len(left_idxs), len(right_idxs)
        e_l, e_r = self._entropy(y[left_idxs]), self._entropy(y[right_idxs])
        child_entropy = (n_l / n) * e_l + (n_r / n) * e_r

        return parent_entropy - child_entropy

    def _split(self, X_column, split_thresh):
        """Return the row indices going left (``<= split_thresh``) and right (``>``)."""
        left_idxs = np.argwhere(X_column <= split_thresh).flatten()
        right_idxs = np.argwhere(X_column > split_thresh).flatten()
        return left_idxs, right_idxs

    def _entropy(self, y):
        """Shannon entropy (natural log) of the label distribution in ``y``."""
        # np.unique counts any sortable label type; np.bincount would only
        # accept non-negative integers.
        _, counts = np.unique(y, return_counts=True)
        ps = counts / len(y)
        return -np.sum([p * np.log(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        """Return the most frequent label in ``y`` (first seen wins a tie)."""
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``."""
        X = np.asarray(X)
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its value."""
        while not node.is_leaf_node():
            node = node.left if x[node.feature] <= node.threshold else node.right
        return node.value



# Data preprocessing: integer code for every category of each column.
# The constant names (spelling included) are what change_dataset() looks up.

# Columns 0 and 1: 'vhigh' -> 1 ... 'low' -> 4
BYING = {level: code for code, level in enumerate(('vhigh', 'high', 'med', 'low'), start=1)}
MAINT = {level: code for code, level in enumerate(('vhigh', 'high', 'med', 'low'), start=1)}

# Column 2
DOOR = {level: code for code, level in enumerate(('2', '3', '4', '5more'), start=1)}

# Column 3
PERSON = {level: code for code, level in enumerate(('2', '4', 'more'), start=1)}

# Column 4
LOG_BOOT = {level: code for code, level in enumerate(('small', 'med', 'big'), start=1)}

# Column 5
SAFETY = {level: code for code, level in enumerate(('low', 'med', 'high'), start=1)}

# Column 6 (class label): codes start at 0
LABLE = {level: code for code, level in enumerate(('unacc', 'acc', 'good', 'vgood'))}

# Replace the category strings of every row with their integer codes
def change_dataset(rows=None):
    """Encode the seven categorical columns of each row in place.

    Args:
        rows: list of 7-item mutable rows (e.g. ``DataFrame.values.tolist()``).
            When omitted, the module-level ``dataset`` is encoded, which is
            what a plain ``change_dataset()`` call has always done.

    Raises:
        KeyError: if a cell holds a value with no code in its column's table.
            This also happens when the rows were already encoded, so run it
            once per freshly loaded dataset.
    """
    if rows is None:
        rows = dataset

    # One lookup table per column, in column order.
    encoders = (BYING, MAINT, DOOR, PERSON, LOG_BOOT, SAFETY, LABLE)
    for row in rows:
        for col, encoder in enumerate(encoders):
            row[col] = encoder[row[col]]

# Split a DataFrame into its feature matrix and its label vector
def get_data_label(data_f):
    """Return ``(data, label)`` taken from ``data_f``.

    The last column is treated as the label; every other column is a feature.
    Columns are selected by position, so the result does not depend on the
    column names.

    Args:
        data_f: pandas DataFrame whose last column holds the label.

    Returns:
        data: 2-D array with all columns except the last.
        label: 1-D array with the values of the last column.
    """
    data = data_f.iloc[:, :-1].values
    label = data_f.iloc[:, -1].values

    return data, label

# Fixed seed so the shuffle, the train/test split and the tree's random
# feature sampling give the same result on every run
SEED = 42
np.random.seed(SEED)

# header=None: the file appears to have no header row, and the default read
# would turn the first record into column names and drop it from the data.
# dtype=str keeps every cell as the category string the encoders look up.
dataset = pd.read_csv('/content/drive/MyDrive/Dataset/car.csv', header=None, dtype=str)
dataset = dataset.values.tolist()

# Change dataset to int (change_dataset() rewrites the module-level `dataset` in place)
change_dataset()

dataset = np.array(dataset)

# Every column except the last is a feature; the last column is the class label
dataset_x = dataset[:, 0:-1]
dataset_y = dataset[:, -1]

# Shuffle the rows with one shared permutation so features and labels stay aligned
randIndex = np.arange(dataset_x.shape[0])
np.random.shuffle(randIndex)

dataset_x = dataset_x[randIndex]
dataset_y = dataset_y[randIndex]

X_train, X_test, y_train, y_test = train_test_split(
    dataset_x, dataset_y, test_size=0.2, random_state=SEED
)

# Create the model
clf = DecisionTree(max_depth=10)
# Train the model
clf.fit(X_train, y_train)
# Predict the labels of the held-out samples
predictions = clf.predict(X_test)

# Fraction of predictions that match the true labels
def accuracy(y_test, y_pred):
    """Return the share of positions where ``y_pred`` equals ``y_test``.

    Args:
        y_test: array-like of true labels.
        y_pred: array-like of predicted labels, same shape as ``y_test``.

    Raises:
        ValueError: if the two inputs differ in shape or are empty.
    """
    # Convert first: comparing two plain lists with == yields one bool for the
    # whole list instead of an element-wise result.
    y_test = np.asarray(y_test)
    y_pred = np.asarray(y_pred)
    if y_test.shape != y_pred.shape:
        raise ValueError("y_test and y_pred must have the same shape")
    if y_test.size == 0:
        raise ValueError("accuracy is undefined for empty input")
    return np.sum(y_test == y_pred) / y_test.size

# Score the hand-written tree on the held-out split
acc = accuracy(y_test, predictions)
print('Accuracy My DecisionTree: ' + str(acc))

# Baseline: scikit-learn's DecisionTreeClassifier trained on the same split
classifier = DecisionTreeClassifier()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# `acc` is reused here: from this point it holds the scikit-learn score
acc = accuracy(y_test, y_pred)
print('Accuracy sklearn DecisionTree: ' + str(acc))

Accuracy My DecisionTree: 0.9710982658959537
Accuracy sklearn DecisionTree: 0.9855491329479769
