In [1]:
import pandas as pd
import numpy as np
from models.decision_tree import DecisionTree
from model_selection import train_test_split
from models.knn import KNearestNeighbor
from models.naive_bayes import NaiveBayes

# Decision Tree

In [2]:
# Load the two CSVs used in the Decision Tree section. `dataset1` is later
# split into train/test; `dataset1_unknown` is later handed to the trained
# trees via change_test_set() for prediction.
# Paths are relative, so the notebook must be run from the directory that
# contains `DatasetModified/`.
dataset1 = pd.read_csv('DatasetModified/dataset1.csv')
dataset1_unknown = pd.read_csv('DatasetModified/dataset1_unknown.csv')

In [3]:
# Split dataset1 with train_ratio=0.8 using the project's own
# model_selection.train_test_split (not scikit-learn's).
# NOTE(review): no seed is passed here; if the split shuffles randomly, the
# trees and accuracies recorded below will differ between runs — confirm
# whether train_test_split accepts or needs a seed.
# NOTE(review): `train_set` / `test_set` are rebound for dataset2 and dataset3
# further down, so re-running Decision Tree cells after those sections would
# silently use the wrong split.
train_set, test_set = train_test_split(dataset1, train_ratio=0.8)

## Visualize the tree with different criteria

In [4]:
# Tree #1: GINI criterion on the dataset1 split, with max_depth=3 and
# min_size=10, followed by a text dump of the resulting structure.
decision_tree = DecisionTree(
    train_set,
    test_set,
    max_depth=3,
    min_size=10,
    criterion='GINI',
)
decision_tree.visualize_tree()

|--- feature_50 == 0
|  |--- feature_26 == 0
|  |  |--- feature_59 == 0
|  |  |  |--- class 0
|  |  |--- feature_59 == 1
|  |  |  |--- class 0
|  |--- feature_26 == 1
|  |  |--- feature_59 == 0
|  |  |  |--- class 1
|  |  |--- feature_59 == 1
|  |  |  |--- class 1
|--- feature_50 == 1
|  |--- feature_25 == 0
|  |  |--- feature_65 == 0
|  |  |  |--- class 0
|  |  |--- feature_65 == 1
|  |  |  |--- class 1
|  |--- feature_25 == 1
|  |  |--- feature_91 == 0
|  |  |  |--- class 1
|  |  |--- feature_91 == 1
|  |  |  |--- class 0


In [5]:
# Tree #2: same split and same max_depth / min_size as the GINI tree, but
# with the ENTROPY criterion, so the two printed structures can be compared.
decision_tree2 = DecisionTree(
    train_set,
    test_set,
    max_depth=3,
    min_size=10,
    criterion='ENTROPY',
)
decision_tree2.visualize_tree()

|--- feature_50 == 0
|  |--- feature_73 == 0
|  |  |--- feature_59 == 0
|  |  |  |--- class 0
|  |  |--- feature_59 == 1
|  |  |  |--- class 0
|  |--- feature_73 == 1
|  |  |--- feature_91 == 0
|  |  |  |--- class 0
|  |  |--- feature_91 == 1
|  |  |  |--- class 0
|--- feature_50 == 1
|  |--- feature_25 == 0
|  |  |--- feature_65 == 0
|  |  |  |--- class 0
|  |  |--- feature_65 == 1
|  |  |  |--- class 1
|  |--- feature_25 == 1
|  |  |--- feature_91 == 0
|  |  |  |--- class 1
|  |  |--- feature_91 == 1
|  |  |  |--- class 0


## Calculate Accuracy

In [6]:
# Accuracy of the GINI tree. The displayed value looks like a percentage
# (0-100) rather than a fraction.
# Presumably measured on the `test_set` given at construction — confirm
# against DecisionTree.get_accuracy. Run this before change_test_set() is
# called on the model further down.
decision_tree.get_accuracy()

80.69097888675624

In [7]:
# Accuracy of the ENTROPY tree, for comparison with the GINI tree. The
# displayed value looks like a percentage (0-100) rather than a fraction.
# Presumably measured on the `test_set` given at construction — confirm
# against DecisionTree.get_accuracy.
decision_tree2.get_accuracy()

80.7869481765835

## Predict classes for `dataset1_unknown`

In [8]:
# Point the GINI tree at the unlabelled rows and classify them.
# NOTE(review): change_test_set() presumably replaces the model's stored test
# set, so calling get_accuracy() on this model afterwards would no longer
# refer to the original hold-out split — confirm.
decision_tree.change_test_set(dataset1_unknown)
y_predicted = decision_tree.predict()

# Count by label instead of by position. np.unique only returns labels that
# actually occur, so `counts[1]` raises IndexError when a single class is
# predicted, and `counts[0]` would be reported as "class 0" even if every
# prediction were class 1. Using a named variable also avoids rebinding `_`,
# which IPython uses for the last cell output.
# Assumes the predicted labels are the numeric values 0 and 1.
labels, counts = np.unique(y_predicted, return_counts=True)
class_counts = dict(zip(labels.tolist(), counts.tolist()))
print(f"class 0: {class_counts.get(0, 0)}\nclass 1: {class_counts.get(1, 0)}")

class 0: 5691
class 1: 821


In [9]:
# Point the ENTROPY tree at the unlabelled rows and classify them.
# NOTE(review): change_test_set() presumably replaces the model's stored test
# set, so calling get_accuracy() on this model afterwards would no longer
# refer to the original hold-out split — confirm.
decision_tree2.change_test_set(dataset1_unknown)
y_predicted = decision_tree2.predict()

# Count by label instead of by position. np.unique only returns labels that
# actually occur, so `counts[1]` raises IndexError when a single class is
# predicted, and `counts[0]` would be reported as "class 0" even if every
# prediction were class 1. Using a named variable also avoids rebinding `_`,
# which IPython uses for the last cell output.
# Assumes the predicted labels are the numeric values 0 and 1.
labels, counts = np.unique(y_predicted, return_counts=True)
class_counts = dict(zip(labels.tolist(), counts.tolist()))
print(f"class 0: {class_counts.get(0, 0)}\nclass 1: {class_counts.get(1, 0)}")

class 0: 5724
class 1: 788


# K-Nearest Neighbors

In [10]:
# Load the two CSVs used in the K-Nearest Neighbors section. `dataset2` is
# later split into train/test; `dataset2_unknown` is later handed to the
# model via change_test_set() for prediction.
# Paths are relative, so the notebook must be run from the directory that
# contains `DatasetModified/`.
dataset2 = pd.read_csv('DatasetModified/dataset2.csv')
dataset2_unknown = pd.read_csv('DatasetModified/dataset2_unknown.csv')

In [11]:
# Split dataset2 with train_ratio=0.8 using the project's own
# train_test_split.
# This rebinds `train_set` / `test_set`, which previously held the dataset1
# split; Decision Tree cells re-run after this point would pick up these
# frames instead.
# NOTE(review): no seed is passed; if the split is random, results below are
# not reproducible between runs — confirm.
train_set, test_set = train_test_split(dataset2, train_ratio=0.8)

In [12]:
# Build the k-NN model (k=10) on the dataset2 split and display its accuracy.
# Presumably measured on the `test_set` passed to the constructor — confirm
# against KNearestNeighbor.get_accuracy.
# NOTE(review): the recorded output is exactly 100.0. Before trusting it,
# check that train and test rows do not overlap and that the label column is
# not being used as a feature.
knn = KNearestNeighbor(train_set, test_set, k=10)
knn.get_accuracy()

100.0

In [13]:
# Point the k-NN model at the unlabelled rows and classify them.
# NOTE(review): change_test_set() presumably replaces the model's stored test
# set, so calling get_accuracy() on this model afterwards would no longer
# refer to the original hold-out split — confirm.
knn.change_test_set(dataset2_unknown)
y_predicted = knn.predict()

# Count by label instead of by position. np.unique only returns labels that
# actually occur, so `counts[1]` raises IndexError when a single class is
# predicted, and `counts[0]` would be reported as "class 0" even if every
# prediction were class 1. Using a named variable also avoids rebinding `_`,
# which IPython uses for the last cell output.
# Assumes the predicted labels are the numeric values 0 and 1.
labels, counts = np.unique(y_predicted, return_counts=True)
class_counts = dict(zip(labels.tolist(), counts.tolist()))
print(f"class 0: {class_counts.get(0, 0)}\nclass 1: {class_counts.get(1, 0)}")

class 0: 800
class 1: 825


# Naive Bayes

In [14]:
# Load the two CSVs used in the Naive Bayes section. `dataset3` is later
# split into train/test; `dataset3_unknown` is later handed to the model via
# change_test_set() for prediction.
# Paths are relative, so the notebook must be run from the directory that
# contains `DatasetModified/`.
dataset3 = pd.read_csv('DatasetModified/dataset3.csv')
dataset3_unknown = pd.read_csv('DatasetModified/dataset3_unknown.csv')

In [15]:
# Split dataset3 with train_ratio=0.8 using the project's own
# train_test_split.
# This rebinds `train_set` / `test_set`, which previously held the dataset2
# split; earlier model cells re-run after this point would pick up these
# frames instead.
# NOTE(review): no seed is passed; if the split is random, results below are
# not reproducible between runs — confirm.
train_set, test_set = train_test_split(dataset3, train_ratio=0.8)

In [16]:
# Build the Naive Bayes model on the dataset3 split and display its accuracy.
# The displayed value looks like a percentage (0-100) rather than a fraction.
# Presumably measured on the `test_set` passed to the constructor — confirm
# against NaiveBayes.get_accuracy.
naive_bayes = NaiveBayes(train_set, test_set)
naive_bayes.get_accuracy()

77.55102040816327

In [17]:
# Point the Naive Bayes model at the unlabelled rows and classify them.
# NOTE(review): change_test_set() presumably replaces the model's stored test
# set, so calling get_accuracy() on this model afterwards would no longer
# refer to the original hold-out split — confirm.
naive_bayes.change_test_set(dataset3_unknown)
y_predicted = naive_bayes.predict()

# Count by label instead of by position. np.unique only returns labels that
# actually occur, so `counts[1]` raises IndexError when a single class is
# predicted, and `counts[0]` would be reported as "class 0" even if every
# prediction were class 1. Using a named variable also avoids rebinding `_`,
# which IPython uses for the last cell output.
# Assumes the predicted labels are the numeric values 0 and 1.
labels, counts = np.unique(y_predicted, return_counts=True)
class_counts = dict(zip(labels.tolist(), counts.tolist()))
print(f"class 0: {class_counts.get(0, 0)}\nclass 1: {class_counts.get(1, 0)}")

class 0: 35
class 1: 26
