In [58]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import random
from pprint import pprint

In [59]:
%matplotlib inline

In [60]:
df = pd.read_csv('wine-dataset.csv')

In [61]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,0
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,0
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,0


# Train Test Split


Generally after split we get, X_train, X_test, Y_train, Y_test from sklearn, we need to seperate the data and its label, and then for test random splitting. We will create a function, which takes df and test size, and return the respective outputs.

In [62]:
def split (df, test_size):
    msk = np.random.rand(len(df))<1-test_size
    train_df = df[msk]
    test_df = df[~msk]
    return test_df, train_df

In [63]:
random.seed(0)
train_df, test_df = split(df, test_size=0.2)

# Cross- validation

def cross_val (df, k):
    for i in k:
        msk = df.index.tolist()
        x = len(msk)//k
        l = (i-1)*x 
        m = (i)*x
        csk = msk[l:m]
        test_df = df.loc[csk]
        train_df = df.drop(csk,axis =0)

    
    
    
    

In [65]:
def check_purity(data):
    
    label_column = data[:, -1]
    unique_classes = np.unique(label_column)

    if len(unique_classes) == 1:
        return True
    else:
        return False

In [66]:
def classify_data(data):
    
    label_column = data[:, -1]
    unique_classes, counts_unique_classes = np.unique(label_column, return_counts=True)

    index = counts_unique_classes.argmax()
    classification = unique_classes[index]
    
    return classification
    

In [46]:
def potential_splits(data):
    potential_splits = {}
    row, columns = data.shape

    for column in range (columns-1):
    
        potential_splits[column]= []
        x= np.unique(data[:,column])
    
        for i in range(len(x)):
            if i==0:
                pass
            else:
                current_val= x[i]
                previous_val= x[i-1]
                potential_split= (current_val+previous_val)/2
                potential_splits[column].append(potential_split)

    return potential_splits

    
    

In [67]:
def split_data(data, split_column, split_value):
    x= data[:,split_column]
    data_1= data[x<=split_value]
    data_2= data[x>split_value]
    return data_1,data_2

In [68]:
def calculate_entropy(data):
    x,y= np.unique(data[:,-1],return_counts=True)
    p= y/y.sum()
    e=sum(p*-np.log2(p))
    return e

In [69]:
def overall_entropy(data_1,data_2):
    e_1= calculate_entropy(data_1)
    e_2= calculate_entropy(data_2)
    p_1=len(data_1)/(len(data_1)+len(data_2))
    p_2=len(data_2)/(len(data_1)+len(data_2))
    o= e_1*p_1+e_2*p_2
    return o

    

In [57]:
def best_split(data,potential_splits):
    overall_entro = 10000
    for column_index in potential_splits:
        for v in potential_splits[column_index]:
            data_1,data_2= split_data(data,column_index,v)
            current_overall_entropy= overall_entropy(data_1,data_2)
            if current_overall_entropy< overall_entro:
                overall_entro = current_overall_entropy
                best_split_column = column_index
                best_split_value = v
            
    return best_split_column,best_split_value

In [80]:
def decision_tree_algo(data):
    
    
    
    if check_purity(data):
        classification= classify_data(data)
        return classification
    else:
        
        
        pot_splits = potential_splits(data)
        bst_split_column, bst_split_value = best_split(data,pot_splits)
        Data_1, Data_2 = split_data(data, bst_split_column, bst_split_value)
        
        question = "{} <= {}".format(bst_split_column, bst_split_value)
        sub_tree = {question: []}
        
        yes_ans = decision_tree_algo(Data_1)
        no_ans = decision_tree_algo(Data_2)
        
        sub_tree[question].append(yes_ans)
        sub_tree[question].append(no_ans)
        
        return sub_tree
        

In [83]:
tree = decision_tree_algo(train_df.values)
print(tree)

{'10 <= 10.625': [{'1 <= 0.195': [{'8 <= 3.05': [{'2 <= 0.31': [1.0, {'0 <= 8.149999999999999': [0.0, 1.0]}]}, {'9 <= 0.81': [{'6 <= 145.0': [{'6 <= 105.5': [{'10 <= 10.350000000000001': [{'7 <= 0.99688': [0.0, 1.0]}, {'0 <= 7.2': [1.0, 0.0]}]}, {'8 <= 3.2699999999999996': [{'4 <= 0.036500000000000005': [1.0, {'3 <= 11.75': [0.0, {'3 <= 14.3': [1.0, 0.0]}]}]}, {'9 <= 0.42000000000000004': [{'5 <= 34.0': [0.0, 1.0]}, 1.0]}]}]}, {'4 <= 0.0565': [0.0, {'3 <= 8.95': [1.0, 0.0]}]}]}, 1.0]}]}, {'8 <= 3.4450000000000003': [{'2 <= 0.265': [0.0, {'1 <= 0.315': [{'6 <= 200.5': [{'3 <= 14.825': [{'3 <= 13.3': [{'10 <= 10.350000000000001': [{'6 <= 199.0': [{'9 <= 0.555': [0.0, {'8 <= 3.1550000000000002': [{'2 <= 0.315': [0.0, 1.0]}, 0.0]}]}, 1.0]}, {'5 <= 13.0': [{'4 <= 0.0395': [1.0, 0.0]}, {'2 <= 0.275': [1.0, {'9 <= 0.565': [0.0, {'2 <= 0.43': [{'2 <= 0.315': [{'2 <= 0.305': [{'6 <= 109.0': [1.0, 0.0]}, 1.0]}, 0.0]}, 1.0]}]}]}]}]}, {'2 <= 0.38': [{'0 <= 8.25': [0.0, 1.0]}, {'9 <= 0.459999999999