In [1]:
import numpy as np
import pandas as pd
import scipy.stats as sts
from random import randrange
import csv
import random
import math
import operator

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/OHrydko/Machine-Learning-Labs/master/lab2/MP-07-Hrydko.csv', 
                 header=None, names=['x0', 'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'y_s'], sep=';',engine='python')

df.head()

Unnamed: 0,x0,x1,x2,x3,x4,x5,x6,y_s
0,F,yes,no,no,4,1,3,11
1,F,no,no,yes,5,1,3,11
2,F,yes,no,yes,4,3,3,12
3,F,no,yes,yes,3,1,5,14
4,F,no,no,no,4,2,5,13


In [3]:
df = df.replace({'yes': 1, 'no': 0})
df = pd.get_dummies(df)
df['y'] = df['y_s']
df = df.drop(['y_s'], axis=1)
df.head()

Unnamed: 0,x1,x2,x3,x4,x5,x6,x0_F,x0_M,y
0,1,0,0,4,1,3,1,0,11
1,0,0,1,5,1,3,1,0,11
2,1,0,1,4,3,3,1,0,12
3,0,1,1,3,1,5,1,0,14
4,0,0,0,4,2,5,1,0,13


In [14]:
# # Split a dataset into k folds
# def cross_validation_split(dataset, n_folds):
#     dataset_split = list()
#     dataset_copy = list(dataset)
#     fold_size = int(len(dataset) / n_folds)
#     for i in range(n_folds):
#         fold = list()
#         while len(fold) < fold_size:
#             index = randrange(len(dataset_copy))
#             fold.append(dataset_copy.pop(index))
#         dataset_split.append(fold)
#     return dataset_split

# Calculate accuracy percentage
def accuracy_metric(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    return correct / float(len(actual)) * 100.0

# Evaluate an algorithm using a cross validation split
def cart(dataset, algorithm, n_folds, *args):
    scores = list()
    train = dataset[:50]
    test = dataset[50:]
  
    predicted = algorithm(train, test, *args)
    actual = [row[-1] for row in test]
    accuracy = accuracy_metric(actual, predicted)
    scores.append(accuracy)
    return scores

def test_split(index, value, dataset):
    left, right = list(), list()
    for row in dataset:
        if row[index] < value:
            left.append(row)
        else:
            right.append(row)
    return left, right

# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
    # count all samples at split point
    n_instances = float(sum([len(group) for group in groups]))
    # sum weighted Gini index for each group
    gini = 0.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue
        score = 0.0
        # score the group based on the score for each class
        for class_val in classes:
            p = [row[-1] for row in group].count(class_val) / size
            score += p * p
        # weight the group score by its relative size
        gini += (1.0 - score) * (size / n_instances)
    return gini

# Select the best split point for a dataset
def get_split(dataset):
    class_values = list(set(row[-1] for row in dataset))
    b_index, b_value, b_score, b_groups = 999, 999, 999, None
    for index in range(len(dataset[0])-1):
        for row in dataset:
            groups = test_split(index, row[index], dataset)
            gini = gini_index(groups, class_values)
#             print('X%d < %.3f Gini=%.3f' % ((index+1), row[index], gini))
            if gini < b_score:
                b_index, b_value, b_score, b_groups = index, row[index], gini, groups
    return {'index':b_index, 'value':b_value, 'groups':b_groups}

# Create a terminal node value
def to_terminal(group):
    outcomes = [row[-1] for row in group]
    return max(set(outcomes), key=outcomes.count)

# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
    left, right = node['groups']
    del(node['groups'])
    # check for a no split
    if not left or not right:
        node['left'] = node['right'] = to_terminal(left + right)
        return
    # check for max depth
    if depth >= max_depth:
        node['left'], node['right'] = to_terminal(left), to_terminal(right)
        return
    # process left child
    if len(left) <= min_size:
        node['left'] = to_terminal(left)
    else:
        node['left'] = get_split(left)
        split(node['left'], max_depth, min_size, depth+1)
    # process right child
    if len(right) <= min_size:
        node['right'] = to_terminal(right)
    else:
        node['right'] = get_split(right)
        split(node['right'], max_depth, min_size, depth+1)

# Build a decision tree
def build_tree(train, max_depth, min_size):
    root = get_split(train)
    split(root, max_depth, min_size, 1)
    return root

# Print a decision tree
def print_tree(node, depth=0):
    if isinstance(node, dict):
        print('%d %s X%d = %.3f' % ((depth, depth*'  ', (node['index']+1), node['value'])))
        print_tree(node['left'], depth+1)
        print_tree(node['right'], depth+1)
    else:
        print('%d %s[%s]' % ((depth, depth*'  ', node)))

# Make a prediction with a decision tree
def predict(node, row):
    if row[node['index']] < node['value']:
        if isinstance(node['left'], dict):
            return predict(node['left'], row)
        else:
            return node['left']
    else:
        if isinstance(node['right'], dict):
            return predict(node['right'], row)
        else:
            return node['right']
        

    
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
    tree = build_tree(train, max_depth, min_size)
    print_tree(tree)
    predictions = list()
    for row in test:
        prediction = predict(tree, row)
        predictions.append(prediction)
    print(predictions)
    print(np.array(test)[:,-1])
    return(predictions)
        
n_folds = 5
max_depth = 5
min_size = 1
dataset = []
for index, row in df.iterrows():
    dataset.append(list(row))
scores = cart(dataset, decision_tree, n_folds, max_depth, min_size)

print('Scores: %s' % scores)
print('Mean Accuracy: %.3f%%' % (sum(scores)/float(len(scores))))

0  X6 = 3.000
1    X4 = 4.000
2      X4 = 3.000
3       [11]
3        X1 = 0.000
4         [14]
4         [14]
2      X1 = 1.000
3        X4 = 5.000
4          X6 = 2.000
5           [17]
5           [17]
4         [12]
3        X2 = 1.000
4          X3 = 1.000
5           [13]
5           [10]
4         [12]
1    X5 = 3.000
2      X7 = 1.000
3        X4 = 5.000
4          X5 = 2.000
5           [15]
5           [12]
4          X2 = 1.000
5           [12]
5           [13]
3        X2 = 1.000
4          X5 = 2.000
5           [11]
5           [13]
4          X6 = 5.000
5           [10]
5           [11]
2      X4 = 5.000
3        X4 = 3.000
4          X4 = 2.000
5           [12]
5           [13]
4          X1 = 1.000
5           [12]
5           [12]
3        X2 = 1.000
4          X1 = 0.000
5           [11]
5           [11]
4          X1 = 0.000
5           [10]
5           [10]
[12, 11, 11, 12, 12, 12, 17, 15, 15, 11, 13, 10, 12, 12, 12]
[13 16  9 12 13 12 15 16 14 16 16 16 10 13 12]
S

In [12]:
def euclideanDistance(instance1, instance2, length):
    distance = 0
    for x in range(length):
        distance += pow((instance1[x] - instance2[x]), 2)
    return math.sqrt(distance)

def get_neighbors(train, test_row, num_neighbors):
    distances = []
    for train_row in train:
        dist = np.sqrt(np.sum((np.array(test_row)[:-1] - np.array(train_row)[:-1])**2))
        distances.append((train_row, dist))
    distances.sort(key=lambda tup: tup[1])
    neighbors = []
    for i in range(num_neighbors):
        neighbors.append(distances[i][0])
    return neighbors

def getResponse(neighbors):
    classVotes = {}
    for x in range(len(neighbors)):
        response = neighbors[x][-1]
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return sortedVotes[0][0]

def getAccuracy(testSet, predictions):
    correct = 0
    for x in range(len(testSet)):
        if testSet[x][-1] == predictions[x]:
            correct += 1
    return (correct/float(len(testSet))) * 100.0
    
def main():
    # prepare data
    train = dataset[:60]
    test = dataset[60:]
    print( 'Train set: ' + repr(len(train)))
    print('Test set: ' + repr(len(test)))
    # generate predictions
    predictions=[]
    neighbors = [3, 4, 5, 6, 7]
    for k in neighbors:
        print('k = {}'.format(k))
        for x in range(len(test)):
            neighbors = get_neighbors(train, test[x], k)
            result = getResponse(neighbors)
            predictions.append(result)
            print('> predicted=' + repr(result) + ', actual=' + repr(test[x][-1]))
        accuracy = getAccuracy(test, predictions)
        print('Accuracy: ' + repr(accuracy) + '%')
    
main()

Train set: 60
Test set: 5
k = 3
> predicted=12, actual=16
> predicted=7, actual=16
> predicted=15, actual=10
> predicted=12, actual=13
> predicted=12, actual=12
Accuracy: 20.0%
k = 4
> predicted=12, actual=16
> predicted=7, actual=16
> predicted=12, actual=10
> predicted=12, actual=13
> predicted=12, actual=12
Accuracy: 20.0%
k = 5
> predicted=12, actual=16
> predicted=11, actual=16
> predicted=12, actual=10
> predicted=12, actual=13
> predicted=14, actual=12
Accuracy: 20.0%
k = 6
> predicted=12, actual=16
> predicted=11, actual=16
> predicted=12, actual=10
> predicted=12, actual=13
> predicted=14, actual=12
Accuracy: 20.0%
k = 7
> predicted=12, actual=16
> predicted=12, actual=16
> predicted=12, actual=10
> predicted=12, actual=13
> predicted=13, actual=12
Accuracy: 20.0%
