In [1]:
import numpy as np
import csv
import math
from sklearn import datasets
import copy

# Load Classification Player Data

data = []

#Load Data
with open('players_stats.csv') as file:
    csv_reader = csv.reader(file, delimiter=',')
    for row in csv_reader:
        data.append(row)
player_data_raw = np.array(data)

# Remove labels
player_data_uncentered_unnormalized = player_data_raw[1:].T[1:-1].T.astype(np.float)

# Mean Center Data
player_data_unnormalized = player_data_uncentered_unnormalized - np.mean(player_data_uncentered_unnormalized, axis=0)

# Normalized data so there wouldnt be any overflow errors
player_data = player_data_unnormalized / np.linalg.norm(player_data_unnormalized, axis=0)

# Get labels
player_labels = player_data_raw.T[-1].T[1:]

In [2]:
# Load Classification Iris Data

iris = datasets.load_iris()
iris_data_uncentered_unnormalized = iris.data

# Mean Center
iris_data_unnormalized = iris_data_uncentered_unnormalized - np.mean(iris_data_uncentered_unnormalized, axis=0)

# Normalize data
iris_data = iris_data_unnormalized / np.linalg.norm(iris_data_unnormalized, axis=0)

# Get Labels
iris_label = iris.target

In [3]:
# Load Clustering Seed Data

data = []

#Load Data from file
with open('Seed_Data.csv') as file:
    csv_reader = csv.reader(file, delimiter=',')
    for row in csv_reader:
        data.append(row)
seed_data_raw = np.array(data)

# Remove Expected Clusters
seed_data_uncentered = seed_data_raw[1:].T[:-1].T.astype(np.float)

# Mean Center Data
seed_data = seed_data_uncentered - np.mean(seed_data_uncentered, axis=0)

# Get Expected Clusters
seed_label = seed_data_raw.T[-1].T[1:]

In [4]:
# Learn the Multivariate Gaussian distribution for each class

def naive_bayes_learn(X, Y):
    #Determine the classes we are classifying
    classes = []
    data = []
    for i in Y:
        if i not in classes:
            classes.append(i)
            data.append([])
    
    #Split up data into arrays by class
    for index, info in enumerate(X):
        data[classes.index(Y[index])].append(info)
    
    #Calculate mean and covariance by class
    mean = []
    cov = []
    for cls in data:
        mean.append(np.mean(cls, axis=0))
        cov.append(np.cov(np.array(cls).T))
    
    return classes, mean, cov


In [5]:
# Calculate the Multivariate Gaussian based on the mean vector, covariance vector, and data sample

def multi_gauss(u, E, x):
    sub = np.subtract(x, u)
    top = math.exp(-0.5 * (sub.T @ np.linalg.inv(E) @ sub))
    bottom = math.sqrt(np.power(2*np.pi, len(x)) * abs(np.linalg.det(E)))
    return top / bottom

In [6]:
# Predict a set of X data samples given the outputs of the learn function

def naive_bayes_predict(classes, mean, cov, X):
    # Check each class to see if it has highest probability
    pred = []
    for data in X:
        preds = []
        # For every element, test every class
        for index, cls in enumerate(classes):
            preds.append(multi_gauss(mean[index], cov[index], data))
        # Append highest probability class, aka the prediction
        pred.append(classes[np.argmax(preds)])
    return pred

In [7]:
# Player classifier trained on all data
player_classes, player_mean, player_cov = naive_bayes_learn(player_data, player_labels)
pred = naive_bayes_predict(player_classes, player_mean, player_cov, player_data)
match = 0
missed = 0
for index in range(0, len(pred)):
    if pred[index] == player_labels[index]:
        match += 1
    else:
        missed += 1
print("Match, Missed: ", match, missed)

Match, Missed:  325 165


I performed classification on the entire dataset to determine if the algorithm was working, and to determine how well it would perform for this set of data. I also tried performing classification with un-normalized data, but this resulted in an overflow error in the exponent of the multivariate gaussian, so I opted to normalize the data in order to fix this.

In [8]:
# Iris classifier trained on all data
iris_classes, iris_mean, iris_cov = naive_bayes_learn(iris_data, iris_label)
pred = naive_bayes_predict(iris_classes, iris_mean, iris_cov, iris_data)
match = 0
missed = 0
for index in range(0, len(pred)):
    if pred[index] == iris_label[index]:
        match += 1
    else:
        missed += 1
print("Match, Missed: ", match, missed)

Match, Missed:  147 3


In [9]:
# Cluster dataset X into k clusters using k_means clustering

def k_means(k, X):
    
    #Create k clusters and randomly initalize them
    clusters = []
    for clus in range(0, k):
        clusters.append([])
    for index, x in enumerate(X):
        clusters[index % k].append(x)
    
    #Start Kmeans loop
    converged = False
    while not converged:
        # Keep track of previous clusters
        prev_clusters = copy.deepcopy(clusters)

        # Calculate centroid of clusters
        cluster_means = []
        for cluster in prev_clusters:            
            cluster_means.append(np.mean(cluster, axis=0))

        # Initalize new blank clusters
        clusters = []
        for clus in range(0, k):
            clusters.append([])
        
        # Reset Convergence Test
        converged = True
        
        # Move all elements to cluster with closest centroid
        for index, cluster in enumerate(prev_clusters):
            for data in cluster:
#                 print("Data: ", data)
                dist = []
                data = np.array(data)
                # Calculate distance from all centroids
                for mean in cluster_means:
                    dist.append(np.linalg.norm(mean - data))
#                 print("Distances: ", dist)
#                 print(dist)
                new_index = np.argmin(dist)
                clusters[new_index].append(data)
#                 print("Data moved to: ", new_index)
                # If any datapoint was moved to a different cluster, no convergence
                if index != new_index:
                    converged = False

    return clusters

In [10]:
# Cluster the seed database

predicted_clusters = k_means(3, seed_data)

#Determine the clusters
clusters = []
data = []
for i in seed_label:
    if i not in clusters:
        clusters.append(i)
        data.append([])
#Split up data into arrays by cluster
for index, info in enumerate(seed_data):
    data[clusters.index(seed_label[index])].append(info)
    
# Compare all given clustes with estimated clusters in order to determine performance
cluster_map = []
for known_index, cluster in enumerate(data):
    # Compare known cluster against all possible clusters
    max_matches = 0
    best_match = 0
    for index, pred in enumerate(predicted_clusters):
        matches = 0
        # Check every element if it is in the predicted cluster
        for data1 in cluster:
            for every in pred:
                data1 = np.array(data1)
                every = np.array(every)
                if np.array_equal(data1, every):
                    matches += 1
                    break
#         print("Cluster: ", index, "matched: ", matches, "times with given Cluster:", known_index)
        if matches > max_matches:
            max_matches = matches
            best_match = index
    cluster_map.append(best_match)
#     print()

# Print the matches
for index, i in enumerate(cluster_map):
    print("Given Cluster: ", index, "maps with estimated Cluster: ", i)
    matches = 0
    for data1 in data[index]:
        for every in predicted_clusters[i]:
            data1 = np.array(data1)
            every = np.array(every)
            if np.array_equal(data1, every):
                matches += 1
                break
    print("Num Matched: ", matches, "Num Estimated: ", len(predicted_clusters[i]), "Num Given: ", len(data[index]))
    print()

Given Cluster:  0 maps with estimated Cluster:  1
Num Matched:  57 Num Estimated:  67 Num Given:  70

Given Cluster:  1 maps with estimated Cluster:  0
Num Matched:  60 Num Estimated:  61 Num Given:  70

Given Cluster:  2 maps with estimated Cluster:  2
Num Matched:  70 Num Estimated:  82 Num Given:  70



In [11]:
# K fold crossvalidation on Player Dataset

# add one for validation
k = 5 + 1

# Create k clusters and randomly initalize them
data_folds = []
label_folds = []
for fold in range(0, k):
    data_folds.append([])
    label_folds.append([])

for index, x in enumerate(player_data):
    data_folds[index % k].append(x)
    label_folds[index % k].append(player_labels[index])

# Take out a validation fold
data_val = data_folds.pop(0)
label_val = label_folds.pop(0)

best_mean = None
best_cov = None
best_acc = 0
    
for index in range(0, k - 1):
    # Create training and testing folds
    temp_data = copy.deepcopy(data_folds)
    test_data = temp_data.pop(index)
    train_data = []
    for fold in temp_data:
        for data in fold:
            train_data.append(data)
    
    
    temp_label = copy.deepcopy(label_folds)
    test_label = temp_label.pop(index)
    train_label = []
    for fold in temp_label:
        for data in fold:
            train_label.append(data)
            
            
    # Player classifier trained on all data
    player_classes, player_mean, player_cov = naive_bayes_learn(train_data, train_label)
    pred = naive_bayes_predict(player_classes, player_mean, player_cov, test_data)
    match = 0
    missed = 0
    for index in range(0, len(pred)):
        if pred[index] == test_label[index]:
            match += 1
        else:
            missed += 1
    acc = match / (match + missed)
    print("Match, Missed: ", match, missed, "Accuracy: ", acc)
    if acc > best_acc:
        best_mean = player_mean
        best_cov = player_cov
print()

print("Validating Data")
pred = naive_bayes_predict(player_classes, best_mean, best_cov, data_val)
match = 0
missed = 0
for index in range(0, len(pred)):
    if pred[index] == label_val[index]:
        match += 1
    else:
        missed += 1
acc = match / (match + missed)
print("Accuracy: ", acc)
    

Match, Missed:  35 47 Accuracy:  0.4268292682926829
Match, Missed:  31 51 Accuracy:  0.3780487804878049
Match, Missed:  36 46 Accuracy:  0.43902439024390244
Match, Missed:  44 37 Accuracy:  0.5432098765432098
Match, Missed:  39 42 Accuracy:  0.48148148148148145

Validating Data
Accuracy:  0.4268292682926829


In [12]:
# K fold crossvalidation on Player Dataset with one validation set

# add one for validation
k = 5 + 1

# Create k clusters and randomly initalize them
data_folds = []
label_folds = []
for fold in range(0, k):
    data_folds.append([])
    label_folds.append([])

for index, x in enumerate(iris_data):
    data_folds[index % k].append(x)
    label_folds[index % k].append(iris_label[index])

# Take out a validation fold
data_val = data_folds.pop(0)
label_val = label_folds.pop(0)

best_mean = None
best_cov = None
best_acc = 0

for index in range(0, k - 1):
    # Create training and testing folds
    temp_data = copy.deepcopy(data_folds)
    test_data = temp_data.pop(index)
    train_data = []
    for fold in temp_data:
        for data in fold:
            train_data.append(data)
    
    
    temp_label = copy.deepcopy(label_folds)
    test_label = temp_label.pop(index)
    train_label = []
    for fold in temp_label:
        for data in fold:
            train_label.append(data)
            
            
    # Player classifier trained on all data
    iris_classes, iris_mean, iris_cov = naive_bayes_learn(train_data, train_label)
    pred = naive_bayes_predict(iris_classes, iris_mean, iris_cov, test_data)
    match = 0
    missed = 0
    for index in range(0, len(pred)):
        if pred[index] == test_label[index]:
            match += 1
        else:
            missed += 1
    acc = match / (match + missed)
    print("Match, Missed: ", match, missed, "Accuracy: ", acc)
    if acc > best_acc:
        best_mean = iris_mean
        best_cov = iris_cov
print()

print("Validating Data")
pred = naive_bayes_predict(iris_classes, best_mean, best_cov, data_val)
match = 0
missed = 0
for index in range(0, len(pred)):
    if pred[index] == label_val[index]:
        match += 1
    else:
        missed += 1
acc = match / (match + missed)
print("Accuracy: ", acc)

Match, Missed:  25 0 Accuracy:  1.0
Match, Missed:  24 1 Accuracy:  0.96
Match, Missed:  25 0 Accuracy:  1.0
Match, Missed:  24 1 Accuracy:  0.96
Match, Missed:  24 1 Accuracy:  0.96

Validating Data
Accuracy:  0.96
