Importing and Preprocessing the Adult Autism Data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from scipy import stats 
from scipy.io.arff import loadarff
from IPython.display import clear_output

#Autism
Autism_Adult, meta = loadarff('Autism-Adult-Data.arff')

Autism_Adult_data = np.array(Autism_Adult[meta.names()[0]].astype(int, copy = True)).reshape(704,1)

# Add every integer input vector to Eye_State_data
for i in range(1,11):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(int, copy = True)]

# Add every string input vector to Eye_State_data
for i in range(11,17):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(str, copy = True)]

# Add integer input vector to Eye_State_data, 18th column
Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[17]]).astype(int, copy = True)]

for i in range(18,len(meta.names())):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(str, copy = True)]

Autism_frame = pd.DataFrame(data = Autism_Adult_data, columns = meta.names()[:])
Autism_frame.replace('?',np.NaN, inplace = True)

autism_mode = Autism_frame.mode(axis=0)
Autism_frame[meta.names()[12]].replace(np.NaN, autism_mode[meta.names()[12]].values[0],inplace = True)
Autism_frame[meta.names()[19]].replace(np.NaN, autism_mode[meta.names()[19]].values[0],inplace = True)

from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
for i in range(11,len(meta.names())):
    if i != 17: 
        Autism_frame[meta.names()[i]] = labelEncoder_X.fit_transform(Autism_frame[meta.names()[i]])

autism_data = Autism_frame.iloc[:,:-1].values
autism_labels = Autism_frame.iloc[:,20].values
autism_data = autism_data.astype(int, copy = True)
autism_labels = autism_labels.astype(int, copy = True)

### Finding the top r principal components to obtain 100% of the original data information and transforming the data into the r-dimension

In [2]:
#Define PCA Function
def pca(data, num_of_prin_comp, data_orientation = "row"):
    num_of_data = len(data)
    dim_of_data = len(data[0])
    if data_orientation == "row":
        transposed_data = np.transpose(data) #Changes dataset so that data samples are column vectors
    mean = transposed_data.mean(1)  #Mean Vector
    centered_data = np.zeros((dim_of_data,num_of_data))

    for i in range(num_of_data):
        centered_data[:,i] = transposed_data[:,i] - mean  #Centering Data

    svd_u, svd_sigma, svd_v = np.linalg.svd(centered_data, full_matrices = True)  # Singular Value Decompostion

    u = np.zeros((dim_of_data,num_of_prin_comp))
    s = np.zeros((num_of_prin_comp,num_of_prin_comp))

    for i in range(dim_of_data):
        for j in range(num_of_prin_comp):
            u[i,j] = svd_u[i,j] #First r singular vectors of U
    for i in range(num_of_prin_comp):
        s[i,i] = svd_sigma[i] #Largest r singular values
    
    w = np.matrix(u)*np.matrix(s) #Principal Component Matrix with Principal Axes as Columns
    for i in range(num_of_prin_comp):
        w[:,i] = w[:,i]/np.linalg.norm(w[:,i]) #Normalizing Each Principal Component


    transformed_data = np.transpose(np.transpose(w)*centered_data) #Feature Vectors
    return transformed_data

In [3]:
#Finding the smallest number of principal components for .99 Representation of Original Data:
m = len(autism_data[0])
n = len(autism_data)
centered = np.zeros((m,n))
for i in range(n):
    #Centering Training Data
    centered[:,i] = np.transpose(autism_data)[:,i] - np.transpose(autism_data).mean(1)
training_data_norm_squared = np.square(np.linalg.norm(centered))
svd_u, svd_sigma, svd_v = np.linalg.svd(centered, full_matrices = True)  # SVD

r = 0 #Top r principal components

for i in range(len(svd_sigma)):
    sum = 0
    representation = 0;
    for j in range(i+1):
        sum += np.square(svd_sigma[j])
    representation = sum/training_data_norm_squared
    if representation >= 1:
        r = i+1
        print("100% of the Original Data is represented by the top", r, "principal components")
        break

100% of the Original Data is represented by the top 4 principal components


In [4]:
transformed_autism = pca(autism_data, r)

### Defining the decision trees and the metrics used. Also defining the cross validation function and training the data with 10 folds.

In [5]:
#Gini Index Metric for Classification
def gini_index(region_labels):
    return 1-region_labels.tolist().count(stats.mode(region_labels)[0])/len(region_labels)

def mean_squared_error(region_labels):
    error = 0
    mean = np.mean(region_labels)
    for i in range(len(region_labels)):
        error += np.square(region_labels[i]-mean)
    return error/len(region_labels)

def predict(region_labels,type_of_problem="classification"):
    if type_of_problem == "classification":
        return stats.mode(region_labels)[0]
    elif type_of_problem == "regression":
        return np.mean(region_labels)
    return np.NaN

class Node:
    def __init__(self):
        self.parent = None #Contains the original region this region was split from
        self.left = None #Contains the regions that are split from this region
        self.right = None
        self.prediction = np.NaN #Contains either the majority vote or mean of the data in the region
        self.data = None #Contains the dataset of the region
        self.labels = None #Contains the labels of the dataset of the region
        self.split_axis = np.NaN #The axis of the training data the split was made on
        self.split_value = np.NaN #The value on the axis where the training data was split on
        
def tree_split(region_node, metric, type_of_problem):
    data, labels = region_node.data, region_node.labels
    left_node, right_node = Node(), Node()
    splits, error = np.empty((len(data[0]), 15)), np.empty((len(data[0]), 15))
    
    for i in range(len(data[0])):
        effective_range = np.delete(data[:,i],np.concatenate((np.argpartition(data[:,i], -5)[-5:],np.argpartition(data[:,i], 5)[:5])))
        dmin,mean,dmax = np.min(effective_range), np.mean(effective_range), np.max(effective_range)
        splits[i] = np.delete(np.linspace(dmin, dmax, num=17),[0,16])
        
        for j in range(len(splits)):
            first, second = list(), list()
            for k in range(len(data)):
                if (data[k,i] < splits[i,j]):
                    first.append(labels[k])
                else:
                    second.append(labels[k])
            if len(first) == 0 or len(second) == 0:
                error[i,j] = np.inf
            else:
                error[i,j] = metric(np.array(first)) + metric(np.array(second))
                
    region_node.split_axis, s = np.unravel_index(error.argmin(), error.shape)
    region_node.split_value = splits[region_node.split_axis, s]
    
    left_node.data, right_node.data = np.empty((0,len(data[0]))),np.empty((0,len(data[0])))
    left_node.labels, right_node.labels = np.empty((0,1)),np.empty((0,1))

    for k in range(len(data)):
        if (data[k, region_node.split_axis] < region_node.split_value):
            left_node.data = np.append(left_node.data, [data[k]], axis=0)
            left_node.labels = np.append(left_node.labels, [labels[k]])
        else:
            right_node.data = np.append(right_node.data, [data[k]], axis=0)
            right_node.labels = np.append(right_node.labels, [labels[k]])
            
    left_node.prediction = predict(left_node.labels,type_of_problem)
    right_node.prediction = predict(right_node.labels,type_of_problem)
    
    region_node.left, region_node.right = left_node, right_node
    left_node.parent, right_node.parent = region_node, region_node
    
    if len(left_node.labels) > 50:
        tree_split(left_node,gini_index,type_of_problem)
    if len(right_node.labels) > 50:
        tree_split(right_node,gini_index,type_of_problem)
            
#Decision Tree Algorithm
def decision_tree(training_data, training_labels, test_data, type_of_problem="classification"):
    root = Node()
    root.data, root.labels = training_data, training_labels
    tree_split(root,gini_index,type_of_problem)
    
    predicted_values = np.empty((len(test_data), 1))
    classification_rate = 0
    temp_node = root
    
    for i in range(len(test_data)):
        while (temp_node.left != None and temp_node.right != None):
            if test_data[i,temp_node.split_axis] < temp_node.split_value:
                temp_node = temp_node.left
            else:
                temp_node = temp_node.right
        predicted_values[i] = temp_node.prediction
    return predicted_values    

In [6]:
from sklearn.ensemble import RandomForestClassifier
def random_forests(train, train_lbls, test):
    clf = RandomForestClassifier()
    clf.fit(train,train_lbls)
    return clf.predict(test)

In [7]:
def cross_validation(k, split_data, split_labels, classifier_func = decision_tree):
    classification_rate = np.zeros((k)) #Array to hold classification rate of each fold
    
    for i in range(k):
        #Seperating split data into training and test sets (4 splits for training, 1 for testing)
        train = np.concatenate(np.delete(split_data, i))
        train_lbls = np.concatenate(np.delete(split_labels, i))
        
        test = split_data[i]
        test_lbls = split_labels[i]
        
        #Obtaining classified test labels
        classifier_labels = classifier_func(train, train_lbls, test)
        
        #Calculating classification rate: (# of correctly classified test samples)/(total number of test samples)
        for j in range(len(classifier_labels)):
            if test_lbls[j] == classifier_labels[j]:
                classification_rate[i] += 1
        classification_rate[i] /= len(classifier_labels)
    
    #Returning the mean and variance of the classification rates
    return np.mean(classification_rate), np.var(classification_rate)

In [8]:
#Splits
k = 10 #Number of folds for cross validation

n = len(transformed_autism)

data_splits = np.zeros((k), dtype=object)
label_splits = np.zeros((k), dtype=object)

#Defining the splits
for i in range(0,k):
    data_splits[i] = np.array(transformed_autism[int((i*n)/k): int((i+1)*n/k)])
    label_splits[i] = np.array(autism_labels[int((i*n)/k): int((i+1)*n/k)])

In [9]:
mean, var = cross_validation(10, data_splits, label_splits, decision_tree)
print("Mean of classification rate with decision trees is: ", mean)
print("Variance of classification rate with decision trees is: ", var)

Mean of classification rate with decision trees is:  0.7320925553319919
Variance of classification rate with decision trees is:  0.011773702577638873


Comparing with sklearn's implementation of Random Forests

In [10]:
mean, var = cross_validation(10, data_splits, label_splits, random_forests)
print("Mean of classification rate with random forests is: ", mean)
print("Variance of classification rate with random forests is: ", var)

Mean of classification rate with random forests is:  0.9928973843058351
Variance of classification rate with random forests is:  0.00013094300207684722
