Importing and Preprocessing the Adult Autism Data

In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import math
import pandas as pd
from scipy import stats 
from scipy.io.arff import loadarff
from IPython.display import clear_output

#Autism
Autism_Adult, meta = loadarff('Autism-Adult-Data.arff')

Autism_Adult_data = np.array(Autism_Adult[meta.names()[0]].astype(int, copy = True)).reshape(704,1)

# Add every integer input vector to Eye_State_data
for i in range(1,11):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(int, copy = True)]

# Add every string input vector to Eye_State_data
for i in range(11,17):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(str, copy = True)]

# Add integer input vector to Eye_State_data, 18th column
Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[17]]).astype(int, copy = True)]

for i in range(18,len(meta.names())):
    Autism_Adult_data = np.c_[Autism_Adult_data, np.array(Autism_Adult[meta.names()[i]]).astype(str, copy = True)]

Autism_frame = pd.DataFrame(data = Autism_Adult_data, columns = meta.names()[:])
Autism_frame.replace('?',np.NaN, inplace = True)

autism_mode = Autism_frame.mode(axis=0)
Autism_frame[meta.names()[12]].replace(np.NaN, autism_mode[meta.names()[12]].values[0],inplace = True)
Autism_frame[meta.names()[19]].replace(np.NaN, autism_mode[meta.names()[19]].values[0],inplace = True)

from sklearn.preprocessing import LabelEncoder
labelEncoder_X = LabelEncoder()
for i in range(11,len(meta.names())):
    if i != 17: 
        Autism_frame[meta.names()[i]] = labelEncoder_X.fit_transform(Autism_frame[meta.names()[i]])
        
Autism_frame.drop(columns = ['age_desc'], inplace = True)
Autism_frame.drop(columns = ['result'], inplace = True)
Autism_frame.drop(columns = ['used_app_before'], inplace = True)
Autism_frame.drop(columns = ['contry_of_res'], inplace = True)

autism_data = Autism_frame.iloc[:,:-1].values
autism_labels = Autism_frame.iloc[:,16].values
autism_data = autism_data.astype(int, copy = True)
autism_labels = autism_labels.astype(int, copy = True)

### Finding the top r principal components to obtain 99.99% of the original data information and transforming the data into the r-dimension

In [2]:
#Define PCA Function
def pca(data, num_of_prin_comp, data_orientation = "row"):
    num_of_data = len(data)
    dim_of_data = len(data[0])
    if data_orientation == "row":
        transposed_data = np.transpose(data) #Changes dataset so that data samples are column vectors
    mean = transposed_data.mean(1)  #Mean Vector
    centered_data = np.zeros((dim_of_data,num_of_data))

    for i in range(num_of_data):
        centered_data[:,i] = transposed_data[:,i] - mean  #Centering Data

    svd_u, svd_sigma, svd_v = np.linalg.svd(centered_data, full_matrices = True)  # Singular Value Decompostion

    u = np.zeros((dim_of_data,num_of_prin_comp))
    s = np.zeros((num_of_prin_comp,num_of_prin_comp))

    for i in range(dim_of_data):
        for j in range(num_of_prin_comp):
            u[i,j] = svd_u[i,j] #First r singular vectors of U
    for i in range(num_of_prin_comp):
        s[i,i] = svd_sigma[i] #Largest r singular values
    
    w = np.matrix(u)*np.matrix(s) #Principal Component Matrix with Principal Axes as Columns
    for i in range(num_of_prin_comp):
        w[:,i] = w[:,i]/np.linalg.norm(w[:,i]) #Normalizing Each Principal Component


    transformed_data = np.transpose(np.transpose(w)*centered_data) #Feature Vectors
    return transformed_data

$\frac{\sum_{i=1}^{k} \sigma^2_i}{\left \| \overline{X} \right \|^2_F}\geq .9999$
<br>
This uses the singular value decomposition and the correlation of representation to the singular vlaues to find the top r principal components.

In [3]:
#Finding the smallest number of principal components for .9999 Representation of Original Data:
m = len(autism_data[0])
n = len(autism_data)
centered = np.zeros((m,n))
for i in range(n):
    #Centering Training Data
    centered[:,i] = np.transpose(autism_data)[:,i] - np.transpose(autism_data).mean(1)
training_data_norm_squared = np.square(np.linalg.norm(centered))
svd_u, svd_sigma, svd_v = np.linalg.svd(centered, full_matrices = True)  # SVD
print(svd_sigma)
r = 0 #Top r principal components

for i in range(len(svd_sigma)):
    sum = 0
    representation = 0;
    for j in range(i+1):
        sum += np.square(svd_sigma[j])
    representation = sum/training_data_norm_squared
    if representation >= .9999:
        r = i+1
        print("99.99% of the Original Data is represented by the top", r, "principal components")
        break

[3.03268355e+09 9.71392010e+01 2.13240272e+01 1.75560118e+01
 1.45167897e+01 1.34245513e+01 1.27766248e+01 1.18975048e+01
 1.15093056e+01 1.12745755e+01 1.03558018e+01 1.00589872e+01
 9.85292466e+00 8.81945541e+00 8.44445192e+00 7.39238642e+00]
99.99% of the Original Data is represented by the top 1 principal components


Transforming the data using the number of principal components found

In [4]:
transformed_autism = pca(autism_data, r)

### Defining the decision trees and the metrics used. Also defining the cross validation function and training the data with 10 folds.

In [5]:
#sklearn's implementation of random forests
from sklearn.ensemble import RandomForestClassifier
def random_forests(train, train_lbls, test):
    clf = RandomForestClassifier()
    clf.fit(train,train_lbls)
    return clf.predict(test)

In [6]:
def cross_validation(k, split_data, split_labels, classifier_func = random_forests):
    classification_rate = np.zeros((k)) #Array to hold classification rate of each fold
    
    for i in range(k):
        #Seperating split data into training and test sets
        train = np.concatenate(np.delete(split_data, i))
        train_lbls = np.concatenate(np.delete(split_labels, i))
        
        test = split_data[i]
        test_lbls = split_labels[i]
        
        #Obtaining classified test labels
        classifier_labels = classifier_func(train, train_lbls, test)
        
        #Calculating classification rate: (# of correctly classified test samples)/(total number of test samples)
        for j in range(len(classifier_labels)):
            if test_lbls[j] == classifier_labels[j]:
                classification_rate[i] += 1
        classification_rate[i] /= len(classifier_labels)
    
    #Returning the mean and variance of the classification rates
    return np.mean(classification_rate), np.var(classification_rate)

In [7]:
#Splits
k = 10 #Number of folds for cross validation

n = len(transformed_autism)

data_splits = np.zeros((k), dtype=object)
data_splits2 = np.zeros((k), dtype=object)
label_splits = np.zeros((k), dtype=object)

#Defining the splits
for i in range(0,k):
    data_splits[i] = np.array(transformed_autism[int((i*n)/k): int((i+1)*n/k)])
    data_splits2[i] = np.array(autism_data[int((i*n)/k): int((i+1)*n/k)])
    label_splits[i] = np.array(autism_labels[int((i*n)/k): int((i+1)*n/k)])

sklearn's implementation of Random Forests

In [8]:
mean, var = cross_validation(k, data_splits, label_splits, random_forests)
print("Mean of classification rate with random forests is: ", mean)
print("Variance of classification rate with random forests is: ", var)

Mean of classification rate with random forests is:  0.6609255533199195
Variance of classification rate with random forests is:  0.007789431154330407


Using random forests on original dataset without feature extraction to compare classification rate.

In [9]:
mean, var = cross_validation(k, data_splits2, label_splits, random_forests)
print("Mean of classification rate with random forests and original data is: ", mean)
print("Variance of classification rate with random forests and original data is: ", var)

Mean of classification rate with random forests and original data is:  0.9376458752515091
Variance of classification rate with random forests and original data is:  0.000754753470521317
