In [2]:
import numpy as np
import pandas as pd
import time
import random

from sklearn import neighbors, datasets
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

### Load and Prepare Data

In [3]:
dataframe = pd.read_csv('./data.csv')

data_size = dataframe.shape[0]
feature_num = dataframe.shape[1] - 1
test_size = round(data_size * 0.2)
#print(test_size)

# normalization
df = dataframe.iloc[:, 0:48]
xmax = df.max()
xmin = df.min()

dff=(df-xmin)/(xmax-xmin)
#print(dff)

dataframee = pd.concat([dff,dataframe['48']],axis=1)

#Split data
random.seed(0)
indices = dataframee.index.tolist()
#print(indices)
test_in = random.sample(population = indices, k = test_size)
#print(test_in)

test_df = dataframee.loc[test_in]
train_df = dataframee.drop(test_in)



X_train_df = (train_df.iloc[:, 0:48])
Y_train_df = train_df['48']

X_test_df = (test_df.iloc[:, 0:48])
Y_test_df = test_df['48'] 

X_train = X_train_df.to_numpy()
Y_train = Y_train_df.to_numpy()
X_test = X_test_df.to_numpy()
Y_test = Y_test_df.to_numpy()

print(X_train.shape,Y_train.shape)
    
print("size of training data:",len(X_train))
print("size of testing data:", len(X_test))



(32765, 48) (32765,)
size of training data: 32765
size of testing data: 8191


In [4]:
dataframee.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,39,40,41,42,43,44,45,46,47,48
0,0.596225,0.540284,0.615606,0.838338,0.717266,0.77362,0.710818,0.710995,0.712087,0.644,...,0.000151,0.012935,0.110421,0.320896,0.364686,0.358736,0.317241,0.344491,0.308625,5
1,0.596119,0.541987,0.65748,0.839107,0.7148,0.756745,0.784912,0.785015,0.784224,0.710168,...,0.00024,0.015774,0.05857,0.464179,0.524752,0.552045,0.228966,0.256625,0.221024,8
2,0.596369,0.544879,0.644498,0.839618,0.724341,0.781678,0.688424,0.688369,0.688078,0.533995,...,0.000351,0.038273,0.148641,0.383582,0.435644,0.444238,0.329655,0.358438,0.320755,9
3,0.596565,0.543414,0.630369,0.838824,0.702555,0.798007,0.700639,0.70066,0.701047,0.516568,...,0.000167,0.006927,0.048715,0.326866,0.372937,0.379182,0.371034,0.400279,0.362534,9
4,0.596308,0.545396,0.646339,0.838603,0.722751,0.78374,0.764063,0.763992,0.763704,0.582751,...,0.00035,0.028285,0.112182,0.391045,0.443894,0.453532,0.325517,0.354254,0.316712,6


### KNN

In [5]:
def KNN_find_similarity(X_train, X_test):
    norm_test = np.linalg.norm(X_test, axis = 1)
    norm_test = np.reshape(norm_test,(-1,1))
    norm_train = np.linalg.norm((X_train), axis = 1)
    norm_train = np.reshape(norm_train,(1,-1))
    
    norm = np.dot(norm_test,norm_train) 
    cos_sim = np.dot(X_test, np.transpose(X_train))/norm
    
    return cos_sim

def KNN_predict(cos_sim,K,Y_train):
    Y_pred = []
    for row in cos_sim:
        votes=[]
        index = row.argsort()[::-1][0:K]
        for i in index:
            votes.append(Y_train[i])
        mmax = np.argmax(np.bincount(votes))
        Y_pred.append(mmax)
    Y_pred = np.asarray(Y_pred)
    return Y_pred
    

## K-Means

In [6]:
#find k cluster center
#use test set to find which cluster center it similar to 
def update_centroids(centers, X_train):
    #k = len(centers)
    k = centers.shape[0]
    n = X_train.shape[0]
    d = X_train.shape[1]
    
    #centers = np.asarray(centers)
    #print(centers.shape)
    #print(X_train.shape)
    #print("AAAAAAAAAAAAAAAAAAAA")
    
    #compute l2
    aa = np.sum(np.square(centers),axis = 1)
    bb = np.sum(np.square(X_train),axis = 1)
    #print(aa.shape)
    #print(bb.shape)
    aa = np.reshape(aa,(-1,1))
    bb = np.reshape(bb,(-1,1))
    #print(aa.shape)
    #print(bb.shape)
    aa = np.tile(aa,n)
    bb = np.tile(bb,k)
    bb = np.transpose(bb)
    
    cc = -2 * np.dot(centers,np.transpose(X_train))
    #print(aa.shape)
    #print(bb.shape)
    #print(cc.shape)
    #print("BBBBBBBBBBBBBBBBBBBBBB")
    xx = aa + bb + cc
    #print("bbbbbbbbbbbbbbbbbbbbbbbb")
    dist = np.sqrt(xx)
    #print("bbbbbbbbbbccccccccccccccc")
    colmax = dist.argmax(axis=0)
    #print("CCCCCCCCCCCCCCCCCCCCCCCC")
    
    
    #new_centers = [np.zeros(d)] * k
    #new_centers = np.asarray(new_centers)
    
    #print("new_centers",new_centers.shape)
    
    new_centers = np.zeros((k,d))
    
    clusters = [np.zeros((n,d))] * k
    
    
    for i in range(n):
        cluster_i = colmax[i]
        #clusters[cluster_i].append(X_train[i])
        clusters[cluster_i][i,:]=X_train[i]
    #print("DDDDDDDDDDDDDDDDDDDDD")
    
    
    #clusters = np.asarray(clusters)
    for j in range(k):
        new_centers[j] = np.mean(clusters[j],axis=0)
    #print("EEEEEEEEEEEEEEEEEEEEEEEEE")
    #print(type(new_centers))
    
    return new_centers  # k * d


In [7]:
def find_centroids(X_train,K):
    d = X_train.shape[1]
    
    #create a matrix for centers
    final_centers = [np.zeros(d)] * K
    #centers = np.asarray(centers)
    centers = np.zeros((K,d))
    #print(centers.shape)
    #print(X_train.shape)
    
    #if center still change after 10000 times, force stop
    itera = 10000
    while(itera!=0):
        print(itera)
        
        #initial k centers
        if itera == 10000:
            #print("111111111111")
            ind = random.sample(range(0,len_train),K)
            #print(ind)
            ii = 0
            for i in ind:
                centers[ii,:] = X_train[i,:]
                ii = ii + 1
                #print(X_train[i].shape)
                #centers[i] = X_train[i]
        #print("2222222222222")
        
        #update centers  
        new_centers = update_centroids(centers, X_train)
        #print("333333333333333333")
        
        print(type(new_centers),type(centers),new_centers.shape)       
        if ((new_centers == centers).all()):
            #print("endendendendendend")
            break
        #count = 0
        #for i in range(K):
        #    if ((new_centers[i]==centers[i]).all()):
        #        count = count + 1
        #if count == K:
        #    break
        #print("4444444444444444")
        centers = new_centers
        itera = itera - 1
        
    final_centers = new_centers
    
    return final_centers
    #return new_centers

## PCA

In [None]:
#X_train, N

# mean of feature vectors
mean_vector = np.mean(X_train,axis = 0)

# covariance matrix

## Random Forest

In [1]:
# data preprocessing:
# Bootstrap and make a dataset with size n 
# choose k features at each time
def sampling(data_train, K):
    n = len(data_train)
    sample_index = []
    fea_id = []
    
    data_sample = []
    feature_num = len(data_train[0] - 1)
    
    while len(fea_id) < K:
        fea_id.append(random.randint(0,feature_num-1))
    while len(sample_index) < n:
        sample_index.append(random.randint(0,n-1))
    
    for i in sample_index:
        sample_one = []
        for f in fea_id:
            sample_one.append(data_train[i][f])
#             collect features 
#       expend the last colum with target 
        sample_one.append(data_train[i][-1])
#     add this row of data to data_sample
        data_sample.append(sample_one)
    data_sample = np.asarray(data_sample)
    print(data_sample.shape)
    print(fea_id)
    return data_sample, fea_id
           
# sampling(data_train, 5)

# test1 = np.array([[1,2,3,4],[1,2,3,4],[1,2,3,4]])
# sampling(test1, 2)


# Calculate Gini inpurity
def getGini(data):
    datasum = len(data);
    classes = data[:,-1]
    counts = {}
    gini_inpurity = 1;
    for sample in classes:
        counts[sample] = counts.get(sample, 0) + 1
    for count in counts.values():
        gini_inpurity -= (count/datasum)**2
    return gini_inpurity    

class TreeNode:
    '''TreeNode initiator
    Includes:
    the feature chosen for split on that node;
    the value chosen for that feature
    （ ？？？）if there is no gain after trying all the features and values
        return the node with its tag
    (if there is a positive gain, return the pointer of that node)
    the right and left pointer
    '''
    
    def __init__(self, fea=-1, value = None, tag = -1, right=None, left=None):
        self.feature = feature 
        self.value = value  
        self.tag = tag  
        self.right = right  
        self.left = left  

def buidTree(data, fea_id):

    if len(data) == 0:
        return none;
    if status(data) == 1:
        return TreeNode(tag = data[0][-1])    
    gini_parent = getGini(data)
    best_split = None
    best_gain = 0.0
    best_sets = None
#     choose the best value for a specific feature
    
    for i in range(0, len(fea_id)):
#         for every single feature
        f_id = fea_id[i]
        features = data[:,i]
#     etract numbers for that feature for all the data in dataset
        values = []
        for i in range(0, len(data)-1):
            values.append((features[i]+features[i+1])/2)
        for value in values:
            set1, set2 = tree_split(data, i, value)
            gain = gini_parent - float(getGini(set1) * len(set1)/len(data) + getGini(set2)* len(set2)/len(data))
            if gain > bestgain:
                best_split = (f_id, value)
                best_gain = gain
                best_set = (set1, set2)

#   When to return the treeNode? 
    if bestGain > 0:
        left = buidTree(best_sets[0], fea_id)
        right = buidTree(best_sets[1], fea_id)
        return TreeNode(feature = best_split[0], value = best_split[1], right=right, left=left)
    else:
        return TreeNode(tag = data[0][-1]) 

# split data with two parts by value
def tree_split(data, i, value):
    set1 = []
    set2 = []
    for row in data:
        if row[i] < value:
            set1.append(row)
        elif row[i] > value:
            set2.append(row)
    return set1, set2

# return the number of classes
def status(data):
    count = {}
    for i in data[:, -1]:
        count[i] = count.get(i, 0) + 1
    return len(count)
    



# Construct the random forest:
# build m trees with k features of each 
def random_forest(data_train, m):
    trees = []
    features = []
    K = int(math.log(feature_num, 2))
    while len(trees) < m:
        sample = sampling(data_train, K)[0]
        fea_id = sampling(data_train, K)[1]
        features.append(fea_id)
        trees.append(buidTree(sample, fea_id))   
    return trees
    

def predict(y_true, y_test, m):
    data_train = y
    trees = random_forest(data_train, m)
    ans = []
    for row in data:          
        for tree in trees:
            ans.append(dfs(tree, row))
    return ans
        
        
def dfs(node, row):
    if node.tag!= none:
        return node.tag
    else:
        value = node.value
        f = node.feature
        x = row[f]
        if x < value:
            dfs(node.left, row)
        else:
            dfs(node.right, row)
    

# trees = random_forest(data_train, 2)
# predict(data_train, 2)

NameError: name 'data_train' is not defined

In [2]:
predict(predict(data_train, 2))

NameError: name 'data_train' is not defined

## Confusion Matrix

In [25]:

def confusMax(Y_pred, Y_test):
    num_class = len(np.unique(Y_test))
#     print(np.unique(Y_test))
    Y_hold = Y_test * num_class + Y_pred
#     Y_hold = np.zeros(num_class * num_class)
    Y_bin = np.arange(num_class * num_class + 1)
    con_matrx = np.asarray(np.histogram(Y_hold, bins=Y_bin)[0]).reshape(num_class, num_class)
#     return np.histogram(Y_hold, bins=Y_bin)
    return con_matrx
#     print(len(hold))
    
    

In [26]:
# Y1 = KNN(X_train,X_test,Y_train,5)

confusMax(Y1, Y_test)

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0],
       [  0, 795,   0,   0,   0,   0,  12,   0,   0,   0,   0],
       [  0,   0, 747,   0,   0,   0,   0,   0,   0,   0,  11],
       [  0,   0,   0, 726,   0,   4,   1,   0,   1,   0,   0],
       [  0,   0,   0,   2, 721,   2,   0,   0,   0,   0,   0],
       [  0,   1,   0,   5,   1, 712,   1,   0,   6,   0,   0],
       [  1,  12,   0,   2,   0,   0, 711,   0,   1,   7,   0],
       [  0,   0,   0,   0,   0,   0,   0, 754,   0,   0,   0],
       [  0,   2,   0,   1,   2,   7,   5,   0, 700,   0,   0],
       [  0,   3,   0,   3,   0,   0,  22,   0,   0, 721,   0],
       [  0,   0,  26,   0,   0,   0,   0,   0,   0,   0, 704]])

## Recall

In [45]:
def Recall(y_true,y_pred):
    num_class = len(np.unique(Y_test))
    print(num_class)
    con_matrix = confusMax(y_pred,y_true)
    sum_recall = 0.0
    for j in range(num_class):
        TP = 0
        FN = 0
        for i in range(num_class):
            if(i == j):
                TP += con_matrix[i][j]
            else: 
                FN += con_matrix[i][j]
        if TP == 0:
            sum_recall += 0
        else:             
            sum_recall += (TP/(TP + FN))
    recall = sum_recall/num_class
    return recall
            

            
        
#      """
#     :type y_true: numpy.ndarray
#     :type y_pred: numpy.ndarray
#     :rtype: float
#     """

In [46]:
# num_class = len(np.unique(Y_test))

Recall(Y_test, Y1)

11


0.8921579871684698

In [47]:
def Precision(y_true,y_pred):
    num_class = len(np.unique(Y_test))
    con_matrix = confusMax(y_pred,y_true)
#     num = len(con_matrix)
#     print(num)
    sum_prec = 0.0
    for i in range(num_class):
        TP = 0
        FP = 0
        for j in range(num_class):
            if(i == j):
                TP += con_matrix[i][j]
            else: 
                FP += con_matrix[i][j]
        if TP == 0:
            sum_prec += 0
        else:             
            sum_prec += (TP/(TP + FP))
        prec = sum_prec/num_class
    return prec
            

In [48]:
Precision(Y_test, Y1)

0.8917796237059065

## Functions

In [10]:
# -*- coding: utf-8 -*-
"""
Predicitve_Analytics.py
"""


def Accuracy(y_true,y_pred):
    """
    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    
    """
    l = len(y_true)
    count = 0
    for i in range(l):
        if y_true[i] == y_pred[i]:
            count = count + 1
    acc = count/l
    return acc

def Recall(y_true,y_pred):
     """
    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    """

def Precision(y_true,y_pred):
    """
    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    """
def WCSS(Clusters):
    """
    :Clusters List[numpy.ndarray]
    :rtype: float
    """
    
def ConfusionMatrix(y_true,y_pred):
    
    """
    :type y_true: numpy.ndarray
    :type y_pred: numpy.ndarray
    :rtype: float
    """  

def KNN(X_train,X_test,Y_train,N):
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    :typr N: constant
    
    :rtype: numpy.ndarray
    """
    cos_sim = KNN_find_similarity(X_train, X_test)
    Y_pred = KNN_predict(cos_sim,N,Y_train)
    
    return Y_pred
        
def RandomForest(X_train,Y_train,X_test):
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: numpy.ndarray
    """
    
def PCA(X_train,N):
    """
    :type X_train: numpy.ndarray
    :type N: int
    :rtype: numpy.ndarray
    """
    
def Kmeans(X_train,N):
    """
    :type X_train: numpy.ndarray
    :type N: int
    :rtype: List[numpy.ndarray]
    """
    centroids = find_centroids(X_train,N)
    return centroids

def SklearnSupervisedLearning(X_train,Y_train,X_test):
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """

def SklearnVotingClassifier(X_train,Y_train,X_test):
    
    """
    :type X_train: numpy.ndarray
    :type X_test: numpy.ndarray
    :type Y_train: numpy.ndarray
    
    :rtype: List[numpy.ndarray] 
    """


#"""
#Create your own custom functions for Matplotlib visualization of hyperparameter search. 
#Make sure that plots are labeled and proper legends are used
#"""



    


## Main()

In [36]:
len_train = len(Y_train)
len_test = len(Y_test)

## KNN
K_knn = 11

s1 = time.time()
Y_pred_knn = KNN(X_train,X_test,Y_train,K_knn)
e1 = time.time()

#Y_pred_knn = np.asarray(Y_pred_knn)
#print(Y_pred_knn.shape)
#print(Y_test.shape)
#acc_knn = Accuracy(Y_test, Y_pred_knn)
#print(acc_knn)
acc_knn = Accuracy(Y_test,Y_pred_knn)

print("Running time of KNN",e1-s1,".")
print("Accuracy of KNN",acc_knn,".")
print("============================")

## K-Means
K_kmeans = 11

s2 = time.time()
centroids = Kmeans(X_train,K_kmeans)
e2 = time.time()
print("Runtime of K-Means: ",e2-s2,"s.")
#print(centroids)

## PCA



Running time of KNN 33.30502200126648 .
Accuracy of KNN 0.9841289219875473 .
10000
<class 'numpy.ndarray'> <class 'numpy.ndarray'> (11, 48)
9999
<class 'numpy.ndarray'> <class 'numpy.ndarray'> (11, 48)
Runtime of K-Means:  0.1835639476776123 s.




In [None]:
a = [np.array([1,2,3,4])]*5
print(a)
b = np.asarray(a)
print(b)
print(b.shape)
aa = np.sum(np.square(a),axis = 1)
print(aa)

In [2]:
import numpy as np
a = np.zeros((3,2))
print(a)

[[0. 0.]
 [0. 0.]
 [0. 0.]]


In [3]:
a = np.array([[1,2,3,4],[2,3,4,5],[3,4,5,6],[4,5,6,7],[5,6,7,8]])
print(a.shape)
x = np.zeros((6,4))
x[1,:] = a[0]
print(x)


#print(a[:,0])
#print(a.max(axis=0))
b = a.argmax(axis=0)
#print(b[0])

aaa = []*11
aaa.append([1,2,3])
aaa.append([222])
aaa.append([3,2,1])
#print(aaa[0])

(5, 4)
[[0. 0. 0. 0.]
 [1. 2. 3. 4.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [29]:
from sklearn.cluster import KMeans
    
kmeans = KMeans(n_clusters=11).fit(train_df)
centroids = kmeans.cluster_centers_
print(centroids)



[[5.96222509e-01 5.46346345e-01 6.46898682e-01 8.38899954e-01
  7.27504165e-01 7.83784611e-01 5.29478346e-01 5.29326875e-01
  5.28742346e-01 5.47212657e-01 5.47290494e-01 5.47474042e-01
  1.58367340e-02 2.03587274e-02 6.42754199e-02 1.57654043e-02
  3.56626558e-02 4.78043144e-02 5.32182908e-01 5.32177568e-01
  5.32247871e-01 5.33635662e-01 5.33666652e-01 5.33752106e-01
  5.41685345e-01 5.62270157e-01 4.52617717e-01 5.43255924e-01
  5.14683908e-01 5.86250793e-01 3.22484271e-01 3.37697482e-01
  3.28006638e-01 1.94725347e-01 1.97781648e-01 2.00199773e-01
  5.70687900e-04 3.75546138e-02 6.60251842e-02 3.49249768e-04
  4.87408056e-02 7.45192054e-02 3.72725076e-01 4.23218629e-01
  4.32447952e-01 3.24090481e-01 3.52505404e-01 3.16465685e-01
  2.00000000e+00]
 [5.96065939e-01 5.43367815e-01 6.41493724e-01 8.38773637e-01
  7.16552764e-01 7.81160565e-01 7.01563656e-01 7.01586333e-01
  7.01451618e-01 5.42056622e-01 5.42226998e-01 5.42502400e-01
  1.67388036e-02 2.00252471e-02 6.43814050e-02 1.642

In [None]:
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train,Y_train)
a = Accuracy(Y_test,knn.predict(X_test))
print(a,"==========11")