In [106]:
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from glob import glob
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score

%matplotlib inline

h = 0.02  # spacing between mesh points (the np.arange step used when building the meshgrid)

In [107]:
# Display labels for the models; zipped position-by-position with `classifiers`.
names = [
    "Nearest Neighbors",
    "Decision Tree",
    "Random Forest",
    "Naive Bayes",
    "Neural Net",
]

In [108]:
# Candidate models, in the same order as the display labels in `names`.
# The estimators that use randomness (tree, forest, MLP) get a fixed seed so
# that the reported scores are reproducible from one run to the next.
classifiers = [
    KNeighborsClassifier(3),
    DecisionTreeClassifier(max_depth=5, random_state=42),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, random_state=42),
    GaussianNB(),
    MLPClassifier(alpha=1, max_iter=1000, random_state=42),
]

SyntaxError: invalid syntax (<ipython-input-108-f5b0f29083cb>, line 7)

In [None]:
def folder_finder(path):
    """Return the CSV files matching ``path + "*.csv"``, sorted by path.

    Parameters
    ----------
    path : str
        Prefix that is concatenated with ``"*.csv"`` to form the glob
        pattern. To list a directory it must end with a path separator
        (e.g. ``"../Dataset/"``).

    Returns
    -------
    list of str
        Matching paths in ascending order; empty when nothing matches.
    """
    # glob yields matches in arbitrary, filesystem-dependent order. Sorting
    # makes the file order (and anything that depends on "the last file
    # processed") reproducible across machines and runs.
    return sorted(glob(path + "*.csv"))

def read_file(path):
    """Load the CSV at ``path`` into a pandas DataFrame.

    Pandas' default missing-value markers are switched off; only the literal
    string 'NaN' is parsed as a missing value.
    """
    return pd.read_csv(path, keep_default_na = False, na_values = 'NaN')

def pd_to_np(data):
    """Return ``data`` as a numpy array.

    Parameters
    ----------
    data : numpy.ndarray or object exposing ``.values``
        A numpy array (or subclass) is returned unchanged after printing a
        notice; anything else is converted through its ``.values`` attribute
        (as provided by pandas DataFrame / Series).

    Returns
    -------
    numpy.ndarray
    """
    # isinstance, not an exact type comparison: ndarray subclasses are already
    # numpy data and have no `.values` attribute to fall back on.
    if isinstance(data, np.ndarray):
        print('Data is already in numpy format!')
        return data

    return data.values


def string_to_index(activity_label):
    """Translate activity names into their integer class indices.

    ``activity_label`` is iterated row by row and the first element of each
    row is looked up, so it is expected to be a sequence of one-element rows
    (for example the ``.values`` of a single-column DataFrame). A name that is
    not in the table raises ``KeyError``.

    Returns a plain Python list of ints, one per row.
    """
    label_to_index = {'Cook': 0, 'Eat': 1, 'Phone': 2, 'Read': 3, 'Watch_TV': 4}
    return [label_to_index[row[0]] for row in activity_label]


def data_loader(path, split=0.3):
    """Load one CSV file and return standardized features with integer labels.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain an 'activity' column holding the
        activity name of each row.
    split : float, optional
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    x : numpy.ndarray
        Every column except 'activity', standardized with ``StandardScaler``
        fitted on this file's rows.
    y : numpy.ndarray
        int32 class index per row, as produced by ``string_to_index``.
    """
    pd_data = read_file(path)

    # Pick the features by name rather than by "everything but the last
    # column": the label is read by name, so the features must be too, or a
    # file whose 'activity' column is not last would leak the label into x.
    feature_frame = pd_data.drop(columns=['activity'])
    activity_label = pd_data.loc[:, ['activity']]

    x = StandardScaler().fit_transform(pd_to_np(feature_frame))
    y = np.asarray(string_to_index(activity_label.values)).astype('int32')
    return x, y

def full_dataset(file_list):
    """Load every file in ``file_list`` and stack the results row-wise.

    Parameters
    ----------
    file_list : sequence of str
        CSV paths, each loaded with ``data_loader``.

    Returns
    -------
    x : numpy.ndarray
        Feature rows of all files concatenated along axis 0, in list order.
    y : numpy.ndarray
        Matching labels concatenated along axis 0.
        For an empty ``file_list`` both are empty 1-D arrays.
    """
    if len(file_list) == 0:
        # Two separate arrays, so callers never receive one shared object.
        return np.asarray([]), np.asarray([])

    # Load everything first and concatenate once; concatenating inside the
    # loop would re-copy the accumulated data for every additional file.
    loaded = [data_loader(csv_path) for csv_path in file_list]
    x = np.concatenate([features for features, _ in loaded], axis=0)
    y = np.concatenate([labels for _, labels in loaded], axis=0)
    return x, y

In [None]:
# Find the CSV files under ../Dataset/ and report what was found.
file_list = folder_finder("../Dataset/")
print(file_list)
print('Total files:',len(file_list))
# Load every file and stack them into one feature matrix X and one label vector y.
# Note: later cells assign X and y again, so these values do not survive the notebook.
X,y= full_dataset(file_list)
print(X.shape)
print(y.shape)

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline

# Class index -> activity name (inverse of the table in string_to_index).
dmap = {
            0:'Cook',
            1:'Eat',
            2:'Phone',
            3:'Read',
            4:'Watch_TV'
        }

# Fit an independent 5-component PCA on each file and report how much of the
# file's variance the kept components explain.
# X, y and pc_train are overwritten on every pass, so after the loop they hold
# the values of the last file in file_list; later cells read them.
# (No figure is opened here: nothing in this cell draws on one.)
for csv_path in file_list:
    X,y = data_loader(csv_path)
    # data_loader already standardizes X; the scaler in the pipeline is kept so
    # the PCA input is standardized regardless of what the loader does.
    pca = make_pipeline(StandardScaler(),PCA(n_components=5, random_state=42))
    pc_train = pca.fit_transform(X)
    print ("shape of pca",pc_train.shape)
    ex_variance=np.var(pc_train,axis=0)
    # Take the ratio from the fitted PCA step: it is relative to the total
    # variance of the data. Normalising ex_variance by its own sum only
    # compares the kept components with each other and therefore always
    # sums to 1, which says nothing about how much variance was retained.
    ex_variance_ratio = pca.named_steps['pca'].explained_variance_ratio_
    print (ex_variance_ratio)
    print("\nSum of ex_variance_ratio : ", np.sum(ex_variance_ratio))
#     pcaDf = pd.DataFrame(data = pc_train, columns = ['pc 1', 'pc 2','pc 3'])
#     pcaDf['Target'] = y
#     pcaDf['Target'] = pcaDf["Target"].map(dmap)
#     sns.FacetGrid(pcaDf,hue='Target',height=6).map(plt.scatter,'pc 1','pc 2','pc 3').add_legend()

In [None]:
# figure = plt.figure(figsize=(30, 30))

# count = 1

# for i in range(len(file_list)):
#     X,y = data_loader(file_list[i])
#     pca = PCA(n_components=3)
#     pc_train = pca.fit_transform(X)
#     pcaDf = pd.DataFrame(data = pc_train, columns = ['pc 1', 'pc 2','pc 3'])
#     pcaDf['Target'] = y
    
#     ax = plt.subplot(6, 2, count)
#     ax.scatter(X[:,0], y, color='r')
#     ax.set_xlabel('X 0')
#     ax.set_ylabel('y')
#     ax.set_title('X vs y')
    
#     ax = plt.subplot(6, 2, count+1)
#     ax.scatter(X[:,1], y, color='b')
#     ax.set_xlabel('X 1')
#     ax.set_ylabel('y')
#     ax.set_title('X vs y')
#     count+=2
#     plt.savefig('{:03d}.png'.format(len(file_list)))

# Classifier

In [None]:
# Start from the PCA scores of the last file processed by the PCA loop and add
# uniform noise in [0, 2) drawn from a fixed-seed generator.
# The noisy matrix is built out-of-place: `X = pc_train` followed by `X += ...`
# would modify pc_train itself, so every re-run of this cell would stack
# another layer of noise on top of the previous one.
rng = np.random.RandomState(2)
X = pc_train + 2 * rng.uniform(size=pc_train.shape)
print(X.shape)
linearly_separable = (X, y)

# Single-entry list consumed by the classifier comparison loop.
datasets = [linearly_separable]

In [None]:
# Fit every classifier on each dataset and print its score on the held-out split.
# The plotting parts of this cell are commented out; only the scores are produced.
figure = plt.figure(figsize=(27, 9))
# Subplot slot index. Its increments are inside the commented-out code, so it stays at 1.
i = 1

# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    # preprocess dataset, split into training and test part
    X, y = ds
    # NOTE(review): the scaler is fitted on all rows before the split, so the
    # test rows influence the scaling — confirm this is intended.
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)

    # Mesh over the range of the first two features, padded by 0.5 on each side.
    # It is currently referenced only by the commented-out plotting code.
    x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
    y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),np.arange(y_min, y_max, h))

#     # just plot the dataset first
#     cm = plt.cm.RdBu
#     cm_bright = ListedColormap(['#FF0000', '#0000FF'])
#     ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
#     if ds_cnt == 0:
#         ax.set_title("Input data")
#     # Plot the training points
#     ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,edgecolors='k')
#     # Plot the testing points
#     ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright, alpha=0.6,edgecolors='k')
#     ax.set_xlim(xx.min(), xx.max())
#     ax.set_ylim(yy.min(), yy.max())
#     ax.set_xticks(())
#     ax.set_yticks(())
#     i += 1

    

    # iterate over classifiers
    for name, clf in zip(names, classifiers):
         # With `i` fixed at 1 every classifier targets the same subplot slot,
         # and nothing is drawn on it while the plotting code is commented out.
         ax = plt.subplot(len(datasets), len(classifiers) + 1, i)
         # Fitting mutates the shared instances stored in `classifiers`.
         clf.fit(X_train, y_train)
         score = clf.score(X_test, y_test)
         print (name,score)
            


#         # Plot the decision boundary. For that, we will assign a color to each
#         # point in the mesh [x_min, x_max]x[y_min, y_max].
#          if hasattr(clf, "decision_function"):
#              Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
#          else:
#              Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

# #         # Put the result into a color plot
#          Z = Z.reshape(xx.shape)
#          #Z = Z.flatten().reshape(1960,420)
#          ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
#          print(Z.shape)

# #         # Plot the training points
#          ax.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm_bright,edgecolors='k')
# #         # Plot the testing points
#          ax.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cm_bright,edgecolors='k', alpha=0.6)

#          ax.set_xlim(xx.min(), xx.max())
#          ax.set_ylim(yy.min(), yy.max())
#          ax.set_xticks(())
#          ax.set_yticks(())
#          if ds_cnt == 0:
#              ax.set_title(name)
#          ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
#                  size=15, horizontalalignment='right')
#          i += 1

# plt.tight_layout()
# plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
from sklearn.metrics import confusion_matrix

def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and hand back what ``model.fit`` returns."""
    fitted = model.fit(X_train, y_train)
    return fitted

def test_model(model,X_test):
    pred = model.predict(X_test)
    return pred
    
def model_init():
    """Return the model to train: the first entry of the module-level ``classifiers`` list.

    The list entry itself is returned (not a copy).
    """
    # To try a different estimator, return it here instead, e.g. DecisionTreeClassifier().
    return classifiers[0]
    
def heatmap_cm(confusion_matrix):
    """Draw ``confusion_matrix`` as an annotated seaborn heatmap with a title and axis labels.

    Rows are labelled as actual classes and columns as predicted classes.
    Nothing is returned; the plot is drawn on the current matplotlib axes.
    """
    # seaborn draws on the current axes and returns it, so the title and
    # labels can be set directly on that object.
    axes = sns.heatmap(confusion_matrix, annot=True, cmap="YlGnBu" ,fmt='g')
    plt.tight_layout()
    axes.set_title('Confusion matrix', y=1.1)
    axes.set_ylabel('Actual label')
    axes.set_xlabel('Predicted label')
    
# For model evaluation
def model_evalution(y_test, y_pred):
    """Print evaluation metrics for a set of predictions and plot the confusion matrix.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_pred : array-like
        Predicted labels, aligned with ``y_test``.

    Prints the confusion matrix, the accuracy score and the classification
    report, then draws the confusion matrix with ``heatmap_cm``.
    Returns nothing.
    """
    # Computed once and reused for both the printout and the heatmap.
    cm = confusion_matrix(y_test, y_pred)

    print("------------------- Model evaluation ----------------\n\n")
    print("Confusion Matrix : \n", cm)
    print("--------------------------------------------")
    print("Accuracy Score : ",accuracy_score(y_test,y_pred))
    print("Classification Report : \n",classification_report(y_test, y_pred))
    print("--------------------------------------------")
    print("")

    heatmap_cm(pd.DataFrame(cm))

In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
# NOTE(review): X and y are whatever the preceding cells assigned last — confirm
# that is the data this evaluation is meant to use.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4, random_state=42)
X_train.shape

In [None]:
# Pick the model chosen in model_init() and fit it on the training split.
model = model_init()
# `net` is whatever model.fit returned (train_model passes it through).
net = train_model(model,X_train,y_train)

In [None]:
# Predict on the held-out split, then print the metrics and draw the confusion-matrix heatmap.
y_pred = test_model(net,X_test)
model_evalution(y_test, y_pred)

In [None]:
# for name, clf in zip(names, classifiers):
#     clf.fit(X_train, y_train)
#     score = clf.score(X_test, y_test)
#     print(score)

In [None]:
def print_confusion_matrix(confusion_matrix, class_names, figsize = (10,7), fontsize=14):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    Arguments
    ---------
    confusion_matrix: numpy.ndarray
        The numpy.ndarray object returned from a call to sklearn.metrics.confusion_matrix.
        Similarly constructed ndarrays can also be used.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the output figure,
        the second determining the vertical size. Defaults to (10,7).
    fontsize: int
        Font size for the tick labels. Defaults to 14.

    Returns
    -------
    matplotlib.figure.Figure
        The resulting confusion matrix figure

    Raises
    ------
    ValueError
        If seaborn cannot draw the matrix with the integer annotation format "d";
        the original error is attached as the cause.
    """
    df_cm = pd.DataFrame(
        confusion_matrix, index=class_names, columns=class_names,
    )
    fig = plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    except ValueError as err:
        # Do not leave an empty figure open when the heatmap cannot be drawn,
        # and keep the underlying error attached for debugging.
        plt.close(fig)
        raise ValueError("Confusion matrix values must be integers.") from err
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    # Label the axes the heatmap was drawn on.
    heatmap.set_ylabel('True label')
    heatmap.set_xlabel('Predicted label')
    return fig