## Importing the required modules

In [1]:
import numpy as np
import matplotlib.pyplot as plt 
import pylab as pl
import pandas as pd
import os
import seaborn as sns
%matplotlib inline
from sklearn.metrics import confusion_matrix,classification_report
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn import svm, datasets
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from matplotlib.colors import rgb2hex
from matplotlib.cm import get_cmap
from sklearn.cluster import KMeans
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import h5py

# **K-Nearest Neighbours**

## Reading the Train and test sets

In [2]:
# Training split: 'Activity' is the target; 'Activity' and 'subject' are both
# excluded from the feature matrix.
train_data = pd.read_csv('train.csv')
X_train = train_data.drop(['Activity', 'subject'], axis=1)
y_train = train_data.Activity

# Test split, prepared exactly the same way.
test_data = pd.read_csv('test.csv')
X_test = test_data.drop(['Activity', 'subject'], axis=1)
y_test = test_data.Activity

FileNotFoundError: [Errno 2] File b'train.csv' does not exist: b'train.csv'

## **Check if the classes are equally shared**

In [None]:
# Count the records per activity. value_counts() orders its result by
# frequency, so sort_index() is applied to put the counts in alphabetical
# order of the activity name; counts and labels are then taken from the same
# Series and cannot get out of step with each other.
activity_counts = y_train.value_counts().sort_index()
count_of_each_activity = activity_counts.to_numpy()

# Activity names in sorted order, aligned one-to-one with the counts above.
activities = list(activity_counts.index)

# Pie chart of the class distribution (percentages with two decimals).
# NOTE: rcParams.update changes the defaults for every later figure as well.
plt.rcParams.update({'figure.figsize': [5, 5], 'font.size': 15})
plt.pie(count_of_each_activity, labels=activities, autopct='%0.2f');

## **Cross validation on KNN**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbourhood sizes K: the odd values 1, 3, ..., 49.
k_list = list(range(1, 50, 2))

# cv_scores[i] will hold the mean 10-fold accuracy obtained with k_list[i].
cv_scores = []

for k in k_list:
    knn = KNeighborsClassifier(n_neighbors=k)
    # One accuracy value per fold for this K.
    scores = cross_val_score(estimator=knn, X=X_train, y=y_train, scoring='accuracy', cv=10)
    cv_scores.append(np.mean(scores))


## **Elbow method**

In [None]:
# changing to misclassification error
MSE = [1 - x for x in cv_scores]

plt.figure()
plt.figure(figsize=(15,10))
plt.title('The optimal number of neighbors', fontsize=20, fontweight='bold')
plt.xlabel('Number of Neighbors K', fontsize=15)
plt.ylabel('Misclassification Error', fontsize=15)
sns.set_style("whitegrid")
#plot between number of neighboes (K) and misclassification error also called elbow method.
plt.plot(k_list, MSE)

plt.show()

In [None]:
# K with the smallest misclassification error; argmin returns the first
# position of the minimum, so ties resolve to the smallest K.
best_k = k_list[int(np.argmin(MSE))]
print("The optimal number of neighbors is %d." % best_k)

## **Applying KNN with K=11 (optimal number of neighbours)**

In [None]:
# K is fixed at 11, the value the section heading names as optimal.
knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)

# Training split: mean accuracy, then the confusion matrix
# (rows = true activity, columns = predicted activity).
accuracy_knn_train = knn.score(X_train, y_train)
print(accuracy_knn_train)

knn_predictions_train = knn.predict(X_train)
cm_train = confusion_matrix(y_train, knn_predictions_train)
print(cm_train)

print("------------------------------")

# Test split: the same two measurements on data the model was not fitted on.
accuracy_knn_test = knn.score(X_test, y_test)
print(accuracy_knn_test)

knn_predictions_test = knn.predict(X_test)
cm_test = confusion_matrix(y_test, knn_predictions_test)
print(cm_test)

In [None]:
# Annotated heatmap of the KNN test-set confusion matrix (integer cell labels).
s = sns.heatmap(data=cm_test, annot=True, fmt='d', cmap="cool")

# **Support Vector Machine**

## Loading and shuffling the train and test sets

In [None]:
# Load both splits and shuffle their rows. A fixed random_state makes the row
# order repeatable across kernel restarts, so everything computed from these
# frames further down can be reproduced.
train = shuffle(pd.read_csv("train.csv"), random_state=0)
test = shuffle(pd.read_csv("test.csv"), random_state=0)

## Dropping  the result and subject columns

In [None]:
# Feature matrices: every column except 'Activity' (the target) and 'subject'.
X_train = pd.DataFrame(train.drop(columns=['Activity', 'subject']))
X_test = pd.DataFrame(test.drop(columns=['Activity', 'subject']))

# Activity labels as object arrays; they are still non-numeric at this point.
Y_train_label = train['Activity'].values.astype(object)
Y_test_label = test['Activity'].values.astype(object)

### Transforming the non numeric labels of the target column into numeric labels

In [None]:
# Learn the label -> integer mapping once, from the training labels only, and
# reuse that same mapping for the test labels. Fitting the encoder a second
# time on the test labels would only yield the same integers when both splits
# contain exactly the same set of classes; with a single fit, transform()
# raises on any test label that was never seen during training instead of
# silently shifting the codes.
encoder = preprocessing.LabelEncoder()

# Encode the training labels (this also fits the encoder).
Y_train = encoder.fit_transform(Y_train_label)

# Encode the test labels with the mapping learned above.
Y_test = encoder.transform(Y_test_label)

### Scaling the Train and Test feature set using StandardScaler

In [None]:
# Fit the scaling statistics on the training features only, then apply the
# same fitted scaler to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

### Create the parameter grid based on the results of random search.

In [None]:
# Search space made of two kernel families:
#   * radial basis function (rbf) kernel: tunes gamma and C
#   * linear kernel: tunes C only
params_grid = [
    {
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000],
    },
    {
        'kernel': ['linear'],
        'C': [1, 10, 100, 1000],
    },
]

### Performing CV to tune parameters for best SVM fit with 5 fold cross validation

In [None]:
# Exhaustive search over params_grid; every candidate is scored with 5-fold
# cross-validation on the scaled training data.
svm_model = GridSearchCV(estimator=SVC(), param_grid=params_grid, cv=5)
svm_model.fit(X_train_scaled, Y_train)

### Best score for training data, Best kernel, C and gamma

In [None]:
# Score of the best parameter combination found by the grid search.
print('Best score for training data:', svm_model.best_score_,"\n")

# best_estimator_ is the winning model among the rbf and linear candidates;
# it is used to predict the (scaled) test set.
final_model = svm_model.best_estimator_
Y_pred = final_model.predict(X_test_scaled)

# Map the integer predictions back to their activity names.
Y_pred_label = list(encoder.inverse_transform(Y_pred))

# Hyper-parameters of the selected model.
print('Best Kernel with minimum classification error:', final_model.kernel)
print('Best C:', final_model.C)
print('Best Gamma:', final_model.gamma)
# Author's note: the best kernel turned out to be the radial basis function (rbf).

### Precision, recall, f1-score, support for each of the activities and the testing set accuracy

In [None]:
# Per-activity precision / recall / f1-score / support on the test set,
# computed on the string labels.
print(classification_report(Y_test_label, Y_pred_label))

# Scores of the selected model on each split. These come from
# final_model.score on the full split, not from cross-validation.
train_score = final_model.score(X_train_scaled, Y_train)
print("Training set score for SVM: %f" % train_score)
test_score = final_model.score(X_test_scaled, Y_test)
print("Testing  set score for SVM: %f" % test_score)


### Constructing the Confusion Matrix for true label vs predicted label for classification

In [None]:
# Short display names for the encoded classes 0..5.
# NOTE(review): assumes the encoder numbers the classes in alphabetical order
# of the activity name (LabelEncoder sorts its classes) - confirm against
# encoder.classes_.
x_axis_labels = ["LAYING", "SITTING", "STANDING" ,"WALKING", "WALKING_DOWN", "WALKING_UP"] # labels for x-axis
y_axis_labels = ["LAYING", "SITTING", "STANDING" ,"WALKING", "WALKING_DOWN", "WALKING_UP"] # labels for y-axis

# Confusion matrix with rows = true label and columns = predicted label.
# Passing `labels` explicitly keeps the matrix square with one row/column per
# display name even if some class never occurs in Y_test or Y_pred.
label_acc = confusion_matrix(Y_test, Y_pred, labels=list(range(len(y_axis_labels))))

s = sns.heatmap(label_acc, xticklabels=x_axis_labels, yticklabels=y_axis_labels, cmap="cool", annot=True, fmt='d')
s.set(xlabel="Predicted Label", ylabel="True Label");

# **K-MEANS CLUSTERING**

## Reading the dataset

In [None]:
# df keeps every column of the training file; df1 keeps the columns from the
# first one up to and including 'angle(Z,gravityMean)'.
df = pd.read_csv("train.csv")
df1 = df.loc[:, :'angle(Z,gravityMean)']

## Performing t-SNE reduction on the dataset for easier visualization

In [None]:
# Standardise a copy of the feature columns before embedding them.
scl = StandardScaler()
tsne_data = scl.fit_transform(df1.copy())

# t-SNE embedding of the standardised features; the seed is fixed at 3.
tsne = TSNE(random_state=3)
tsne_transformed = tsne.fit_transform(tsne_data)

## **K-Means Clustering Algorithm for 6 Clusters**
The code in this section takes the training dataset (`train.csv`) and runs the K-Means clustering algorithm to split the data into 6 clusters.

#### Performing K-means for 6 clusters

In [None]:
# K-Means with 6 clusters on the feature columns: random initialisation,
# 20 restarts, at most 300 iterations each, fixed seed.
kmeans = KMeans(
    n_clusters=6,
    init='random',
    n_init=20,
    max_iter=300,
    random_state=0,
)
pred_y = kmeans.fit_predict(df1)

#### Plotting the t-SNE reduced data showing the distribution of the six activities in the expected output as well as the clusters found by the model

In [None]:
# True activity of every row. NOTE: this rebinds Y_train, which the heatmap
# cell that follows reads by position.
Y_train = df.Activity.copy()
label_counts = Y_train.value_counts()

# One colour per activity, evenly spaced over the viridis colormap.
# np.linspace returns exactly n values and, unlike a float-step arange with
# step 1/(n-1), neither depends on rounding at the upper bound nor divides by
# zero when there is only one label.
n = label_counts.shape[0]
colormap = get_cmap('viridis')
colors = [rgb2hex(colormap(col)) for col in np.linspace(0, 1, n)]
# Cluster colours are sized by the number of clusters, not the number of labels.
cluster_colors = [rgb2hex(colormap(col)) for col in np.linspace(0, 1, kmeans.n_clusters)]
fig = plt.figure(figsize = (40,20))

# Left panel: t-SNE embedding coloured by the true activity.
axes1 = fig.add_subplot(121)
axes1.set_title('Expected Activity Visualisation', fontdict = {'fontsize': 40})
# tick_params sets the tick label size on the axis itself ('fontsize' is the
# lower-case property name; 'Fontsize' is not a valid Text property).
axes1.tick_params(axis='both', labelsize=25)
for i, group in enumerate(label_counts.index):
    mask = (Y_train==group).values
    axes1.scatter(x=tsne_transformed[mask][:,0], y=tsne_transformed[mask][:,1], c=colors[i], alpha=0.5, label=group)
axes1.legend(fontsize=25)

# Right panel: the same embedding coloured by K-Means cluster id. Cluster ids
# are arbitrary, so they are labelled as clusters rather than with activity
# names; a colour here does not correspond to the same colour on the left.
axes2 = fig.add_subplot(122)
axes2.set_title('KMeans Cluster Visualisation', fontdict = {'fontsize': 40})
axes2.tick_params(axis='both', labelsize=25)
for cluster_id in range(kmeans.n_clusters):
    mask = (kmeans.labels_==cluster_id)
    axes2.scatter(x=tsne_transformed[mask][:,0], y=tsne_transformed[mask][:,1], c=cluster_colors[cluster_id], alpha=0.5, label='Cluster %d' % cluster_id)
axes2.legend(fontsize=25)

#### Calculating and plotting the heatmap of the output of the K-Means model

In [None]:
# Contingency table of cluster id (rows) against true activity (columns),
# built in one vectorised call instead of incrementing counters row by row.
# Columns come out in sorted activity order; reindexing the rows guarantees
# one row per cluster id even if a cluster ends up empty.
label_acc = (
    pd.crosstab(index=np.asarray(kmeans.labels_), columns=np.asarray(Y_train))
    .reindex(index=range(kmeans.n_clusters), fill_value=0)
    .rename_axis(index='Cluster', columns='Activity')
)

fig = plt.figure(figsize = (7,6))
axes1 = fig.add_subplot(111)
s = sns.heatmap(label_acc, cmap = "YlGnBu", annot = True, fmt="d", ax = axes1)
# tick_params sets the size on the axis itself, so it holds regardless of when
# the tick labels are created or replaced.
axes1.tick_params(axis='both', labelsize=10)

## **K-Means Clustering Algorithm for 2 Clusters**
The code in this section takes the training dataset (`train.csv`) and runs the K-Means clustering algorithm with 2 clusters. It groups the data points into Static and Dynamic activities and shows the results.

#### Performing K-means for 2 clusters

In [None]:
# K-Means with 2 clusters on the feature columns: random initialisation,
# 20 restarts, at most 300 iterations each, fixed seed.
kmeans = KMeans(
    n_clusters=2,
    init='random',
    n_init=20,
    max_iter=300,
    random_state=0,
)
pred_y = kmeans.fit_predict(df1)

#### Plotting the t-SNE reduced data showing the distribution of static and dynamic activities in the output of the model as well as the expected output

In [None]:
# Collapse the six activities into two groups in a single vectorised step:
# LAYING / STANDING / SITTING -> "STATIC", every other value -> "DYNAMIC".
# The result keeps df's index, so the heatmap cell that follows can still read
# Y_train by position.
static_activities = ['LAYING', 'STANDING', 'SITTING']
Y_train = df.Activity.isin(static_activities).map({True: "STATIC", False: "DYNAMIC"})
label_counts = Y_train.value_counts()

# One colour per group, evenly spaced over viridis (np.linspace returns
# exactly n values and does not divide by n - 1).
n = label_counts.shape[0]
colormap = get_cmap('viridis')
colors = [rgb2hex(colormap(col)) for col in np.linspace(0, 1, n)]
# Cluster colours are sized by the number of clusters, not the number of labels.
cluster_colors = [rgb2hex(colormap(col)) for col in np.linspace(0, 1, kmeans.n_clusters)]
fig = plt.figure(figsize = (40,20))

# Left panel: the expected grouping (true STATIC / DYNAMIC labels).
axes1 = fig.add_subplot(121)
axes1.set_title('Expected Activity Visualisation', fontdict = {'fontsize': 40})
# tick_params sets the tick label size on the axis itself ('fontsize' is the
# lower-case property name; 'Fontsize' is not a valid Text property).
axes1.tick_params(axis='both', labelsize=25)
for i, group in enumerate(label_counts.index):
    mask = (Y_train==group).values
    axes1.scatter(x=tsne_transformed[mask][:,0], y=tsne_transformed[mask][:,1], c=colors[i], alpha=0.5, label=group)
axes1.legend(fontsize=25)

# Right panel: the grouping produced by K-Means. Cluster ids are arbitrary, so
# they are labelled as clusters; which one matches STATIC or DYNAMIC has to be
# read off by comparing the two panels.
axes2 = fig.add_subplot(122)
axes2.set_title('KMeans Cluster Visualisation', fontdict = {'fontsize': 40})
axes2.tick_params(axis='both', labelsize=25)
for cluster_id in range(kmeans.n_clusters):
    mask = (kmeans.labels_==cluster_id)
    axes2.scatter(x=tsne_transformed[mask][:,0], y=tsne_transformed[mask][:,1], c=cluster_colors[cluster_id], alpha=0.5, label='Cluster %d' % cluster_id)
axes2.legend(fontsize=25)

#### Calculating and plotting the heatmap of the output of the K-Means model

In [None]:
# Contingency table of cluster id (rows) against the DYNAMIC / STATIC group
# (columns). Building it with crosstab avoids incrementing cells through
# chained indexing (frame[col][row] += 1), which may write to a temporary copy
# and leave the frame unchanged.
c_d = (
    pd.crosstab(index=np.asarray(kmeans.labels_), columns=np.asarray(Y_train))
    .reindex(index=[0, 1], columns=["DYNAMIC", "STATIC"], fill_value=0)
    .rename_axis(index="Cluster", columns="Group")
)

fig = plt.figure(figsize = (6,5))
axes1 = fig.add_subplot(111)
s = sns.heatmap(c_d, cmap = "YlGnBu", annot = True, fmt="d", ax = axes1)
# tick_params sets the size on the axis itself ('Fontsize' with a capital F is
# not a valid Text property).
axes1.tick_params(axis='both', labelsize=12)

# **Artificial Neural Networks**

In [None]:
# Re-read both splits from disk, unshuffled, for the neural-network section.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

### Checking for null and duplicate values in train and test set

In [None]:
# Per column: True if the training split has at least one missing value.
train.isna().any()

In [None]:
# Per column: True if the test split has at least one missing value.
test.isna().any()

In [None]:
# Number of fully duplicated rows in each split (train first, then test).
print(train.duplicated().sum())
print(test.duplicated().sum())

### Cross tabulation of all activities

In [None]:
# Records per subject (rows) and activity (columns) with row/column totals,
# shaded with the reversed 'autumn' colormap.
(
    pd.crosstab(index=train['subject'], columns=train['Activity'], margins=True)
    .style
    .background_gradient(cmap='autumn_r')
)

### Creating variables for train and test

In [None]:

# Positional split: features are all columns except the last two, the target
# is the last column.
# NOTE(review): presumably the last two columns are 'subject' and 'Activity',
# matching the columns dropped by name in the earlier sections - confirm.
X_train, y_train = train.iloc[:, :-2], train.iloc[:, -1]
X_test, y_test = test.iloc[:, :-2], test.iloc[:, -1]

### Scaling the features

In [None]:

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then rebind X_train / X_test
# to their scaled versions.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

### Encoding the labels (y)

In [None]:

from sklearn.preprocessing import LabelEncoder

# Fit the label -> integer mapping once on the training labels and reuse it
# for the test labels, so both splits are guaranteed to share one encoding.
# transform() raises if the test split contains a label unseen in training.
encoder = LabelEncoder()
y_train_idx = encoder.fit_transform(y_train)
y_test_idx = encoder.transform(y_test)

# One-hot encode by indexing an identity matrix. Both arrays always get
# len(encoder.classes_) columns, even if a class is missing from one split,
# and a fixed float32 dtype; pd.get_dummies would size its output by the
# labels present and its dtype depends on the pandas version.
one_hot = np.eye(len(encoder.classes_), dtype="float32")
y_train = one_hot[y_train_idx]
y_test = one_hot[y_test_idx]

### Calculating explained variance using PCA

In [None]:
from sklearn.decomposition import PCA

# n_components=None keeps all components. The projection is fitted on the
# training features and the same fitted PCA is applied to the test features;
# both names are rebound to the projected data.
pca = PCA(n_components=None)
X_train, X_test = pca.fit_transform(X_train), pca.transform(X_test)

# Fraction of the total variance explained by each component.
explained_variance = pca.explained_variance_ratio_
print(explained_variance)

In [None]:
# Shapes of (features, labels): training split first, then test split.
print(*(arr.shape for arr in (X_train, y_train)))
print(*(arr.shape for arr in (X_test, y_test)))

In [None]:
!pip install tensorflow
!pip install keras 
import keras 

### Callbacks and checkpointing 

In [None]:
# File that receives the weights of the best epoch.
filepath="HAR_weights.hdf5"
from keras.callbacks import ReduceLROnPlateau , ModelCheckpoint

# Both callbacks watch validation accuracy. The model is compiled with
# metrics=['accuracy'] and the history plots read history['val_accuracy'], so
# the monitored key must be 'val_accuracy'; with 'val_acc' the metric is not
# found and neither callback can act on it.
# min_delta is the current name of the argument formerly called epsilon.
# mode='max' states explicitly that a higher value is an improvement.

# Multiply the learning rate by 0.1 when val_accuracy has not improved by at
# least 0.0001 for one epoch.
lr_reduce = ReduceLROnPlateau(monitor='val_accuracy', factor=0.1, min_delta=0.0001, patience=1, mode='max', verbose=1)

# Save to `filepath` only when val_accuracy improves on the best seen so far.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')

In [None]:
#making the necesssary imports 
from sklearn.model_selection import train_test_split
from keras.models import Sequential 
from keras.layers import Dense,Dropout,BatchNormalization
from keras.utils import np_utils
from keras.optimizers import Adam #Can try with other optimizers like RMSprop and others

### Definition of the model - Tweak the layers and perform other hyperparameters tuning for testing

In [None]:
# Fully connected classifier: 64 -> BatchNorm -> 128 -> 64 -> 32 -> 6.
# Hidden layers use ReLU; the 6-unit output layer uses softmax (rather than
# sigmoid). The input width is taken from the training feature matrix.
model = Sequential([
    Dense(units=64, kernel_initializer='uniform', activation='relu', input_dim=X_train.shape[1]),
    # Batch normalisation after the first hidden layer.
    BatchNormalization(),
    Dense(units=128, kernel_initializer='uniform', activation='relu'),
    Dense(units=64, kernel_initializer='uniform', activation='relu'),
    Dense(units=32, kernel_initializer='uniform', activation='relu'),
    Dense(units=6, kernel_initializer='uniform', activation='softmax'),
])

### Compile the model and get the summary

In [None]:
 
# Adam with its default settings; a learning rate can be set explicitly here
# to study its effect on the results.
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
model.summary()

### Fitting the model on the train data and validating on the test data, with 50 epochs and a batch size of 256 (found to have shown the best results)

In [None]:
#try tweaking the batch size and the number of epochs to check for best results 
history = model.fit(X_train, y_train , epochs=50 , batch_size = 256 , validation_data=(X_test, y_test) , callbacks=[checkpoint,lr_reduce])

In [None]:
from pylab import rcParams
rcParams['figure.figsize'] = 10, 4

# One figure per recorded quantity (accuracy first, then loss), each showing
# the training curve and the validation curve over the epochs.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Model outputs for the test features, one row per sample.
y_pred = model.predict(X_test)

# Class index = position of the largest value in each row, for the
# predictions and for the one-hot test labels alike.
y_pred_class = y_pred.argmax(axis=1)
y_test_class = y_test.argmax(axis=1)

In [None]:
# Display the true class index of every test sample.
y_test_class

In [None]:
# Display the predicted class index of every test sample.
y_pred_class