# Import Packages and Dataset

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score


In [50]:
# Load the ground-truth sleep-scoring table and preview its first five rows
data = pd.read_csv("sleep_scoring_ground_true.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,sleep_labels,motion,delta,theta,alpha,beta,gamma,lowgamma,highgamma,fastoscillations,sigma,thetaratio,AnimalName,Geno,Date
0,0,2.0,,0.321147,0.314446,0.186082,0.132685,0.034842,0.010926,0.002769,0.005548,0.214435,0.48529,CAF22,TE4,2020-04-20_12-07-45
1,1,2.0,,0.450377,0.238804,0.130794,0.090249,0.063676,0.016838,0.007529,0.012827,0.208465,0.317277,CAF22,TE4,2020-04-20_12-07-45
2,2,2.0,,0.326391,0.28719,0.1494,0.150537,0.058311,0.023801,0.008066,0.011828,0.201474,0.416426,CAF22,TE4,2020-04-20_12-07-45
3,3,2.0,,0.476368,0.257943,0.109771,0.069822,0.057446,0.021455,0.005776,0.012171,0.139257,0.439963,CAF22,TE4,2020-04-20_12-07-45
4,4,2.0,,0.151375,0.496002,0.190633,0.070405,0.062607,0.024616,0.005654,0.01389,0.189229,0.719037,CAF22,TE4,2020-04-20_12-07-45


# Data Cleaning and Preprocessing

## Remove missing data

In [51]:
# Drop columns that are not used as model inputs: the recording date and
# 'Unnamed: 0' (presumably a row index saved into the CSV -- verify).
data = data.drop(columns=['Date', 'Unnamed: 0'])

In [52]:
# Summarise missing values per column (nothing is dropped in this cell):
# absolute count and percentage of all rows, sorted from most to least missing.
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
null


Unnamed: 0,Null Values,Percentage Null Values
motion,51336,6.061637
delta,5054,0.596765
theta,5054,0.596765
alpha,5054,0.596765
beta,5054,0.596765
gamma,5054,0.596765
lowgamma,5054,0.596765
highgamma,5054,0.596765
fastoscillations,5054,0.596765
sigma,5054,0.596765


In [53]:
# Keep only the rows in which every column has a value
data = data.dropna(axis=0, how='any')

In [54]:
# Re-compute the missing-value summary to confirm that no nulls remain
null_counts = data.isna().sum().sort_values(ascending=False)
null = pd.DataFrame({
    'Null Values': null_counts,
    'Percentage Null Values': null_counts / data.shape[0] * 100,
})
null

Unnamed: 0,Null Values,Percentage Null Values
sleep_labels,0,0.0
motion,0,0.0
delta,0,0.0
theta,0,0.0
alpha,0,0.0
beta,0,0.0
gamma,0,0.0
lowgamma,0,0.0
highgamma,0,0.0
fastoscillations,0,0.0


## Remove Duplicate Data

In [55]:
# Count fully duplicated rows. This cell only reports the count; it does not
# remove anything from `data`.
duplicate = data.duplicated()
n_duplicates = duplicate.sum()
print(n_duplicates)

0


## One-Hot Encoding

In [56]:
# One-hot encode the categorical columns.
# The shape is recorded BEFORE encoding so the (original_shape, encoded_shape)
# pair displayed below actually shows how many columns the encoding added;
# reading it after `get_dummies` would make both shapes identical.
original_shape = data.shape
data = pd.get_dummies(data, columns=['AnimalName', 'Geno'])
encoded_data_head = data.head()
encoded_shape = data.shape
encoded_data_head, original_shape, encoded_shape

(      sleep_labels    motion     delta     theta     alpha      beta  \
 6255           1.0  0.197174  0.132964  0.317876  0.195525  0.106984   
 6256           1.0  0.197174  0.203083  0.320782  0.081938  0.169842   
 6257           1.0  0.197174  0.178307  0.419322  0.135473  0.111788   
 6258           1.0  0.297513  0.106156  0.423963  0.152961  0.177532   
 6259           1.0  0.297513  0.179107  0.181379  0.150880  0.250386   
 
          gamma  lowgamma  highgamma  fastoscillations  ...  AnimalName_CAF88  \
 6255  0.210907  0.084864   0.020432          0.023387  ...             False   
 6256  0.178196  0.058919   0.020673          0.024679  ...             False   
 6257  0.111710  0.037711   0.010607          0.019913  ...             False   
 6258  0.121641  0.046641   0.012802          0.018047  ...             False   
 6259  0.185226  0.064529   0.024476          0.035221  ...             False   
 
       AnimalName_CAF89  AnimalName_CAF95  AnimalName_KDR14  AnimalName_

## Remove Outliers

In [None]:
# Remove outliers using Isolation Forest
from sklearn.ensemble import IsolationForest

# NOTE(review): the detector is fitted on every column of `data`, which at this
# point still includes `sleep_labels` and the one-hot columns -- confirm that
# the target is meant to influence which rows count as outliers.
clf = IsolationForest(random_state=0)
clf.fit(data)

# predict() marks outliers with -1
outliers_pred = clf.predict(data)
is_outlier = outliers_pred == -1
outliers = data[is_outlier]

# Boolean mask of the rows that are kept
non_outlier_indices = ~is_outlier
data_if = data[non_outlier_indices]
outliers_removed = len(data) - len(data_if)

# Display how many rows were dropped together with a preview of what is left
data_if_preview = data_if.head()
outliers_removed, data_if_preview


## Apply Standardization

In [58]:
# Standardization function
def Standardization(dataframe):
    """Return a z-scored copy of ``dataframe``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns are numeric or boolean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy with the same index and columns. Every column that has at least
        two distinct values is transformed as ``(x - mean) / std``, where
        ``std`` is the pandas sample standard deviation. A column with at most
        one distinct value has no spread to scale by; it is returned as all
        zeros (missing entries stay NaN) instead of the NaN column that a
        division by a zero standard deviation would produce.
    """
    df_std = dataframe.copy()
    for col in df_std.columns:
        column = df_std[col]
        centered = column - column.mean()
        if column.nunique() > 1:
            df_std[col] = centered / column.std()
        else:
            # Constant column (e.g. a one-hot indicator that is the same for
            # every remaining row): dividing by its zero std would yield NaN,
            # which downstream estimators cannot handle.
            df_std[col] = centered * 0.0
    return df_std
# Separate the target column from the predictors, then standardize the predictors.
# NOTE(review): the scaling statistics are computed on all rows of `data_if`,
# i.e. before the train/test split further down -- confirm this is acceptable.
label = data_if['sleep_labels']
features = data_if.drop(columns='sleep_labels')
standardized_features = Standardization(features)

# Model Training

## Split Data

In [60]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
train_X, test_X, train_Y, test_Y = train_test_split(
    standardized_features,
    label,
    test_size=0.2,
    random_state=543,
)

## Hyperparameter Tuning

In [None]:
# Exhaustive 5-fold cross-validated grid search over the SVC hyperparameters
kernel = ['linear', 'poly', 'rbf', 'sigmoid']
gamma_range = ['scale', 'auto']
C_range = np.linspace(1, 10, 4)  # four evenly spaced values: 1, 4, 7, 10
# NOTE(review): per the scikit-learn docs `gamma` only applies to the 'rbf',
# 'poly' and 'sigmoid' kernels, so the 'linear' candidates are fitted once per
# gamma value with identical results.
param_grid = {
    'gamma': gamma_range,
    'C': C_range,
    'kernel': kernel,
    'max_iter': [15000],  # single value: the same iteration cap for every candidate
}
svc_estimator = svm.SVC(cache_size=5000)
grid = GridSearchCV(svc_estimator, param_grid=param_grid, cv=5)
grid.fit(train_X, train_Y)

# Results

In [None]:
# Per-class precision / recall / F1 of the best estimator on the held-out test set
test_predictions = grid.best_estimator_.predict(test_X)
report = classification_report(test_Y, test_predictions, digits=4)
print(report)

In [None]:
# Cross-validation score and hyperparameters of the winning candidate
best_cv_score = grid.best_score_
best_cv_params = grid.best_params_
print(best_cv_score)
print(best_cv_params)

# Score of the refitted best estimator on the held-out test set
holdout_score = grid.best_estimator_.score(test_X, test_Y)
print(holdout_score)

In [None]:
# Confusion matrix of the best estimator on the held-out test set.
best_model = grid.best_estimator_
predictions = best_model.predict(test_X)

# Fix the class order explicitly instead of hard-coding tick labels: the union
# of the classes the model was trained on and the classes present in the test
# labels. This keeps the matrix rows/columns and the tick labels in agreement
# even when a class is missing from the test fold or the label set changes.
class_labels = np.union1d(best_model.classes_, np.asarray(test_Y))
cm = confusion_matrix(test_Y, predictions, labels=class_labels)

# Render integer-valued labels without a trailing ".0" (e.g. 1.0 -> '1')
tick_labels = [
    str(int(c))
    if isinstance(c, (int, float, np.integer, np.floating)) and float(c).is_integer()
    else str(c)
    for c in class_labels
]

f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', ax=ax,
            xticklabels=tick_labels, yticklabels=tick_labels)
ax.set_title('Confusion Matrix')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()