In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Loading datasets

In [None]:
# Loading datasets

# data_train = pd.read_csv('data/processed/train_meaningfull.csv')
# data_test = pd.read_csv('data/processed/test_meaningfull.csv')

In [None]:
# Read the raw train/test splits from their CSV files (paths are relative to
# the notebook's working directory).
data_train, data_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/raw/train.csv', 'data/raw/test.csv')
)

In [None]:
# Training split: 'Activity' is the target column, every other column is a feature.
y_train = data_train['Activity']
X_train = data_train.drop(columns='Activity')

In [None]:
# Test split: 'Activity' is the target column, every other column is a feature.
y_test = data_test['Activity']
X_test = data_test.drop(columns='Activity')

### Testing different models

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import GridSearchCV

#### Trying Linear Discriminant Analysis (LDA)

In [None]:
# Baseline: LDA with default settings, fitted on the training split and
# scored (mean accuracy) on the test split.
lda = LDA()
pipe = Pipeline(steps=[("LDA", lda)])

# fit() returns the fitted pipeline, so the score is the cell's displayed output.
pipe.fit(X_train, y_train).score(X_test, y_test)

##### Hyperparameter tuning LDA

In [None]:
# Grid-search the LDA solver / shrinkage with 10-fold CV on the training split,
# then report the accuracy of the *tuned* model on the test split.
lda = LDA()
pipe = Pipeline([("LDA", lda)])

# Shrinkage is only searched together with the 'lsqr' solver; the 'svd' solver
# is evaluated on its own.
shrinkage_range = ['auto', 0, 0.2, 0.4, 0.6, 0.8, 1]
param_grid = [
  {'LDA__solver': ['svd']},
  {'LDA__solver': ['lsqr'], 'LDA__shrinkage': shrinkage_range},
]

gs = GridSearchCV(
  estimator=pipe,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# GridSearchCV works on clones, so the original `pipe` is still the untuned
# default. With refit=True the best configuration has already been refitted on
# the full training split; score that model instead of refitting the default.
pipe = gs.best_estimator_
pipe.score(X=X_test, y=y_test)

#### Trying Gaussian Naive Bayes Classifier

In [None]:
# Baseline: Gaussian Naive Bayes with default settings, fitted on the training
# split and scored (mean accuracy) on the test split.
clf = GaussianNB()
pipe = Pipeline(steps=[("GaussianNB", clf)])

# fit() returns the fitted pipeline, so the score is the cell's displayed output.
pipe.fit(X_train, y_train).score(X_test, y_test)

##### Hyperparameter tuning GaussianNB

In [None]:
# Tuning "var_smoothing" parameter of GaussianNB with 10-fold CV on the
# training split, then reporting the accuracy of the *tuned* model on the
# test split.
# No hand-picked var_smoothing here: the grid below supplies every candidate.
clf = GaussianNB()
param_range = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
param_grid = [ {'GaussianNB__var_smoothing': param_range} ]

pipe = Pipeline([("GaussianNB", clf)])

gs = GridSearchCV(
  estimator=pipe,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# GridSearchCV works on clones, so the original `pipe` never receives the best
# var_smoothing. With refit=True the best configuration has already been
# refitted on the full training split; score that model on the test split.
pipe = gs.best_estimator_
pipe.score(X=X_test, y=y_test)

#### Trying Logistic Regression (with built-in cross-validation)

In [None]:
# Logistic regression with its own internal cross-validation (default settings),
# fitted on the training split and scored (mean accuracy) on the test split.
logReg = LogisticRegressionCV()
pipe = Pipeline(steps=[("LogisticRegression", logReg)])

# fit() returns the fitted pipeline, so the score is the cell's displayed output.
pipe.fit(X_train, y_train).score(X_test, y_test)

### Joining LDA transformation with GaussianNB

In [None]:
# LDA is used as a supervised projection step in front of GaussianNB.
# Both steps are placed in a single Pipeline so that:
#   * inside every CV fold, LDA is fitted on that fold's training part only
#     (fitting it on the whole training split first leaks labels into the
#     validation folds and inflates the CV score);
#   * the test split is only ever projected with the LDA fitted on the
#     training split -- it is never refitted on X_test / y_test.
lda_nb_pipe = Pipeline([("LDA", LDA()), ("GaussianNB", GaussianNB())])

param_range = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
param_grid = [ {'GaussianNB__var_smoothing': param_range} ]

gs = GridSearchCV(
  estimator=lda_nb_pipe,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

# Keep the names this cell has always defined, now taken from the tuned
# pipeline that refit=True fitted on the full training split.
lda = gs.best_estimator_.named_steps["LDA"]
clf = gs.best_estimator_.named_steps["GaussianNB"]

# transform() only: the test split must reuse the projection learned on train.
X_transformed_train = lda.transform(X_train)
X_transformed_test = lda.transform(X_test)

print(clf.score(X=X_transformed_test, y=y_test))

# NOTE: an earlier version of this cell reported
# 0.9854469831410826 (CV) >> 0.7852052935188327 (test) and read it as overfitting.
# That version refitted LDA on the test split (using y_test), so the test
# features were in a different projection than the one GaussianNB was trained
# on, and the CV score was computed on features from an LDA that had already
# seen the validation labels. The gap was therefore likely, at least in part,
# an artifact of the evaluation -- re-run this cell for up-to-date numbers.