In [13]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Loading data

In [14]:
# Loading datasets

# data_train = pd.read_csv('data/processed/train_meaningfull.csv')
# data_test = pd.read_csv('data/processed/test_meaningfull.csv')

In [15]:
# Loading raw datasets

data_train = pd.read_csv('data/raw/train.csv')
data_test = pd.read_csv('data/raw/test.csv')

In [16]:
X_train = data_train.drop('Activity', axis='columns')
y_train = data_train['Activity']

In [17]:
X_test = data_test.drop('Activity', axis='columns')
y_test = data_test['Activity']

### Testing different models

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.naive_bayes import GaussianNB

from sklearn.linear_model import LogisticRegressionCV

from sklearn.model_selection import GridSearchCV

#### Trying Linear Discriminant Analysis (LDA)

In [19]:
lda = LDA()

pipe = Pipeline([("LDA", lda)])

pipe.fit(X=X_train, y=y_train)
pipe.score(X=X_test, y=y_test)

0.9643705463182898

##### Hyperparameter tuning LDA

In [20]:
lda = LDA()

pipe = Pipeline([("LDA", lda)])

shrinkage_range = ['auto', 0, 0.2, 0.4, 0.6, 0.8, 1]
param_grid = [ 
  {'LDA__solver': ['svd']},
  {'LDA__solver': ['lsqr'], 'LDA__shrinkage': shrinkage_range},
]

pipe = Pipeline([("LDA", lda)])

gs = GridSearchCV(
  estimator=pipe,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

pipe.fit(X=X_train, y=y_train)
pipe.score(X=X_test, y=y_test)

0.9575709849157054
{'LDA__solver': 'svd'}


0.9643705463182898

#### Trying Gaussian Naive Bayes Classifier

In [21]:
clf = GaussianNB()
pipe = Pipeline([("GaussianNB", clf)])
pipe.fit(X=X_train, y=y_train)
pipe.score(X=X_test, y=y_test)

0.7702748557855447

##### Hyperparameter tuning GaussianNB

In [22]:
# Tuning "var_smoothing" parameter of GaussianNB
clf = GaussianNB(var_smoothing=0.00001)
param_range = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
param_grid = [ {'GaussianNB__var_smoothing': param_range} ]

pipe = Pipeline([("GaussianNB", clf)])

gs = GridSearchCV(
  estimator=pipe,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

pipe.fit(X=X_train, y=y_train)
pipe.score(X=X_test, y=y_test)

0.8185485063590654
{'GaussianNB__var_smoothing': 0.001}


0.8201560909399389

#### Trying Logistic Regression (already with cross-validation)

In [23]:
logReg = LogisticRegressionCV()
pipe = Pipeline([("LogisticRegression", logReg)])
pipe.fit(X=X_train, y=y_train)
pipe.score(X=X_test, y=y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

0.9602986087546658

### Joining LDA transformation with GaussianNB

In [25]:
lda = LDA()

X_transformed_train = lda.fit_transform(X=X_train, y=y_train)
X_transformed_test = lda.fit_transform(X=X_test, y=y_test)
  
clf = GaussianNB()

param_range = [1e-9, 1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4, 1e5]
param_grid = [ {'var_smoothing': param_range} ]

gs = GridSearchCV(
  estimator=clf,
  param_grid=param_grid,
  scoring='accuracy',
  cv=10,
  refit=True,
  n_jobs=-1
)
gs = gs.fit(X_train, y_train)

print(gs.best_score_)
print(gs.best_params_)

clf.fit(X=X_transformed_train, y=y_train)
print(clf.score(X=X_test, y=y_test))

# 0.9854469831410826 >> 0.7852052935188327
# => overfitting

0.8185485063590654
{'var_smoothing': 0.001}




ValueError: X has 562 features, but GaussianNB is expecting 5 features as input.