### Importing libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.calibration import LabelEncoder

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline


### Loading data

In [None]:
# Loading raw datasets

data_train = pd.read_csv('data/raw/train.csv')
data_test = pd.read_csv('data/raw/test.csv')

In [None]:
le = LabelEncoder()
X_train = data_train.drop('Activity', axis='columns')
y_train = data_train['Activity']
y_train_encoded = le.fit_transform(y_train)

X_test = data_test.drop('Activity', axis='columns')
y_test = data_test['Activity']
y_test_encoded = le.fit_transform(y_test)

### Setting up Classifiers

In [None]:
clf1 = GaussianNB()

clf2 = DecisionTreeClassifier(
  max_depth=4,
  criterion='entropy',
  random_state=0
)

clf3 = KNeighborsClassifier(
  n_neighbors=1,
  p=2,
  metric='minkowski'
)

pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

clf_labels = ['GaussianNB', 'Decision tree', 'KNN']

### Comparing Classifiers

In [None]:
print('10-fold cross validation:\n')

for clf, label in zip([pipe1, clf2, pipe3], clf_labels):
  scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train_encoded,
    cv=10,
    scoring='roc_auc_ovr'
  )
  
  print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

### Creating Majority Vote Classifier

In [None]:
from MajorityVoteClassifier import MajorityVoteClassifier

mv_clf = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
clf_labels += ['Majority voting']
all_clf = [pipe1, clf2, pipe3, mv_clf]

### Comparing `MajorityVoteClassifier` with the rest

In [None]:

for clf, label in zip(all_clf, clf_labels):
  scores = cross_val_score(
    estimator=clf,
    X=X_train,
    y=y_train_encoded,
    cv=10,
    scoring='roc_auc_ovr'
  )
  
  print("ROC AUC: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))

### Hyperparameter tunning with optuna

In [18]:
mv_clf.get_params()

{'pipeline-1': Pipeline(steps=[['sc', StandardScaler()], ['clf', GaussianNB()]]),
 'decisiontreeclassifier': DecisionTreeClassifier(criterion='entropy', max_depth=4, random_state=0),
 'pipeline-2': Pipeline(steps=[['sc', StandardScaler()],
                 ['clf', KNeighborsClassifier(n_neighbors=1)]]),
 'pipeline-1__memory': None,
 'pipeline-1__steps': [['sc', StandardScaler()], ['clf', GaussianNB()]],
 'pipeline-1__verbose': False,
 'pipeline-1__sc': StandardScaler(),
 'pipeline-1__clf': GaussianNB(),
 'pipeline-1__sc__copy': True,
 'pipeline-1__sc__with_mean': True,
 'pipeline-1__sc__with_std': True,
 'pipeline-1__clf__priors': None,
 'pipeline-1__clf__var_smoothing': 1e-09,
 'decisiontreeclassifier__ccp_alpha': 0.0,
 'decisiontreeclassifier__class_weight': None,
 'decisiontreeclassifier__criterion': 'entropy',
 'decisiontreeclassifier__max_depth': 4,
 'decisiontreeclassifier__max_features': None,
 'decisiontreeclassifier__max_leaf_nodes': None,
 'decisiontreeclassifier__min_impurit

In [47]:
def objective(trial):
  clf1 = GaussianNB(var_smoothing=trial.suggest_float("var_smoothing", 1e-10, 1, log=True),)

  clf2 = DecisionTreeClassifier(
    max_depth=trial.suggest_int("max_depth", 1, 5),
    criterion='entropy',
    random_state=0
  )

  clf3 = KNeighborsClassifier(
    n_neighbors=trial.suggest_int("n_neighbors", 1, 8),
    p=2,
    metric='minkowski'
  )

  pipe1 = Pipeline([['sc', StandardScaler()], ['clf', clf1]])
  pipe3 = Pipeline([['sc', StandardScaler()], ['clf', clf3]])

  model = MajorityVoteClassifier(classifiers=[pipe1, clf2, pipe3])
  
  model.fit(X_train, y_train)

  # Evaluate the model accuracy on the validation set.
  score = model.score(X_test, y_test)
  return score

In [48]:
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=30, timeout=300)

[I 2023-10-10 13:34:36,372] A new study created in memory with name: no-name-85a4c3a2-7e15-47b1-a3d9-b0ab8396ea3e
[I 2023-10-10 13:34:38,384] Trial 0 finished with value: 0.8588394977943672 and parameters: {'var_smoothing': 3.0944099230002186e-05, 'max_depth': 2, 'n_neighbors': 8}. Best is trial 0 with value: 0.8588394977943672.
[I 2023-10-10 13:34:40,901] Trial 1 finished with value: 0.8941296233457754 and parameters: {'var_smoothing': 1.7531580328680327e-07, 'max_depth': 3, 'n_neighbors': 8}. Best is trial 1 with value: 0.8941296233457754.
[I 2023-10-10 13:34:42,927] Trial 2 finished with value: 0.8456057007125891 and parameters: {'var_smoothing': 2.872745686897227e-09, 'max_depth': 2, 'n_neighbors': 1}. Best is trial 1 with value: 0.8941296233457754.
[I 2023-10-10 13:34:45,393] Trial 3 finished with value: 0.8659653885307091 and parameters: {'var_smoothing': 0.9075269026527946, 'max_depth': 3, 'n_neighbors': 7}. Best is trial 1 with value: 0.8941296233457754.
[I 2023-10-10 13:34:47,