In [1]:
# import all csv files, combined them and shuffle
import os
import pandas as pd

current_dir = os.getcwd()
data_path = current_dir + "/Studio_3_data"
li = []
csv = os.listdir(data_path)
for i in range(len(csv)):
    file_path = os.path.join(data_path, csv[i])
    df = pd.read_csv(file_path, index_col=None, header=0)
    li.append(df)

combined_data_csv = pd.concat(li, axis=0, ignore_index=True)
all_data = combined_data_csv.sample(frac = 1)

In [19]:
# split the data, train and test a svm model
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

y = all_data['class']
X = all_data.drop(columns=['class'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
clf = svm.SVC()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score_1 = accuracy_score(y_test,y_pred)

In [3]:
# use 10 fold cross validation
from sklearn.model_selection import cross_val_score

clf = svm.SVC()
cross_val_score_1 = cross_val_score(clf, X, y, cv=10)
cross_val_score_1

array([0.8822012 , 0.88993981, 0.90455718, 0.89509888, 0.88048151,
       0.88564058, 0.89251935, 0.89767842, 0.89079966, 0.90017212])

In [4]:
# find the best hyperparameters for this model

from sklearn.model_selection import GridSearchCV 
param_grid = {'C': [0.1, 1, 10, 100, 1000],  
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
              'kernel': ['rbf']}  
  
grid = GridSearchCV(svm.SVC(), param_grid, refit = True, verbose = 3, n_jobs=-1) 
grid.fit(X_train, y_train) 

Fitting 5 folds for each of 25 candidates, totalling 125 fits


In [5]:
# print best params and best model
print(grid.best_params_)  
print(grid.best_estimator_) 
# best params is {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}

{'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
SVC(C=10, gamma=0.0001)


In [None]:
#svc_3 = grid.best_estimator_
svc_3 = svm.SVC(gamma = 0.0001, kernel= 'rbf', C=10)
cross_val_score_2 = cross_val_score(svc_3, X, y, cv=5, n_jobs=-1)
cross_val_score_2

array([0.84393809, 0.84350817, 0.83877902, 0.84350817, 0.84129032])

In [None]:
svc_3.fit(X_train, y_train)
y_pred = svc_3.predict(X_test)
accuracy_score_2 = accuracy_score(y_test,y_pred)
accuracy_score_2

0.8495270851246776

In [28]:
# Feature Selection
from sklearn.feature_selection import SelectKBest, f_classif

feature_selector = SelectKBest(score_func=f_classif, k=100)
X_train_selected = feature_selector.fit_transform(X_train, y_train)
X_test_selected = feature_selector.transform(X_test)

In [30]:
svc_4 = svm.SVC(gamma = 0.0001, kernel= 'rbf', C=10)
svc_4.fit(X_train_selected, y_train)
y_pred = svc_4.predict(X_test_selected)
accuracy_score_3 = accuracy_score(y_test,y_pred)
accuracy_score_3

0.8595586127830324

In [31]:
cross_val_score_3 = cross_val_score(svc_4, X_train_selected, y_train, cv=5, n_jobs=-1)
cross_val_score_3

array([0.8495086 , 0.8482801 , 0.84459459, 0.84889435, 0.8507371 ])

In [32]:
# Dimensionality Reduction

from sklearn.decomposition import PCA

pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [33]:
X_train_pca.shape

(8140, 10)

In [34]:
svc_5 = svm.SVC(gamma = 0.0001, kernel= 'rbf', C=10)
svc_5.fit(X_train_pca, y_train)

y_pred = svc_5.predict(X_test_pca)
accuracy_score_4 = accuracy_score(y_test,y_pred)
accuracy_score_4

0.8509601605044426

In [35]:
cross_val_score_4 = cross_val_score(svc_5, X_train_pca, y_train, cv=5, n_jobs=-1)
cross_val_score_4

array([0.83722359, 0.83660934, 0.83783784, 0.84090909, 0.84029484])

In [37]:
# activity 7

from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

sgd = SGDClassifier( random_state=42)
sgd.fit(X_train, y_train)
sgd_test_acc = sgd.score(X_test, y_test)
sgd_cv_acc = cross_val_score(sgd, X_train, y_train, cv=10, n_jobs=-1).mean()

print(f"SGDClassifier - Test Accuracy: {sgd_test_acc:.4f}, Cross-Validation Accuracy: {sgd_cv_acc:.4f}")

SGDClassifier - Test Accuracy: 0.8604, Cross-Validation Accuracy: 0.8705


In [39]:
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
rf_test_acc = rf.score(X_test, y_test)
rf_cv_acc = cross_val_score(rf, X_train, y_train, cv=10, n_jobs=-1).mean()

print(f"RandomForestClassifier - Test Accuracy: {rf_test_acc:.4f}, Cross-Validation Accuracy: {rf_cv_acc:.4f}")

RandomForestClassifier - Test Accuracy: 0.9321, Cross-Validation Accuracy: 0.9198


In [40]:
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)
mlp_test_acc = mlp.score(X_test, y_test)
mlp_cv_acc = cross_val_score(mlp, X_train, y_train, cv=10, n_jobs=-1).mean()

print(f"MLPClassifier - Test Accuracy: {mlp_test_acc:.4f}, Cross-Validation Accuracy: {mlp_cv_acc:.4f}")

MLPClassifier - Test Accuracy: 0.8601, Cross-Validation Accuracy: 0.8496
