In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, LeaveOneOut, GridSearchCV
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

# Dataset setup

## Load the dataset from .csv file

In [2]:
# Location of the CSV, relative to the notebook's working directory.
input_file = "3_classes/data.csv"
# Row 0 of the file supplies the column names.
df = pd.read_csv(filepath_or_buffer=input_file, header=0)
# Summary statistics of the numeric columns, shown as the cell output.
df.describe()

FileNotFoundError: [Errno 2] File b'3_classes_no_outlier/data.csv' does not exist: b'3_classes_no_outlier/data.csv'

## Split into input (X) and output (y) variables

In [None]:
# Raw NumPy view of the whole table; columns are addressed by position below.
dataset = df.values
# Columns 1 through 6 are used as the model inputs.
X = dataset[:, 1:7]
# Column 7 is used as the target and cast to integer class labels.
y = dataset[:, 7].astype('int')

# Analyze dataset

## Check if data is skewed
What should my threshold be? I chose to combine Unstable and Maybe

In [None]:
# Bar chart of how many rows fall in each "Category" value.
sns.catplot(x="Category", kind="count", data=df)
# value_counts() returns counts ordered from most to least frequent.
# NOTE(review): the labels printed below assume that order is Stable, Maybe,
# Unstable - confirm against the data.
class_counts = df["Result"].value_counts()
count = class_counts.tolist()
n_samples = len(X)
for position, label in enumerate(("Stable", "Maybe", "Unstable")):
    print(f"{label}: {count[position] / n_samples * 100} %")

## Finding patterns graphically

### Pair plots and histograms of stable vs unstable
What to do with the outlier?

In [None]:
# Plot every column except 'Result' against every other, coloured by 'Category'.
plot_columns = df.columns.difference(['Result'])
sns.pairplot(df[plot_columns], hue='Category')

# Machine Learning

## Prepare data for models

### Standardize the data for ease of use and increased performance
Should I be scaling only the training set, and will it affect my model?

In [None]:
# Learn per-feature mean and standard deviation from the full feature matrix.
# NOTE(review): the scaler is fitted on every row, including rows that the
# later train/test split puts in the test set - confirm this is intended.
scale = StandardScaler()
scale.fit(X)
# Standardized copy of X used by the cells that follow.
X_std = scale.transform(X)

### Split into train and test set

In [None]:
# Hold out 10% of the standardized samples as a test set.
# A fixed random_state makes the split, and every score computed from it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X_std, y, train_size=.9, random_state=42
)

### Leave one out cross validation

In [None]:
# Leave-one-out splitter: each fold holds out exactly one training sample.
loo = LeaveOneOut()
# Number of folds the splitter will produce for the training set (cell output).
loo.get_n_splits(X=X_train)

## PCA

### 2D

In [None]:
# Project the standardized features onto their first two principal components.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X=X_std)

In [None]:
# Split the projected points by class label:
# 2 -> stable, 1 -> maybe, any other label -> unstable.
# Boolean masks replace the row-by-row np.append plus reshape, so the result
# no longer depends on the ordering of the global `count` list.
# Row 0 of each array stays a zero placeholder because the plotting cell
# slices with [1:] and would otherwise drop a real point.
placeholder = np.zeros((1, X_pca.shape[1]))
stable = np.vstack([placeholder, X_pca[y == 2]])
maybe = np.vstack([placeholder, X_pca[y == 1]])
unstable = np.vstack([placeholder, X_pca[(y != 2) & (y != 1)]])

In [None]:
fig, ax = plt.subplots()
# Row 0 of each class array is a placeholder, so plotting starts at row 1.
for points, style in (
    (stable, {"alpha": 0.5}),
    (maybe, {"marker": "^"}),
    (unstable, {"marker": "x"}),
):
    ax.scatter(points[1:, 0], points[1:, 1], **style)
ax.set_title("PCA")
# Legend entries follow the order in which the scatters were drawn.
fig.legend(["Stable", "Maybe", "Unstable"])

### 3D

In [None]:
# Switch matplotlib to the interactive "notebook" backend for the cells below
# (presumably so the 3D scatter can be rotated - confirm the backend is
# available in the Jupyter front end being used).
%matplotlib notebook

In [None]:
# Project onto the first three principal components.
# Uses the standardized features, matching the 2D projection: PCA is
# scale-sensitive, so fitting on raw X would let the largest-valued feature
# dominate the components.
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_std)

In [None]:
# Split the projected points by class label:
# 2 -> stable, 1 -> maybe, any other label -> unstable.
# Boolean masks replace the row-by-row np.append plus reshape, so the result
# no longer depends on the ordering of the global `count` list.
# Row 0 of each array stays a zero placeholder because the plotting cell
# slices with [1:] and would otherwise drop a real point.
placeholder = np.zeros((1, X_pca.shape[1]))
stable = np.vstack([placeholder, X_pca[y == 2]])
maybe = np.vstack([placeholder, X_pca[y == 1]])
unstable = np.vstack([placeholder, X_pca[(y != 2) & (y != 1)]])

In [None]:
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')
# Row 0 of each class array is a placeholder, so plotting starts at row 1.
for points, style in (
    (stable, {"alpha": 0.5}),
    (maybe, {"marker": "^"}),
    (unstable, {"marker": "x"}),
):
    ax.scatter(points[1:, 0], points[1:, 1], points[1:, 2], **style)
ax.set_title("PCA")

## Linear

In [None]:
# Candidate solvers and inverse-regularization strengths to try.
parameters = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear'],
    'C': [1, 100, 1000],
}
log = LogisticRegression(multi_class='auto', max_iter=1000)
# Every parameter combination is scored with leave-one-out cross validation.
clf = GridSearchCV(estimator=log, param_grid=parameters, cv=loo)
clf.fit(X_train, y_train)
# Best combination, then its mean cross-validated score.
print(clf.best_params_, clf.best_score_, sep="\n")

## SVC

In [None]:
# Candidate kernels and regularization strengths to try.
parameters = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [1, 100, 1000],
}
svc = SVC(gamma="scale")
# Every parameter combination is scored with leave-one-out cross validation.
clf = GridSearchCV(estimator=svc, param_grid=parameters, cv=loo)
clf.fit(X_train, y_train)
# Best combination, then its mean cross-validated score.
print(clf.best_params_, clf.best_score_, sep="\n")

## KNN

In [None]:
# Candidate neighbourhood sizes and Minkowski powers (1 = Manhattan, 2 = Euclidean).
parameters = {
    'n_neighbors': [2, 3, 4, 5, 6],
    'p': [1, 2],
}
knn = KNeighborsClassifier()
# Every parameter combination is scored with leave-one-out cross validation.
clf = GridSearchCV(estimator=knn, param_grid=parameters, cv=loo)
clf.fit(X_train, y_train)
# Best combination, then its mean cross-validated score.
print(clf.best_params_, clf.best_score_, sep="\n")

## Decision Tree

In [None]:
# Leave-one-out accuracy of a decision tree on the training set.
# Fold indices come from loo.split(X_train), so they must index X_train and
# y_train, not the raw unsplit X / y. Fold-local names keep the global
# X_train / y_train intact for the cells that follow.
score = 0
n_splits = loo.get_n_splits(X_train)
for train_index, cross_index in loo.split(X_train):
    X_fold, X_cross = X_train[train_index], X_train[cross_index]
    y_fold, y_cross = y_train[train_index], y_train[cross_index]
    tree = DecisionTreeClassifier()
    tree.fit(X_fold, y_fold)
    # Each fold holds out one sample, so this adds 0.0 or 1.0.
    score += tree.score(X_cross, y_cross)
# Average over the actual number of folds instead of a hard-coded count.
print(f"Score: {score / n_splits}")


## Ensemble Methods

### Random Forest

In [None]:
# Leave-one-out accuracy of a 500-tree random forest on the training set.
# Fold indices come from loo.split(X_train), so they must index X_train and
# y_train, not the raw unsplit X / y. Fold-local names keep the global
# X_train / y_train intact for the cells that follow.
score = 0
n_splits = loo.get_n_splits(X_train)
for train_index, cross_index in loo.split(X_train):
    X_fold, X_cross = X_train[train_index], X_train[cross_index]
    y_fold, y_cross = y_train[train_index], y_train[cross_index]
    forest = RandomForestClassifier(n_estimators=500)
    forest.fit(X_fold, y_fold)
    # Each fold holds out one sample, so this adds 0.0 or 1.0.
    score += forest.score(X_cross, y_cross)
# Average over the actual number of folds instead of a hard-coded count.
print(f"Score: {score / n_splits}")

### Ada Boost

In [None]:
# Leave-one-out accuracy of a 100-estimator AdaBoost model on the training set.
# Fold indices come from loo.split(X_train), so they must index X_train and
# y_train, not the raw unsplit X / y. Fold-local names keep the global
# X_train / y_train intact for any later cell.
score = 0
n_splits = loo.get_n_splits(X_train)
for train_index, cross_index in loo.split(X_train):
    X_fold, X_cross = X_train[train_index], X_train[cross_index]
    y_fold, y_cross = y_train[train_index], y_train[cross_index]
    ada = AdaBoostClassifier(n_estimators=100)
    ada.fit(X_fold, y_fold)
    # Each fold holds out one sample, so this adds 0.0 or 1.0.
    score += ada.score(X_cross, y_cross)
# Average over the actual number of folds instead of a hard-coded count.
print(f"Score: {score / n_splits}")
