
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Training Models </h2>	

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage
from skimage import io
import sklearn
%matplotlib inline

In [None]:
# Fraction of the shuffled rows that train_test_split assigns to the first
# (training) part; the remaining rows form the second part.
TRAIN_RATIO = 0.9

# Read the table stored under the 'feature' key of the HDF5 file.
df = pd.read_hdf('feature.h5', key='feature')

# Shuffle the rows, then cut them into a training part and a test part
def train_test_split(df):
    """
    Shuffle df with a fixed seed and split it according to TRAIN_RATIO.

    Returns a (train, test) tuple: the first int(len(df) * TRAIN_RATIO)
    rows of the shuffled frame, and the rows that follow them.
    """
    shuffled = sklearn.utils.shuffle(df, random_state=29)
    cut = int(len(shuffled) * TRAIN_RATIO)
    head = shuffled[:cut]
    tail = shuffled[cut:]
    return head, tail

# Fraction of predictions that agree with the actual values
def accuracy(pred, actual):
    """
    Return the mean of the element-wise comparison pred == actual.

    The result is a fraction between 0 and 1 (not a percentage).
    """
    hits = pred == actual
    return hits.mean()

def feature_normalize(X):
    """
    Standardize every column of X, in place, to mean 0 and variance 1.

    Each column is shifted by its mean and divided by its population
    standard deviation (ddof=0). A tiny epsilon is added to the divisor so
    that a constant column maps to zeros instead of dividing by zero.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose columns are overwritten with their standardized values.

    Returns
    -------
    None
        X is modified in place.
    """
    for item in X.columns:
        mu = X[item].mean()
        # One scalar spread per column. Taking sqrt((x - mu)**2) element by
        # element yields |x - mu|, which would collapse every value to +/-1.
        sigma = X[item].std(ddof=0)
        X[item] = (X[item] - mu) / (sigma + 1e-10)
        
# Split the loaded feature table into a training part and a validation part
train_set, val_set = train_test_split(df)

def get_X_y(df):
    """
    Separate a table into model inputs and labels.

    Returns (X, y): X is a copy of df without the "Encoding" column and
    y is the "Encoding" column of df.
    """
    y = df["Encoding"]
    X = df.copy().drop("Encoding", axis=1)
    return X, y

# feature_normalize overwrites the columns of the frame it is given, so each
# feature table is normalized right after it is built.
# NOTE(review): the validation features are scaled with their own statistics
# rather than the training set's -- confirm this is intended.
X_train, y_train = get_X_y(train_set)
feature_normalize(X_train)
X_val, y_val = get_X_y(val_set)
feature_normalize(X_val)

<h3>  Train models using each of the methods below. Be sure to drop the actual image column and the encoding column.</h3>	
Take note of the differences in accuracy and in the methods themselves.


In [None]:
def print_model_accuracy(model, train_acc, val_acc):
    """
    Fit model on the training data and report its accuracy on both splits.

    The model is printed, fitted on the notebook-level X_train / y_train,
    and scored on X_train and on X_val. Each score is appended to the
    matching list (train_acc, val_acc) and printed, followed by a blank line.
    """
    print(model)
    fitted = model.fit(X_train, y_train)

    train_score = accuracy(fitted.predict(X_train), y_train)
    train_acc.append(train_score)
    print("Accuracy on train = {}".format(train_score))

    val_score = accuracy(fitted.predict(X_val), y_val)
    val_acc.append(val_score)
    print("Accuracy on val = {}".format(val_score))

    print()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# search for different parameters
train_list = []
val_list = []

# Three families of settings, each tried with C = 0.1, 1 and 10, in the same
# order as before (so bar positions in the plot below are unchanged).
lr_settings = [
    {"solver": "lbfgs"},
    {"solver": "lbfgs", "multi_class": "multinomial"},
    # The L1 penalty needs a solver that supports it. Leaving the solver
    # unspecified selects 'lbfgs' on scikit-learn >= 0.22, which rejects
    # penalty='l1' with a ValueError; 'saga' handles the L1 penalty.
    {"solver": "saga", "penalty": "l1"},
]

for lr_kwargs in lr_settings:
    for c_value in (0.1, 1, 10):
        LRClassifier = LogisticRegression(random_state=42, C=c_value, max_iter=5000, **lr_kwargs)
        print_model_accuracy(LRClassifier, train_list, val_list)

In [None]:
# One (train, validation) pair of bars per configuration, in the order tried.
positions = np.arange(len(train_list))
# The x positions are passed positionally: plt.bar no longer accepts the
# `left` keyword (it was replaced by `x`), while the first positional
# argument works on both old and new Matplotlib releases.
plt.bar(positions, height=train_list, width=0.25, color='skyblue')
plt.bar(positions + 0.4, height=val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])
# Report the configuration with the highest validation accuracy, together
# with the train accuracy of that same configuration.
best_idx = int(np.argmax(val_list))
print("Best train accuracy: {}".format(train_list[best_idx]))
print("Best validation accuracy: {}".format(max(val_list)))

### K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Sweep the number of neighbours (5, 10, ..., 30) and record the train and
# validation accuracy of each setting.
train_list = []
val_list = []

for n_neighbors in range(5, 31, 5):
    KNNClassifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    print_model_accuracy(KNNClassifier, train_list, val_list)

In [None]:
# One (train, validation) pair of bars per configuration, in the order tried.
positions = np.arange(len(train_list))
# The x positions are passed positionally: plt.bar no longer accepts the
# `left` keyword (it was replaced by `x`), while the first positional
# argument works on both old and new Matplotlib releases.
plt.bar(positions, height=train_list, width=0.25, color='skyblue')
plt.bar(positions + 0.4, height=val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])
# Report the configuration with the highest validation accuracy, together
# with the train accuracy of that same configuration.
best_idx = int(np.argmax(val_list))
print("Best train accuracy: {}".format(train_list[best_idx]))
print("Best validation accuracy: {}".format(max(val_list)))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Grid over forest size and tree depth. The outer loop walks the number of
# trees and the inner loop the depth, so results are appended in the order
# (10, 3), (10, 4), ..., (100, 6).
train_list = []
val_list = []

for n_estimators in (10, 50, 100):
    for max_depth in (3, 4, 5, 6):
        RFClassifier = RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42,
        )
        print_model_accuracy(RFClassifier, train_list, val_list)

In [None]:
# One (train, validation) pair of bars per configuration, in the order tried.
positions = np.arange(len(train_list))
# The x positions are passed positionally: plt.bar no longer accepts the
# `left` keyword (it was replaced by `x`), while the first positional
# argument works on both old and new Matplotlib releases.
plt.bar(positions, height=train_list, width=0.25, color='skyblue')
plt.bar(positions + 0.4, height=val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])
# Report the configuration with the highest validation accuracy, together
# with the train accuracy of that same configuration.
best_idx = int(np.argmax(val_list))
print("Best train accuracy: {}".format(train_list[best_idx]))
print("Best validation accuracy: {}".format(max(val_list)))

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Grid over kernel type and regularization strength C. The outer loop walks
# the kernels and the inner loop the C values, so results are appended in the
# order linear (0.1, 1, 10), rbf, poly, sigmoid.
train_list = []
val_list = []

for kernel in ('linear', 'rbf', 'poly', 'sigmoid'):
    for c_value in (0.1, 1, 10):
        SVMClassifier = SVC(C=c_value, kernel=kernel, random_state=42)
        print_model_accuracy(SVMClassifier, train_list, val_list)

In [None]:
# One (train, validation) pair of bars per configuration, in the order tried.
positions = np.arange(len(train_list))
# The x positions are passed positionally: plt.bar no longer accepts the
# `left` keyword (it was replaced by `x`), while the first positional
# argument works on both old and new Matplotlib releases.
plt.bar(positions, height=train_list, width=0.25, color='skyblue')
plt.bar(positions + 0.4, height=val_list, width=0.25, color='wheat')
plt.ylim(0, 0.9)
plt.legend(["train", "validation"])
# Report the configuration with the highest validation accuracy, together
# with the train accuracy of that same configuration.
best_idx = int(np.argmax(val_list))
print("Best train accuracy: {}".format(train_list[best_idx]))
print("Best validation accuracy: {}".format(max(val_list)))

### Apply best model