
<h1> DS200A Computer Vision Assignment</h1>

<h2>  Part Three: Training Models </h2>	

In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import skimage
import sklearn
from sklearn import preprocessing
%matplotlib inline

In [None]:
# Fraction of the rows that goes to the training split; the remainder is held out.
TRAIN_RATIO = 0.9

# Read the table stored under the 'feature' key of the HDF5 file.
df = pd.read_hdf('feature.h5', key='feature')

# Shuffle the rows, then cut them into a leading and a trailing part
def train_test_split(df):
    """
    Shuffle ``df`` with a fixed seed and split it into two consecutive parts.

    The first ``int(len(df) * TRAIN_RATIO)`` shuffled rows form the first
    returned frame; the remaining rows form the second one.
    """
    shuffled = sklearn.utils.shuffle(df, random_state=29)
    cut = int(len(shuffled) * TRAIN_RATIO)
    head, tail = shuffled[:cut], shuffled[cut:]
    return head, tail

# Fraction of predictions that equal the corresponding actual value
def accuracy(pred, actual):
    """
    Return the mean of the element-wise comparison ``pred == actual``.
    """
    hits = pred == actual
    return hits.mean()

def get_X_y(df):
    """
    Split a DataFrame into feature data X and label vector y.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with an "Encoding" column, which is used as the label. Every
        other column is treated as a feature.

    Returns
    -------
    X : numpy.ndarray
        Values of all columns except "Encoding".
    y : numpy.ndarray
        Values of the "Encoding" column.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values gives the same
    # array. drop() returns a new frame, so the caller's df is left untouched
    # and no explicit copy is needed.
    X = df.drop("Encoding", axis=1).values
    y = df["Encoding"].values
    return X, y

def feature_normalize(X):
    """
    Min-max scale the features in X and return the scaled result.

    A new MinMaxScaler (default settings, i.e. target range [0, 1]) is fitted
    on X itself on every call, so the scaling bounds always come from the data
    that is passed in.
    """
    return preprocessing.MinMaxScaler().fit_transform(X)

# load data
train_set, val_set = train_test_split(df)

X_train, y_train = get_X_y(train_set)
X_val, y_val = get_X_y(val_set)

# Learn the min/max scaling from the training features only and apply that
# same transform to the validation features. Fitting a second scaler on the
# validation split would map identical raw values to different scaled values
# in the two sets.
feature_scaler = preprocessing.MinMaxScaler()
X_train = feature_scaler.fit_transform(X_train)
X_val = feature_scaler.transform(X_val)

<h3>  Train models using all of the methods below. Be sure to drop the actual image column and the encoding.</h3>
Take note of the differences in accuracy and in the methods.


In [None]:
# helper: fit a model, then print and record its train / validation accuracy
def print_model_accuracy(model, train_acc, val_acc):
    """
    Fit ``model`` on the global training data and report its accuracy.

    The model is printed, fitted on (X_train, y_train) and scored on both
    (X_train, y_train) and (X_val, y_val). Each score is printed and appended
    to ``train_acc`` / ``val_acc`` respectively; nothing is returned.
    """
    # show the model's parameters
    print(model)

    fitted = model.fit(X_train, y_train)

    # training-set score
    train_score = accuracy(fitted.predict(X_train), y_train)
    train_acc.append(train_score)
    print("Accuracy on train = {}".format(train_score))

    # validation-set score
    val_score = accuracy(fitted.predict(X_val), y_val)
    val_acc.append(val_score)
    print("Accuracy on val = {}".format(val_score))
    print()

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# search for different parameters
train_list = []
val_list = []

# One dict of keyword arguments per candidate model. The order matters: the
# results are appended to train_list / val_list in this order.
# For the L1 models solver='liblinear' is given explicitly: it supports the
# L1 penalty, whereas the current default solver ('lbfgs') rejects
# penalty='l1'.
LR_PARAM_GRID = (
    [dict(solver='lbfgs', C=c) for c in (0.1, 1, 10)]
    + [dict(solver='lbfgs', multi_class='multinomial', C=c) for c in (0.1, 1, 10)]
    + [dict(penalty='l1', solver='liblinear', C=c) for c in (0.1, 1, 10)]
)

for lr_params in LR_PARAM_GRID:
    LRClassifier = LogisticRegression(random_state=42, max_iter=5000, **lr_params)
    print_model_accuracy(LRClassifier, train_list, val_list)

In [None]:
plt.figure(figsize=(6, 6))

# Bar positions and heights are passed positionally: the old `left` keyword
# was renamed to `x` and is no longer accepted by current matplotlib.
bar_positions = np.arange(len(train_list))
plt.bar(bar_positions, train_list, width=0.25, color='skyblue')
plt.bar(bar_positions + 0.4, val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])

# One tick label per model, in the order the models were evaluated.
plt.xticks(range(len(train_list)), ["'lbfgs', C=0.1", "'lbfgs', C=1", "'lbfgs', C=10",
                                    "'lbfgs', 'multinomial', C=0.1",
                                    "'lbfgs', 'multinomial', C=1",
                                    "'lbfgs', 'multinomial', C=10",
                                    "penalty='l1', C=0.1", "penalty='l1', C=1", "penalty='l1', C=10"], rotation='vertical')
plt.xlabel("model with different hyperparameter")
plt.ylabel("accuracy")
plt.title("Performance of Logistic Regression")
plt.tight_layout()
plt.savefig("lg.jpg", dpi=300)

# Both numbers refer to the model with the highest validation accuracy.
print("Best train accuracy: {}".format(train_list[np.argmax(val_list)]))
print("Best validation accuracy: {}".format(max(val_list)))

### K-nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# search for different parameters
train_list = []
val_list = []

# Evaluate k = 5, 10, ..., 30; results are appended in this order.
for n_neighbors in range(5, 31, 5):
    KNNClassifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    print_model_accuracy(KNNClassifier, train_list, val_list)

In [None]:
plt.figure(figsize=(6, 6))

# Bar positions and heights are passed positionally: the old `left` keyword
# was renamed to `x` and is no longer accepted by current matplotlib.
bar_positions = np.arange(len(train_list))
plt.bar(bar_positions, train_list, width=0.25, color='skyblue')
plt.bar(bar_positions + 0.4, val_list, width=0.25, color='wheat')
plt.ylim(0, 0.6)
plt.legend(["train", "validation"])

# Tick labels are the evaluated k values (5, 10, 15, 20, 25, 30); the fifth
# model uses n_neighbors=25, so its label is "25".
plt.xticks(range(len(train_list)), ["5", "10", "15", "20", "25", "30"])
plt.xlabel("model with different hyperparameter $k$")
plt.ylabel("accuracy")
plt.title("Performance of K-nearest Neighbors")
plt.tight_layout()
plt.savefig("knn.jpg", dpi=300)

# Both numbers refer to the model with the highest validation accuracy.
print("Best train accuracy: {}".format(train_list[np.argmax(val_list)]))
print("Best validation accuracy: {}".format(max(val_list)))

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# search for different parameters
train_list = []
val_list = []

# Number of trees varies in the outer loop and tree depth in the inner loop,
# so results are appended as (10, 3), (10, 4), ..., (100, 6).
for n_trees in (10, 50, 100):
    for depth in (3, 4, 5, 6):
        RFClassifier = RandomForestClassifier(n_estimators=n_trees, max_depth=depth, random_state=42)
        print_model_accuracy(RFClassifier, train_list, val_list)

In [None]:
plt.figure(figsize=(6, 6))

# Bar positions and heights are passed positionally: the old `left` keyword
# was renamed to `x` and is no longer accepted by current matplotlib.
bar_positions = np.arange(len(train_list))
plt.bar(bar_positions, train_list, width=0.25, color='skyblue')
plt.bar(bar_positions + 0.4, val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])

# One tick label per model, in the order the models were evaluated.
plt.xticks(range(len(train_list)), ["# of tree=10, max_depth=3", "# of tree=10, max_depth=4",
                                    "# of tree=10, max_depth=5", "# of tree=10, max_depth=6",
                                    "# of tree=50, max_depth=3", "# of tree=50, max_depth=4",
                                    "# of tree=50, max_depth=5", "# of tree=50, max_depth=6",
                                    "# of tree=100, max_depth=3", "# of tree=100, max_depth=4",
                                    "# of tree=100, max_depth=5", "# of tree=100, max_depth=6",], rotation='vertical')
plt.xlabel("model with different hyperparameter")
plt.ylabel("accuracy")
plt.title("Performance of Random Forest")
plt.tight_layout()
plt.savefig("rf.jpg", dpi=300)

# Both numbers refer to the model with the highest validation accuracy.
print("Best train accuracy: {}".format(train_list[np.argmax(val_list)]))
print("Best validation accuracy: {}".format(max(val_list)))

### Support Vector Machine

In [None]:
from sklearn.svm import SVC

# search for different parameters
train_list = []
val_list = []

# Kernel varies in the outer loop and C in the inner loop, so results are
# appended as (linear, 0.1), (linear, 1), ..., (sigmoid, 10).
for kernel_name in ('linear', 'rbf', 'poly', 'sigmoid'):
    for penalty_c in (0.1, 1, 10):
        SVMClassifier = SVC(C=penalty_c, kernel=kernel_name, random_state=42)
        print_model_accuracy(SVMClassifier, train_list, val_list)

In [None]:
plt.figure(figsize=(6, 6))

# Bar positions and heights are passed positionally: the old `left` keyword
# was renamed to `x` and is no longer accepted by current matplotlib.
bar_positions = np.arange(len(train_list))
plt.bar(bar_positions, train_list, width=0.25, color='skyblue')
plt.bar(bar_positions + 0.4, val_list, width=0.25, color='wheat')
plt.ylim(0, 0.5)
plt.legend(["train", "validation"])

# One tick label per model, in the order the models were evaluated.
plt.xticks(range(len(train_list)), ["'linear', C=0.1", "'linear', C=1", "'linear', C=10",
                                    "'rbf', C=0.1", "'rbf', C=1", "'rbf', C=10",
                                    "'poly', C=0.1", "'poly', C=1", "'poly', C=10",
                                    "'sigmoid', C=0.1", "'sigmoid', C=1", "'sigmoid', C=10",], rotation='vertical')
plt.xlabel("model with different hyperparameter")
plt.ylabel("accuracy")
plt.title("Performance of Support Vector Machine")
plt.tight_layout()
plt.savefig("svm.jpg", dpi=300)

# Both numbers refer to the model with the highest validation accuracy.
print("Best train accuracy: {}".format(train_list[np.argmax(val_list)]))
print("Best validation accuracy: {}".format(max(val_list)))

### Apply best model

The best model we found is the SVM model with the `rbf` kernel function and `C` = 1.

In [None]:
# train best model
# NOTE(review): the markdown cell above names an SVM as the best model, while
# this cell trains an L1-penalised logistic regression -- confirm which one is
# intended for the submission.
# solver='liblinear' is given explicitly because it supports the L1 penalty;
# the current default solver ('lbfgs') rejects penalty='l1'.
best_model = LogisticRegression(random_state=42, penalty='l1', solver='liblinear', C=10, max_iter=5000)
best_model.fit(X_train, y_train)

In [None]:
# load test data
test_df = pd.read_hdf("test_feature.h5", "feature")

# Scale the test features with the min/max of the raw training features, i.e.
# the same transform that produced the inputs best_model was fitted on.
# Fitting a separate scaler on the test set would map identical raw values to
# different scaled values than the model saw during training.
train_scaler = preprocessing.MinMaxScaler().fit(get_X_y(train_set)[0])
X_test = train_scaler.transform(test_df.values)

# predict and save results (all predictions written as a single CSV row)
test_prediction = best_model.predict(X_test)
pd.DataFrame(test_prediction).T.to_csv("submission.csv", index=False, header=False)