#Four kinds of classifiers are shown separately below; the SVM classifier is our best model.

## 1.K-Nearest Neighbour

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler  # min-max scaling

# Load the training file: columns v1..v784 are the features, 'label' is the target.
data_td = pd.read_csv('./Input/train/train.csv')
d_ta_label = data_td.loc[:, 'label'].to_numpy()
d_ta_feature = data_td.loc[:, "v1":"v784"].to_numpy()

X_ta, y_ta = d_ta_feature, d_ta_label

# Normalisation of the training data: learn every feature's min and max
# from the training set, then rescale the training set with them.
tool = MinMaxScaler().fit(X_ta)
X_ta_n = tool.transform(X_ta)

In [None]:
# Grid search for KNN, scored by cross-validation.
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameters searched:
#   n_neighbors - number of neighbours. To limit the computational cost only
#                 n <= 10 is considered, in steps of 3 (n = 1, 4, 7, 10).
#   p           - distance metric: p=1 is Manhattan distance,
#                 p=2 is Euclidean distance.
param_grid = {'n_neighbors': [1, 4, 7, 10], 'p': [1, 2]}

gd_knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    cv=8,  # 8-fold cross-validation
    return_train_score=False,
)
gd_knn.fit(X_ta_n, y_ta)
print("best hyperparameter: {}".format(gd_knn.best_params_))
# Recorded result: {'n_neighbors': 4, 'p': 1}

In [None]:
# Read the test file and take its feature columns v1..v784.
data_td = pd.read_csv('./Input/test/test_input.csv')
d_te_feature = data_td.loc[:, "v1":"v784"].to_numpy()
X_te = d_te_feature

# Normalisation of the test data.
# Reuse `tool`, the MinMaxScaler already fitted on the TRAINING data in the
# training-data cell of this section. Fitting a fresh scaler on the test set
# would rescale train and test features with different min/max values, so the
# model would be asked to predict on inputs that are not on the scale it was
# trained on. Test values may fall slightly outside [0, 1]; that is expected.
X_te_n = tool.transform(X_te)

In [None]:
# Final KNN model, using the hyperparameters selected by the grid search
# ('n_neighbors': 4, 'p': 1).
from sklearn.neighbors import KNeighborsClassifier

model_knn = KNeighborsClassifier(p=1, n_neighbors=4)
model_knn.fit(X_ta_n, y_ta)  # train on the scaled training data

# Predict the test labels and place them in a single column named 'label'.
y_pd = model_knn.predict(X_te_n)
op = pd.DataFrame({'label': y_pd})

# Write the predictions; the row index is written under the header "id".
# NOTE: every section of this notebook writes to this same path.
op.to_csv('./Output/test_output.csv', index_label="id", sep=",", float_format='%d')
'''
RESULTS:
Running time-> 6m20s(Local) 2m1s(Colaboratory)
Accuracy    -> 0.84550
'''

## 2.Decision Tree

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler  # min-max scaling

# Training data: features are columns v1..v784, the target is column 'label'.
data_td = pd.read_csv('./Input/train/train.csv')
d_ta_feature = data_td.loc[:, "v1":"v784"].to_numpy()
d_ta_label = data_td.loc[:, 'label'].to_numpy()

X_ta = d_ta_feature
y_ta = d_ta_label

# Normalisation of the training data: the scaler records each feature's
# min and max on the training set and rescales the training set with them.
tool = MinMaxScaler()
X_ta_n = tool.fit(X_ta).transform(X_ta)

In [None]:
# Grow an unrestricted decision tree on the training data and print its depth.
# The depth is used to choose the max_depth values tried in the grid search.
from sklearn.tree import DecisionTreeClassifier

t = DecisionTreeClassifier(random_state=42).fit(X_ta_n, y_ta)
print(t.get_depth())
# Recorded depth of the fully grown tree: 37

In [None]:
# Grid search for the decision tree, scored by cross-validation.
from sklearn.model_selection import GridSearchCV

# Hyperparameters searched:
#   criterion - split quality measure: Gini impurity or information gain (entropy).
#   max_depth - the fully grown tree has depth 37, so depths 2..37 are tried;
#               to limit the computational cost the step is 5.
p_g = {'criterion': ['gini', 'entropy'], 'max_depth': range(2, 38, 5)}

# The estimator is seeded with the same random_state (42) as the depth probe
# and the final model, so the search is reproducible and the selected
# hyperparameters correspond to the tree that is actually built afterwards.
# DecisionTreeClassifier is imported in the previous cell.
gd_dt = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    p_g,
    cv=8,  # 8-fold cross-validation
    return_train_score=False,
)
gd_dt.fit(X_ta_n, y_ta)
print("best hyperparameter: {}".format(gd_dt.best_params_))
# Previously recorded result (from a run without a fixed seed):
# {'criterion': 'entropy', 'max_depth': 12}

In [None]:
import pandas as pd

# Read the test file and take its feature columns v1..v784.
data_td = pd.read_csv('./Input/test/test_input.csv')
d_te_feature = data_td.loc[:, "v1":"v784"].to_numpy()
X_te = d_te_feature

# Normalisation of the test data.
# Reuse `tool`, the MinMaxScaler already fitted on the TRAINING data in the
# training-data cell of this section. Fitting a fresh scaler on the test set
# would rescale train and test features with different min/max values, which
# shifts the thresholds the tree learned relative to the test inputs.
# Test values may fall slightly outside [0, 1]; that is expected.
X_te_n = tool.transform(X_te)

In [None]:
# Final decision tree, using the hyperparameters selected by the grid search
# {'criterion': 'entropy', 'max_depth': 12}.
from sklearn.tree import DecisionTreeClassifier

model_DT = DecisionTreeClassifier(
    max_depth=12,
    criterion='entropy',
    random_state=42,
)
model_DT.fit(X_ta_n, y_ta)  # train on the scaled training data

# Predict the test labels and place them in a single column named 'label'.
y_pd = model_DT.predict(X_te_n)
op = pd.DataFrame({'label': y_pd})

# Write the predictions; the row index is written under the header "id".
# NOTE: every section of this notebook writes to this same path.
op.to_csv('./Output/test_output.csv', index_label="id", float_format='%d', sep=",")
'''
RESULTS:
Running time-> 14s(Local) 18s(Colaboratory)
Accuracy    -> 0.81300
'''

## 3.Support Vector Machine (our best model)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler  # min-max scaling

# Read the training file once, then split it into features and labels.
data_td = pd.read_csv('./Input/train/train.csv')

# Features: columns v1..v784. Labels: column 'label'.
X_ta = d_ta_feature = data_td.loc[:, "v1":"v784"].to_numpy()
y_ta = d_ta_label = data_td.loc[:, 'label'].to_numpy()

# Normalisation of the training data: fit the scaler (per-feature min and
# max) on the training set and rescale the training set in one step.
tool = MinMaxScaler()
X_ta_n = tool.fit_transform(X_ta)

In [None]:
# Grid search for the SVM, scored by cross-validation.
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# Hyperparameters searched:
#   kernel - the kernel function (linear, polynomial or RBF).
#   C      - trades off maximising the margin against fitting the training
#            data; a large C puts more emphasis on minimising the training
#            error than on maximising the margin.
#            To limit the computational cost only three values are tried:
#            0.1, 1 and 10. Each is ten times the previous one, so the
#            resulting models can differ noticeably.
p_g = {'kernel': ['linear', 'poly', 'rbf'], 'C': [0.1, 1, 10]}

grid_search = GridSearchCV(
    estimator=SVC(),
    param_grid=p_g,
    cv=8,  # 8-fold cross-validation
    return_train_score=False,
)
grid_search.fit(X_ta_n, y_ta)
print("best hyperparameter: {}".format(grid_search.best_params_))
# Recorded result: {'C': 10, 'kernel': 'rbf'}

In [None]:
# Read the test file and take its feature columns v1..v784.
data_td = pd.read_csv('./Input/test/test_input.csv')
d_te_feature = data_td.loc[:, "v1":"v784"].to_numpy()
X_te = d_te_feature

# Normalisation of the test data.
# Reuse `tool`, the MinMaxScaler already fitted on the TRAINING data in the
# training-data cell of this section. Fitting a fresh scaler on the test set
# would rescale train and test features with different min/max values, so the
# SVM would be evaluated on inputs that are not on the scale it was trained on.
# Test values may fall slightly outside [0, 1]; that is expected.
X_te_n = tool.transform(X_te)

In [None]:
# Final SVM, using the hyperparameters selected by the grid search
# {'C': 10, 'kernel': 'rbf'}.
from sklearn.svm import SVC

model_svm = SVC(kernel="rbf", C=10)
model_svm.fit(X_ta_n, y_ta)  # train on the scaled training data

# Predict the test labels and place them in a single column named 'label'.
y_pd = model_svm.predict(X_te_n)
op = pd.DataFrame({'label': y_pd})

# Write the predictions; the row index is written under the header "id".
# NOTE: every section of this notebook writes to this same path.
op.to_csv('./Output/test_output.csv', float_format='%d', index_label="id", sep=",")
'''
RESULTS:
Running time-> 2min(Local) 1m52s(Colaboratory)
Accuracy    -> 0.89800
'''

## 4.Random Forest(Based on PCA)

In [None]:
import pandas as pd

# Read both input files up front; the training labels come from column 'label'.
data_ta = pd.read_csv('./Input/train/train.csv')      # training data
data_te = pd.read_csv('./Input/test/test_input.csv')  # test data
y_ta = data_ta.loc[:, 'label'].to_numpy()

In [None]:
# PCA for the training and test data
from sklearn.decomposition import PCA

# Keep as many principal components as are needed to explain 95% of the variance.
pca = PCA(n_components=0.95)

# Fit the projection on the TRAINING features only and reuse it for the test
# features. Fitting on the concatenated train+test frame lets test-set
# statistics shape the features the model is trained on (data leakage).
X_ta_P = pca.fit_transform(data_ta.loc[:, 'v1':'v784'])
X_te_P = pca.transform(data_te.loc[:, 'v1':'v784'])

print("Reduced shape of training data: {}".format(str(X_ta_P.shape)))
print("Reduced shape of test data: {}".format(str(X_te_P.shape)))
# NOTE: the shapes quoted below were recorded when PCA was fitted on the
# training and test data together; the number of retained components may
# differ now that the fit uses the training data only.
'''
Print content:
Reduced shape of training data: (30000, 187)
Reduced shape of test data: (5000, 187)
'''

In [None]:
# Normalisation of the PCA-reduced training and test data
from sklearn.preprocessing import MinMaxScaler  # min-max scaling

# Fit the scaler once, on the training projection, and apply that same fit to
# both sets. Re-fitting on the test projection would rescale train and test
# components with different min/max values, so the forest would predict on
# inputs that are not on the scale it was trained on.
# Test values may fall slightly outside [0, 1]; that is expected.
tool = MinMaxScaler()
tool.fit(X_ta_P)
X_ta_n = tool.transform(X_ta_P)
X_te_n = tool.transform(X_te_P)

In [None]:
# Grid search for the random forest, scored by cross-validation.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyperparameters searched:
#   n_estimators - number of base classifiers (trees). The search starts at
#                  100 in the expectation of better accuracy than with very
#                  few trees (e.g. 1 or 10).
#   max_features - size of the feature subset considered at each split:
#                  'sqrt' -> sqrt(n_features), 'log2' -> log2(n_features).
p_g = {'n_estimators': [100, 200, 300, 400, 500], 'max_features': ['sqrt', 'log2']}

# A random forest is stochastic (bootstrap samples and random feature
# subsets). Seeding the estimator makes the search reproducible; 42 matches
# the seed used for the decision tree in this notebook.
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    p_g,
    cv=8,  # 8-fold cross-validation
    return_train_score=False,
)
grid_search.fit(X_ta_n, y_ta)
print("best hyperparameter: {}".format(grid_search.best_params_))
# Previously recorded result (from a run without a fixed seed):
# {'max_features': 'sqrt', 'n_estimators': 400}

In [None]:
# Final random forest, using the hyperparameters selected by the grid search
# {'max_features': 'sqrt', 'n_estimators': 400}.
# Recorded running time of the second run: 3min40s
from sklearn.ensemble import RandomForestClassifier

# random_state is fixed so that the trained forest, and therefore the written
# predictions and the reported accuracy, can be reproduced on a re-run.
# 42 matches the seed used for the decision tree in this notebook.
model_RT = RandomForestClassifier(n_estimators=400, max_features='sqrt', random_state=42)
model_RT.fit(X_ta_n, y_ta)  # train on the scaled, PCA-reduced training data

# Predict the test labels and place them in a single column named 'label'.
y_pd = model_RT.predict(X_te_n)
op = pd.DataFrame(y_pd, columns=['label'])

# Write the predictions; the row index is written under the header "id".
# NOTE: every section of this notebook writes to this same path.
op.to_csv('./Output/test_output.csv', sep=",", float_format='%d', index_label="id")
# The figures below were recorded from a run without a fixed seed.
'''
RESULTS:
Running time-> 3m40s(Local) 3m40s(Colaboratory)
Accuracy    -> 0.84600
'''

In [None]:
#Comparisons among different classifiers
'''
                       accuracy  Running time(Local)
K-Nearest Neighbour    0.8455    6m20s
Decision Tree          0.8130    14s
Support Vector Machine 0.8980    2m
Random Forest          0.8460    3m40s

Description:
It is apparent that Support Vector Machine has the best accuracy(0.8980) and the second-shortest running time(2min).
On the contrary, K-Nearest Neighbour, which has the longest running time among the classifiers,
needs 6 minutes and 20 seconds for building a model and predicting labels.
And the accuracy of K-Nearest Neighbour is 0.8455,
which is similar to the accuracy of Random Forest(0.8460).
Additionally, Decision Tree has the shortest running time(14s) and the worst accuracy(0.8130).

'''

In [None]:
#Hardware and software specifications:
'''
Local environment: CPU: Intel Core i7-6700 /GPU: Nvidia GeForce GTX 970M
Colaboratory: GPU: K80
Version of Python: Python 3
Used packages: Pandas, Sklearn
'''