# Import packages and list potential classifiers

In [54]:
# Common data handling libraries
import os
import string
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from keras.preprocessing import image
import json

# Tensorflow for model building
import tensorflow as tf

# scikit-learn for the text-based classifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.inspection import DecisionBoundaryDisplay

# Candidate models for the comparison, kept as (display label, estimator)
# pairs so that a label can never drift out of step with its estimator.
_labelled_classifiers = [
    ("Nearest Neighbors", KNeighborsClassifier(3)),
    ("Linear SVM", SVC(kernel="linear", C=0.025)),
    ("RBF SVM", SVC(gamma=2, C=1)),
    # Disabled in the original notebook:
    # ("Gaussian Process", GaussianProcessClassifier(1.0 * RBF(1.0))),
    ("Decision Tree", DecisionTreeClassifier(max_depth=5)),
    (
        "Random Forest",
        RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    ),
    ("Neural Net", MLPClassifier(alpha=1, max_iter=1000)),
    ("AdaBoost", AdaBoostClassifier()),
    ("Naive Bayes", GaussianNB()),
    ("QDA", QuadraticDiscriminantAnalysis()),
]

# Parallel lists consumed by the evaluation loop: names[k] labels classifiers[k].
names = [label for label, _ in _labelled_classifiers]
classifiers = [estimator for _, estimator in _labelled_classifiers]

# Load and preprocess dataset

In [55]:
# Image directories for the two splits (paths relative to the working directory).
TRAIN_IMG = 'train_test_data/train'
TEST_IMG = 'train_test_data/test'

# CSV tables for the two splits.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# Load both tables into data frames, train first and then test.
traindata_csv, testdata_csv = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV)
)

# Text-based classifier with trainDataset

In [56]:
# Select the feature columns and the target from the loaded tables.
feature_cols = ['latitude', 'longitude', 'year']
X = traindata_csv[feature_cols]
y = traindata_csv['label']
X_eval = testdata_csv[feature_cols]

# Hold out 30% of the labelled rows to score each candidate model.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

# Standardize to zero mean / unit variance. The scaler learns its statistics
# from the training split only and is then reused on the held-out split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit every candidate on the training split and print its hold-out accuracy.
for label, model in zip(names, classifiers):
    print(label)
    model.fit(X_train, y_train)
    y_predict = model.predict(X_test)

    # confusion_matrix(y_test, y_predict) and
    # classification_report(y_test, y_predict) give a more detailed
    # breakdown if needed.
    print(':', accuracy_score(y_test, y_predict))

Nearest Neighbors
: 0.6466019417475728
Linear SVM
: 0.5184466019417475
RBF SVM
: 0.654368932038835
Decision Tree
: 0.6407766990291263
Random Forest
: 0.6504854368932039
Neural Net
: 0.625242718446602
AdaBoost
: 0.6427184466019418
Naive Bayes
: 0.5514563106796116
QDA
: 0.5631067961165048


# Text-based classifier with testDataset

In [60]:
import os

# Train the final model on every labelled row and write its predictions for
# the evaluation table to predictions.json.

# Select the feature columns and the target from the loaded tables.
feature_columns = ['latitude', 'longitude', 'year']
X = traindata_csv[feature_columns]
y = traindata_csv['label']
X_eval = testdata_csv[feature_columns]

# Standardize to zero mean / unit variance. The scaler must be fitted on the
# training features only and then *reused* for the evaluation features:
# re-fitting it on X_eval would scale the two sets with different statistics,
# so the model would see evaluation points in a different coordinate system
# from the one it was trained in.
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_eval = scaler.transform(X_eval)

# RBF-kernel SVM with the same hyper-parameters as the "RBF SVM" entry of the
# classifier comparison.
classifier = SVC(gamma=2, C=1)
classifier.fit(X, y)

y_predict = classifier.predict(X_eval)

# Key each prediction by the image file name with ".png" removed.
img_list = [
    os.path.basename(path).replace(".png", "")
    for path in testdata_csv.example_path
]

# int() converts NumPy integer labels to plain ints so json can serialize them.
result = {'target': {name: int(pred) for name, pred in zip(img_list, y_predict)}}
with open("predictions.json", "w") as outfile:
    json.dump(result, outfile)