In [6]:
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.cluster import DBSCAN
from sklearn.cluster import MeanShift

In [7]:
# Read the training CSV. The header line is discarded; for every remaining
# row the first column is skipped, the last column is kept as the (string)
# label, and everything in between is parsed as a float feature.
with open('data/train/train.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    train_data = list(reader)

# Per-row feature vectors and labels, kept as plain lists ...
Feature = [np.array([float(x) for x in row[1:-1]]) for row in train_data]
Label = [row[-1] for row in train_data]

# ... and stacked into arrays for scikit-learn.
Features = np.array(Feature)
Labels = np.array(Label)

In [8]:
# Read the test CSV. The header line is discarded; for every remaining row
# the first column is skipped and all other columns are parsed as floats
# (there is no label column to strip here).
with open('data/test/test.csv', 'r') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    test_data = list(reader)

test_features = np.array(
    [np.array([float(x) for x in row[1:]]) for row in test_data]
)

In [9]:
# Two-stage projection. Each transformer is fitted on the training features
# only and then applied, unchanged, to both the training and test features.
#
# NOTE: this cell overwrites Features / test_features with their projected
# versions, so re-running it without reloading the data would feed the
# already-projected arrays back into PCA.

# Stage 1: PCA down to 415 components.
pca = PCA(n_components=415).fit(Features)
Features, test_features = pca.transform(Features), pca.transform(test_features)

# Stage 2: LDA fitted with the training labels on the PCA output.
lda = LinearDiscriminantAnalysis().fit(Features, Labels)
Features, test_features = lda.transform(Features), lda.transform(test_features)

In [10]:
# Cluster-augmented ensemble
# --------------------------
# Three clusterings are fitted on the training features. For each one the
# cluster id is appended to the features as one extra column and a logistic
# regression is trained on the augmented matrix. The final prediction is a
# hard (majority) vote over the three fitted models, each evaluated on its
# own augmented test matrix.
#
# Two fixes compared with the previous version of this cell:
#   * Test-set cluster ids are derived from the clustering fitted on the
#     TRAINING data. Calling fit_predict on the test set re-clusters it
#     independently, and the resulting ids bear no relation to the ids the
#     logistic regressions were trained on.
#   * The vote is taken over the already-fitted models. VotingClassifier.fit
#     clones and refits its estimators on the matrix it is given, which
#     discarded the cluster-augmented models and never used the augmented
#     test matrices.


def _propagate_cluster_labels(train_X, train_clusters, new_X):
    """Give each row of ``new_X`` the cluster id of its nearest training row.

    Used for clusterers without a ``predict`` method (agglomerative, DBSCAN).

    Parameters:
        train_X: 2-D array the clusterer was fitted on.
        train_clusters: 1-D array of cluster ids, one per row of ``train_X``.
        new_X: 2-D array of unseen rows with the same columns as ``train_X``.

    Returns:
        1-D array of cluster ids, one per row of ``new_X``.
    """
    nearest = KNeighborsClassifier(n_neighbors=1)
    nearest.fit(train_X, train_clusters)
    return nearest.predict(new_X)


def _majority_vote(votes):
    """Return the most frequent label per sample.

    Parameters:
        votes: 2-D array-like of shape (n_models, n_samples) holding each
            model's predicted label for each sample.

    Returns:
        1-D array with one winning label per sample. On a tie the label that
        sorts first wins (``np.unique`` returns labels sorted and
        ``np.argmax`` picks the first maximum).
    """
    winners = []
    for sample_votes in np.asarray(votes).T:
        labels, counts = np.unique(sample_votes, return_counts=True)
        winners.append(labels[np.argmax(counts)])
    return np.array(winners)


# agglomerative clustering
agglomerative = AgglomerativeClustering(n_clusters=4)
cluster_labels_agglomerative = agglomerative.fit_predict(Features)
new_features_agglomerative = np.expand_dims(cluster_labels_agglomerative, axis=1)
# No predict(): reuse the training clustering via nearest-neighbour lookup.
new_test_features_agglomerative = np.expand_dims(
    _propagate_cluster_labels(Features, cluster_labels_agglomerative, test_features),
    axis=1,
)
Features_agglomerative = np.concatenate((Features, new_features_agglomerative), axis=1)
test_features_agglomerative = np.concatenate((test_features, new_test_features_agglomerative), axis=1)
logistic_agglomerative = LogisticRegression(max_iter=10000)
logistic_agglomerative.fit(Features_agglomerative, Labels)

# DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)
cluster_labels_dbscan = dbscan.fit_predict(Features)
new_features_dbscan = np.expand_dims(cluster_labels_dbscan, axis=1)
# No predict(): a test row inherits the id of its nearest training row,
# including DBSCAN's noise id (-1) when that row was labelled as noise.
new_test_features_dbscan = np.expand_dims(
    _propagate_cluster_labels(Features, cluster_labels_dbscan, test_features),
    axis=1,
)
Features_dbscan = np.concatenate((Features, new_features_dbscan), axis=1)
test_features_dbscan = np.concatenate((test_features, new_test_features_dbscan), axis=1)
logistic_dbscan = LogisticRegression(max_iter=10000)
logistic_dbscan.fit(Features_dbscan, Labels)

# Mean shift clustering
mean_shift = MeanShift(bandwidth=4.2)
cluster_labels_mean_shift = mean_shift.fit_predict(Features)
new_features_mean_shift = np.expand_dims(cluster_labels_mean_shift, axis=1)
# MeanShift has predict(): assign test rows to the clusters found on train.
new_test_features_mean_shift = np.expand_dims(mean_shift.predict(test_features), axis=1)
Features_mean_shift = np.concatenate((Features, new_features_mean_shift), axis=1)
test_features_mean_shift = np.concatenate((test_features, new_test_features_mean_shift), axis=1)
logistic_mean_shift = LogisticRegression(max_iter=10000)
logistic_mean_shift.fit(Features_mean_shift, Labels)

# hard voting over the three fitted models, each on its own test matrix
test_votes = np.array([
    logistic_agglomerative.predict(test_features_agglomerative),
    logistic_dbscan.predict(test_features_dbscan),
    logistic_mean_shift.predict(test_features_mean_shift),
])

# predict the test data
predictions = _majority_vote(test_votes)
# Write the predictions to the submission file: a header row followed by one
# "Id,Category" row per prediction.
# newline='' hands line-ending control to the csv module; without it the
# writer's own "\r\n" gets translated again on Windows and every record is
# followed by a blank line.
# NOTE(review): Id is the 0-based position of the prediction, not the first
# column of the test CSV (that column is skipped on load) - confirm this is
# the id scheme the submission format expects.
with open('data/test/submission84.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Category'])
    for i, category in enumerate(predictions):
        writer.writerow([i, category])