In [18]:
import csv
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder


In [19]:
# Read the training CSV. Every data row is laid out as
#   <first column (skipped)>, <numeric feature columns ...>, <class label>
with open('data/train/train.csv', 'r') as f:
    # The first row is the header; keep only the data rows.
    train_data = list(csv.reader(f))[1:]

# Columns 1 .. -2 are parsed as floats (one array per row); the last column
# is kept as the raw label string.
Feature = [np.array([float(value) for value in record[1:-1]]) for record in train_data]
Label = [record[-1] for record in train_data]

Features = np.array(Feature)
# Map the string labels to integer codes. The fitted encoder is kept so the
# integer predictions can be turned back into label strings later.
label_encoder = LabelEncoder()
Labels = label_encoder.fit_transform(np.array(Label))

In [20]:
# Read the test CSV. Every data row is laid out as
#   <first column (skipped)>, <numeric feature columns ...>
with open('data/test/test.csv', 'r') as f:
    # The first row is the header; keep only the data rows.
    test_data = list(csv.reader(f))[1:]

# Everything after the first column is parsed as floats, one array per row.
test_features = np.array(
    [np.array([float(value) for value in record[1:]]) for record in test_data]
)

In [21]:
# Feature pipeline for the training set: PCA -> LDA -> K-means cluster id.
# NOTE(review): `Features` is reassigned at every step, so this cell is
# presumably not safe to re-run without reloading the raw training data.

# Step 1: PCA fitted on the training features, keeping 431 components.
pca = PCA(n_components=431)
pca.fit(Features)
Features = pca.transform(Features)

# Step 2: LDA fitted on the PCA scores and the encoded labels, then used to
# project the same training rows.
lda = LinearDiscriminantAnalysis()
Features = lda.fit(Features, Labels).transform(Features)

# Step 3: K-means (19 clusters, fixed seed) on the LDA projection. The fitted
# `k_means` object is kept so new rows can be assigned to clusters later.
k_means = KMeans(n_clusters=19, random_state=42)
k_means.fit(Features)

# Turn the per-row cluster index into a single standardised column and append
# it to the LDA features.
Features_k_means = preprocessing.scale(k_means.labels_.reshape(-1, 1))
Features_k_means = np.hstack((Features, Features_k_means))
Features = Features_k_means

In [23]:
# Stacking: collect out-of-fold predictions of two base models (logistic
# regression and 1-nearest-neighbour) as meta features, then train a logistic
# regression meta model on them.
meta_model = LogisticRegression(max_iter=10000)
k_folds = 10
skf = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

# Column 0 holds the logistic-regression prediction, column 1 the KNN
# prediction, for every training row.
meta_features_train = np.zeros((Features.shape[0], 2))

for i, (train_index, test_index) in enumerate(skf.split(Features, Labels)):
    training_data = Features[train_index]
    training_labels = Labels[train_index]
    testing_data = Features[test_index]

    # Fresh base models per fold, so each held-out row is predicted by models
    # that never saw it.
    fold_logreg = LogisticRegression(max_iter=10000)
    fold_knn = KNeighborsClassifier(n_neighbors=1)
    fold_logreg.fit(training_data, training_labels)
    fold_knn.fit(training_data, training_labels)

    meta_features_train[test_index, 0] = fold_logreg.predict(testing_data)
    meta_features_train[test_index, 1] = fold_knn.predict(testing_data)
    print("finished the k fold for i = ", i)

# Refit the base models on the *full* training set. The test-set cell calls
# `logreg.predict` / `knn.predict`; previously those names pointed at whatever
# models the last fold happened to leave behind (trained on only 9/10 of the
# data and on an arbitrary split).
logreg = LogisticRegression(max_iter=10000)
logreg.fit(Features, Labels)
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(Features, Labels)

# Train the meta model on the out-of-fold meta features
meta_model.fit(meta_features_train, Labels)

finished the k fold for i =  0
finished the k fold for i =  1
finished the k fold for i =  2
finished the k fold for i =  3
finished the k fold for i =  4
finished the k fold for i =  5
finished the k fold for i =  6
finished the k fold for i =  7
finished the k fold for i =  8
finished the k fold for i =  9


In [24]:
# Build the test-set meta features, predict with the stacked model and write
# the submission file.

# Reload the raw test features. `test_features` is overwritten with the
# transformed matrix further down, so reloading here keeps the cell re-runnable.
with open('data/test/test.csv', 'r') as f:
    # The first row is the header; the first column of each row is skipped.
    test_data = list(csv.reader(f))[1:]
test_features = np.array([np.array([float(x) for x in row[1:]]) for row in test_data])

# Apply the transforms that were fitted on the training data.
test_features = pca.transform(test_features)
test_features = lda.transform(test_features)

# Cluster-id feature. The training column was standardised with the mean/std of
# the training cluster assignments (`k_means.labels_`), so the test column must
# be scaled with those same statistics. Scaling the test column on its own
# (as `preprocessing.scale` did) maps the same cluster id to different feature
# values in train and test.
cluster_scaler = StandardScaler()
cluster_scaler.fit(k_means.labels_.reshape(-1, 1).astype(float))
test_features_k_means = k_means.predict(test_features).reshape(-1, 1).astype(float)
test_features_k_means = cluster_scaler.transform(test_features_k_means)
test_features_k_means = np.concatenate((test_features, test_features_k_means), axis=1)
test_features = test_features_k_means

# Generate meta features for the test set using the `logreg` and `knn` objects
# left in the kernel by the stacking cell (column order matches training:
# logistic regression first, KNN second).
meta_features_test = np.zeros((test_features.shape[0], 2))
log_predictions = logreg.predict(test_features)
knn_predictions = knn.predict(test_features)
meta_features_test[:, 0] = log_predictions
meta_features_test[:, 1] = knn_predictions

# Predict with the meta model and map the integer codes back to the original
# string labels.
predictions = meta_model.predict(meta_features_test)
test_result_labels = label_encoder.inverse_transform(predictions)

# Write the predictions to a csv file; Id is the 0-based row position in the
# test file.
with open('data/test/submission_57_results.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['Id', 'Category'])
    writer.writerows(enumerate(test_result_labels))