# In [None]:
import csv
from numpy import genfromtxt
import numpy as np
import pandas as pd
from random import random
import math
import sys
import matplotlib
import matplotlib.pyplot as plt
import sklearn.linear_model
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Read the two comma-delimited data files and stack them into one master array.
# NOTE(review): the first file name ends with a bare '.' -- confirm it matches the file on disk.
filename1 = 'train_set_attr_scld.'
filename2 = 'test_set_scld'

train_set_attr_scld = np.genfromtxt(filename1, delimiter=',')
test_set_scld = np.genfromtxt(filename2, delimiter=',')

# Row-wise stack: the rows of the second file follow the rows of the first.
train_set_attr_scld = np.concatenate((train_set_attr_scld, test_set_scld), axis=0)

# filenameExp names a text file of experiments for the algorithm being tested:
# each line holds one comma-separated hyperparameter configuration.
# The command-line argument sys.argv[1] selects which line to run, so that separate
# processes can be run in parallel on a cluster of cores; results are printed
# (to be captured in log files) or saved as .png figures.


# RANDOM FOREST CLASSIFICATION

def RFC(arr, num_trees, max_features, ftrs):
    """Return the mean 10-fold cross-validation score of a random forest.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    num_trees : int
        Passed to the classifier as ``n_estimators``.
    max_features
        Passed to the classifier as ``max_features``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsRF.txt'

lineNum = 0

# Each line: num_trees,max_features,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for line in f:
        lineNum = lineNum + 1
        if lineNum == int(sys.argv[1]):
            entries = line.split(",")
            # n_estimators must be an integer; going through float() first
            # still accepts values written as e.g. "100.0".
            num_trees = int(float(entries[0]))
            max_features = int(entries[1])
            ftr_size = int(entries[2])
            ftrs = [int(entries[3 + feature]) for feature in range(ftr_size)]
            break  # the requested line has been read; no need to scan further

print('Random Forest Classifier')
print(RFC(train_set_attr_scld, num_trees, max_features, ftrs))

# K-MEANS CLUSTERING
def KMeansAlgorithm(arr, n_clusters, n_init, max_iter, ftrs=None):
    """Return the mean 10-fold cross-validation score of a K-Means model.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is passed to ``cross_val_score`` as the target.
    n_clusters, n_init, max_iter
        Passed straight through to ``KMeans``.
    ftrs : sequence of int, optional
        Column indices of ``arr`` to use as features. When omitted, columns
        1 through 8 are used (the previous hard-coded selection).

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, 1:9] if ftrs is None else arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = KMeans(n_clusters=n_clusters, n_init=n_init, max_iter=max_iter)
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsKM.txt'

lineNum = 0

# Each line: n_clusters,n_init,max_iter,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for line in f:
        lineNum = lineNum + 1
        if lineNum == int(sys.argv[1]):
            entries = line.split(",")
            n_clusters = int(entries[0])
            n_init = int(entries[1])
            max_iter = int(entries[2])
            ftr_size = int(entries[3])
            ftrs = [int(entries[4 + feature]) for feature in range(ftr_size)]
            break  # the requested line has been read; no need to scan further

print('K-Means Clustering')
print(KMeansAlgorithm(train_set_attr_scld, n_clusters, n_init, max_iter, ftrs))

# T-DISTRIBUTED STOCHASTIC NEIGHBOR EMBEDDING
def TSNE_Alg(arr,perplexity,learning_rate,n_iter,ftrs):
    """Embed the selected feature columns with t-SNE and save a scatter plot.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used to colour the points; the columns listed in
        ``ftrs`` are the input to t-SNE.
    perplexity, learning_rate, n_iter
        Passed straight through to ``TSNE``.
    ftrs : sequence of int
        Column indices of ``arr`` to embed.

    Returns
    -------
    None
        The figure is written to ``output_<sys.argv[1]>.png``.
    """
    n_sne = 3505790  # at most this many leading rows are embedded
    inputData = arr[:n_sne, ftrs]
    # NOTE(review): newer scikit-learn releases appear to rename TSNE's
    # n_iter to max_iter -- confirm against the installed version.
    tsne = TSNE(perplexity=perplexity, learning_rate=learning_rate, n_iter=n_iter)
    tsne_results = tsne.fit_transform(inputData)
    x = tsne_results[:n_sne, 0]
    y = tsne_results[:n_sne, 1]
    label = arr[:n_sne, 0]
    colors = ['red', 'blue', 'green']
    fig = plt.figure(figsize=(25, 25))
    plt.scatter(x, y, c=label, cmap=matplotlib.colors.ListedColormap(colors))
    fig.savefig('output_' + str(int(sys.argv[1])) + '.png')
    # Release the (large) figure once it is on disk.
    plt.close(fig)

filenameExp = 'experimentsTSNE.txt'

lineNum = 0

# Each line: perplexity,learning_rate,n_iter,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for lineNum, line in enumerate(f, start=1):
        if lineNum != int(sys.argv[1]):
            continue
        entries = line.split(",")
        perplexity = int(entries[0])
        lrate = int(entries[1])
        n_iter = int(entries[2])
        ftr_size = int(entries[3])
        ftrs = [int(entries[4 + feature]) for feature in range(ftr_size)]

print('TSNE')
TSNE_Alg(train_set_attr_scld, perplexity, lrate, n_iter, ftrs)

# K-NEAREST NEIGHBORS
def KNN(arr,n_neighbors,ftrs):
    """Return the mean 10-fold cross-validation score of a k-nearest-neighbours classifier.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    n_neighbors : int
        Passed to the classifier as ``n_neighbors``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = KNeighborsClassifier(n_neighbors=n_neighbors)
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsKNN.txt'

lineNum = 0

# Each line: n_neighbors,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for lineNum, line in enumerate(f, start=1):
        if lineNum != int(sys.argv[1]):
            continue
        entries = line.split(",")
        n_neighbors = int(entries[0])
        ftr_size = int(entries[1])
        ftrs = [int(entries[2 + feature]) for feature in range(ftr_size)]

print('KNN')
print(KNN(train_set_attr_scld, n_neighbors, ftrs))

# GRADIENT TREE BOOSTING
def GTB(arr, num_trees, ftrs):
    """Return the mean 10-fold cross-validation score of a gradient-boosting classifier.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    num_trees : int
        Passed to the classifier as ``n_estimators``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = GradientBoostingClassifier(n_estimators=num_trees)
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsGTB.txt'

# NOTE(review): every other section starts counting at 0; starting at 45 means
# the first line of the file is selected by sys.argv[1] == 46. Presumably a
# deliberate job-index offset -- confirm.
lineNum = 45

# Each line: num_trees,ftr_size,ftr_1,...,ftr_<ftr_size>
with open(filenameExp) as f:
    for lineNum, line in enumerate(f, start=lineNum + 1):
        if lineNum != int(sys.argv[1]):
            continue
        entries = line.split(",")
        num_trees = int(entries[0])
        ftr_size = int(entries[1])
        ftrs = [int(entries[2 + feature]) for feature in range(ftr_size)]

print('Gradient Tree Boosting')
print(GTB(train_set_attr_scld, num_trees, ftrs))

# EXTRA TREES
def ET(arr, num_trees, ftrs):
    """Return the mean 10-fold cross-validation score of an extra-trees classifier.

    The classifier is built with ``criterion="entropy"`` and
    ``max_features="log2"``.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    num_trees : int
        Passed to the classifier as ``n_estimators``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = ExtraTreesClassifier(n_estimators=num_trees, criterion="entropy", max_features="log2")
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsET.txt'

lineNum = 0

# Each line: num_trees,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for lineNum, line in enumerate(f, start=1):
        if lineNum != int(sys.argv[1]):
            continue
        entries = line.split(",")
        num_trees = int(entries[0])
        ftr_size = int(entries[1])
        ftrs = [int(entries[2 + feature]) for feature in range(ftr_size)]

print('Extra Trees')
print(ET(train_set_attr_scld, num_trees, ftrs))

# LOGISTIC REGRESSION
def LR(arr, Cval, ftrs):
    """Return the mean 10-fold cross-validation score of an L1-penalised logistic regression.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    Cval : float
        Passed to the classifier as ``C``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    # The L1 penalty is not supported by the current default solver (lbfgs),
    # so the solver is pinned to liblinear, which accepts it.
    model = LogisticRegression(penalty="l1", C=Cval, fit_intercept=True, solver="liblinear")
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsLR.txt'

lineNum = 0

# Each line: C,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for lineNum, line in enumerate(f, start=1):
        if lineNum != int(sys.argv[1]):
            continue
        entries = line.split(",")
        Cval = float(entries[0])
        ftr_size = int(entries[1])
        ftrs = [int(entries[2 + feature]) for feature in range(ftr_size)]

print('Logistic Regression')
print(LR(train_set_attr_scld, Cval, ftrs))

# SUPPORT VECTOR CLASSIFICATION
def SVC(arr, penalty, ftrs):
    """Return the mean 10-fold cross-validation score of a polynomial-kernel SVM.

    The classifier uses ``kernel="poly"``, ``degree=3``, ``gamma=0.1`` and
    ``coef0=10.0``.

    Parameters
    ----------
    arr : 2-D array
        Column 0 is used as the target; the columns listed in ``ftrs`` are
        used as predictors.
    penalty : float
        Passed to the classifier as ``C``.
    ftrs : sequence of int
        Column indices of ``arr`` to use as features.

    Returns
    -------
    float
        Mean of the per-fold scores from ``cross_val_score`` (the
        estimator's default scorer, since no ``scoring`` is given).
    """
    # This function has the same name as sklearn.svm.SVC, so once it is
    # defined the module-level name ``SVC`` refers to this function, not the
    # estimator. Import the estimator under a distinct local name.
    from sklearn.svm import SVC as SupportVectorClassifier

    X = arr[:, ftrs]
    Y = arr[:, 0]
    # random_state only has an effect when shuffle=True, and recent
    # scikit-learn releases reject it otherwise. Leaving it out keeps the same
    # contiguous, unshuffled folds that older releases produced.
    kfold = KFold(n_splits=10)
    model = SupportVectorClassifier(C=penalty, kernel="poly", degree=3, gamma=0.1, coef0=10.0)
    results = cross_val_score(model, X, Y, cv=kfold)
    return results.mean()

filenameExp = 'experimentsSVC.txt'

lineNum = 0

# Each line: penalty,ftr_size,ftr_1,...,ftr_<ftr_size>
# sys.argv[1] is the 1-based number of the line to run.
with open(filenameExp) as f:
    for line in f:
        lineNum = lineNum + 1
        if lineNum == int(sys.argv[1]):
            entries = line.split(",")
            penalty = float(entries[0])
            ftr_size = int(entries[1])
            ftrs = [int(entries[2 + feature]) for feature in range(ftr_size)]
            break  # the requested line has been read; no need to scan further

print('Support Vector Classifier')
# Pass the penalty parsed above, not num_trees left over from an earlier section.
print(SVC(train_set_attr_scld, penalty, ftrs))