In [0]:
import numpy as np
import scipy as sp
import math
import pickle # just cuz
import io
from scipy.sparse import coo_matrix

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import Perceptron 
from sklearn.dummy import DummyClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier

# import hyperparameter optimization tools
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score 

In [0]:
from google.colab import drive
drive.mount('/content/gdrive') 

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def get_data(name = 'train'):
    filepath = "/content/gdrive/My Drive/data/"+name+".txt"

    data = io.open(filepath, "r", encoding='latin-1')
    X = data.read().split("\n")
    if X[-1] == '': X.pop()
    return X

In [0]:
test_variations = ['easy', 'big', 'massive', 'hard', 'exam']
def initialize_datasets():
    train = get_data('train')
    valid = get_data('validate')
    test = []
    for variation in test_variations:
        test.append(get_data('test_' + variation))
    return train, valid, test

train, valid, test = initialize_datasets()

In [0]:

def find_replace(string, dictionary):
    for char in string:
        if char in dictionary.keys():
            string = string.replace(char, dictionary[char])
    return string

def parse_vars(data_line):
    count = 0
    var_dict = {}
    alphabet = []
    for letter in range(97,123):
        alphabet.append(chr(letter))
    for char in data_line:
        if char.isalpha():
            if char not in var_dict.keys():
                var_dict[char] = alphabet[0]
                #print(var_dict)
                alphabet.pop(0)
                count+=1 
    parsed_string = find_replace(data_line,var_dict)
    outputs = parsed_string.split(",")
    
    a = outputs[0]
    b = outputs[1]
    y = outputs[2]
    return a, b, y, count




In [0]:
def generate_vocab(n = 24):
    vocab = {'~': 0, '&': 1, '|': 2, '>': 3}
    index = 0
    while index < n:
        vocab[str(chr(97 + index))] = index + 4
        index += 1
    return vocab

# line, count = parse_vars(str)
def vectorize(dataset, vocab_length = 24, ngram = 1):
    vocab = generate_vocab(vocab_length)
    X = []
    Y = [] 
    for line in dataset:  
        a, b, y, count = parse_vars(line)  
        X.append(a+b)
        Y.append(y) 
        
    vectorizer = CountVectorizer(analyzer = 'char', ngram_range = (1,ngram), vocabulary = vocab) 
    vectorizer.fit(X)
    X = vectorizer.fit_transform(X) 
    return X, np.array(Y)
    
        

In [0]:
X_train, Y_train = vectorize(train)
X_valid, Y_valid = vectorize(valid)
#print(X_train.shape)
X_test, Y_test = [], [] 
for data in test:  
    X, Y = vectorize(data)
    X_test.append(X)
    Y_test.append(Y)


In [0]:
# Initialize testfold
test_fold = np.zeros(X_train.shape[0] + X_valid.shape[0])
for i in range(X_train.shape[0]): test_fold[X_train.shape[0]]     = -1
for i in range(X_valid.shape[0]): test_fold[X_train.shape[0] + i] = 1
ps = PredefinedSplit(test_fold)

# Prepare input/output matrices  

X_train_coo = coo_matrix(X_train)
X_valid_coo = coo_matrix(X_valid)
X = sp.sparse.csr_matrix(sp.sparse.vstack([X_train_coo, X_valid_coo]))  
Y = np.concatenate((np.array(Y_train),np.array(Y_valid)))

In [0]:
def fit_and_print(model, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test):
    
    model.fit(X, Y)
    accuracy_train = accuracy_score(Y_train, model.predict(X_train))
    accuracy_valid = accuracy_score(Y_valid, model.predict(X_valid))
    test_accuracy = [] 
    for i in range(len(X_test)):
        test_accuracy.append(accuracy_score(Y_test[i], model.predict(X_test[i])))
    
    print("Train Accuracy: ", accuracy_train)
    print("Valid Accuracy: ", accuracy_valid) 
    for i in range(len(X_test)):
        print("Test", test_variations[i],"Accuracy:", test_accuracy[i])
    print('Hyperparams:', model.best_params_)    


In [0]:
SLP_parameters = [{     'penalty' : ['l2', 'l1', 'elasticnet', None], 
						'random_state': [2,3,4,11],
                        'alpha': [0.0001, 0.00001, 0.001, 0.002, 0.004, 0.007, 0.01, 0.025, 0.05]
						}]
SLP = GridSearchCV(Perceptron(), SLP_parameters, cv=ps) 
fit_and_print(SLP, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test)




Train Accuracy:  0.5030437742801074
Valid Accuracy:  0.4848
Test easy Accuracy: 0.4924
Test big Accuracy: 0.5005896226415094
Test massive Accuracy: 0.5
Test hard Accuracy: 0.5014
Test exam Accuracy: 0.53
Hyperparams: {'alpha': 0.007, 'penalty': 'l2', 'random_state': 3}




In [0]:
MLP_parameters = [{     'alpha': [1e-4, 1e-5],#, 1e-5, 1e-3
                        'activation': ['identity', 'logistic', 'tanh', 'relu'],#
						'hidden_layer_sizes': [(25,5)],#(155,3) #good (149,2) (20,4)
                        'random_state':[9]
						}]

MLP = GridSearchCV(MLPClassifier(), MLP_parameters, cv=ps)
fit_and_print(MLP, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test)
#8:31PM 12.5.2018 C.E.



Train Accuracy:  0.5788878208979135
Valid Accuracy:  0.5544
Test easy Accuracy: 0.5428
Test big Accuracy: 0.4988207547169811
Test massive Accuracy: 0.49955156950672647
Test hard Accuracy: 0.509
Test exam Accuracy: 0.56
Hyperparams: {'activation': 'relu', 'alpha': 0.0001, 'hidden_layer_sizes': (25, 5), 'random_state': 9}


In [0]:
# Random Classifier
print('\nDummy')
DUM = DummyClassifier(strategy='uniform')
DUM.fit(X, Y) #
for i in range(len(X_test)): 
    print("Test", test_variations[i],"Accuracy:", accuracy_score(Y_test[i], DUM.predict(X_test[i])))




Dummy
Test easy Accuracy: 0.4998
Test big Accuracy: 0.49233490566037735
Test massive Accuracy: 0.4923766816143498
Test hard Accuracy: 0.498
Test exam Accuracy: 0.66


In [0]:
# Naive Bayes
print('\nNaive Bayes')
smoothing = [0.0001, 0.00001, 0.001,0.002, 0.003, 0.004, 0.005, 0.006, 0.007, 0.008, 0.009, 0.01, 0.011, 0.012, 0.013, 0.014, 0.015, 0.017, 0.019, 0.02,0.021, 0.025, 0.03, 0.035, 0.037, 0.039, 0.04, 0.041, 0.042, 0.044, 0.048, 0.05,0.1, 0.2, 0.5, 0.8, 1]
#NB_parameters = [{'alpha': smoothing, 'binarize': [0.5]}]
NB_parameters = [{'alpha': smoothing}]
#MultinomialNB
#NB = GridSearchCV(BernoulliNB(), NB_parameters, cv=ps)
NB = GridSearchCV(MultinomialNB(), NB_parameters, cv=ps)
fit_and_print(NB, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test)


In [0]:
# Linear SVM
print('\nLinear SVM')
SVM_parameters = [{ 'penalty' : ['l2'],
                    'loss' : ['squared_hinge'],
                    'dual' : [False],
                    'tol': [0.00005, 0.0001, 0.001],
                    'C': [0.0001, 0.00001, 0.001, 0.002, 0.004, 0.007, 0.01, 0.025, 0.05] }]
SVM = GridSearchCV(LinearSVC(), SVM_parameters, cv=ps)
fit_and_print(SVM, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test)


In [0]:
# DecisionTreeClassifier  
print('\nDecision Tree Classifier')
DT_parameters = [{  'criterion': ['gini','entropy'],#'gini',
                    'splitter' : ['random', 'best'],#
                    'min_samples_split': [4],
                    'min_samples_leaf': [2],
                    'min_weight_fraction_leaf': [0.04, 0.4,0.2],
                    #'max_features':['auto','sqrt','log2',None],
                    'random_state':[None, 8],
                    'max_leaf_nodes':[3,4,5,None],
                    'min_impurity_decrease':[0.0,0.01,0.1],
                    #'class_weight':['balanced',None]
                    }]
DT = GridSearchCV(DecisionTreeClassifier(), DT_parameters, cv=ps)
fit_and_print(DT, X, Y, X_train, X_valid, X_test, Y_train, Y_valid, Y_test)
#3:35