### Read training, dev and unlabeled test data

The following provides starter code (Python 3) showing how to read the labeled training and dev ciphertext, and the unlabeled test ciphertext, into lists.

In [1]:
# Containers for the labeled train/dev rows and the unlabeled test sentences.
train = []
dev = []
test = []

In [2]:
# Read the labeled training data. The `with` block guarantees the file handle
# is closed once every row has been read.
with open('./train_enc.tsv', encoding='utf-8') as train_file:
    for line in train_file:
        # Each row is "<label>\t<ciphertext sentence>".
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        train.append(fields)
print(len(train))
print(train[:3])

16220
[[0, 'lkêcê yoúc cêêö y#êjl lw mówám Újám j Úêê# ütlk Úol lkêú z#ê ctöé8ú ówl xoóóú éê#xw#öê#c .'], [0, '6êcétlê jolêot8 zc éê#xw#öjóáê , tl zc j #jlkê# 8tcl8êcc jöÚ8ê 6wüó lkê öt668ê wx lkê #wj6 , ükê#ê lkê lkêöjltá t#wótêc j#ê lww wÚ2twoc jó6 lkê cê+oj8 éw8tltác lww cöoy .'], [0, 'tx lktc kw8t6jú öw2tê tc coééwcê6 lw Úê j ytxl , cwöêÚw6ú oóü#jééê6 tl êj#8ú , lwwm wol j88 lkê yww6 cloxx , jó6 8êxl Úêktó6 lkê á#jé ( 8tlê#j88ú ) .']]


In [3]:
# Read the labeled dev data. The `with` block guarantees the file handle is
# closed once every row has been read.
with open('./dev_enc.tsv', encoding='utf-8') as dev_file:
    for line in dev_file:
        # Each row is "<label>\t<ciphertext sentence>".
        fields = line.rstrip('\n\r').split('\t')
        # fields[0] is the label (0 or 1); fields[1] is the ciphertext sentence.
        fields[0] = int(fields[0])
        dev.append(fields)
print(len(dev))
print(dev[:3])

2027
[[1, 'ów8jó Ú#j2ê8ú l#êj6c ükê#ê xêü jöê#tájó xt8öc 6j#ê lw 6ê82ê 77 tólw lkê üw#86 wx jöÚt2j8êóáê jó6 jöÚtyotlú <<<'], [0, 'ê2êó öo#ékú zc ê+éê#l áwötá ltötóy jó6 xjöê6 ákj#tcöj áj ózl #êcáoê lktc êxxw#l .'], [1, 'üt88 jcco#ê68ú #jóm jc wóê wx lkê á8ê2ê#êcl , öwcl 6êáêélt2ê8ú jöoctóy áwöê6têc wx lkê úêj# .']]


#### Unlike 'train' and 'dev', which are both lists of [label, sentence] pairs, 'test' will be just a list of sentences.

In [4]:
# Read the unlabeled test sentences (one per line). The `with` block
# guarantees the file handle is closed once every line has been read.
with open('./test_enc_unlabeled.tsv', encoding='utf-8') as test_file:
    for line in test_file:
        # Only the line terminator is stripped; the sentence is kept whole.
        test.append(line.rstrip('\n\r'))
print(len(test))
print(test[:3])

2028
['j 6t6jáltá jó6 6o88 6wáoöêólj#ú y8w#txútóy cwxlüj#ê jój#ákú .', 'ówlktóy cltámc , #êj88ú , ê+áêél j 8tóyê#tóy á#êêétóêcc wóê xêê8c x#wö Úêtóy 6#jyyê6 lk#woyk j cj6 , cw#6t6 oót2ê#cê wx yoóc , 6#oyc , j2j#táê jó6 6jöjyê6 6#êjöc .', 'öo#ékú jó6 üt8cwó jáloj88ú öjmê j é#êllú yww6 lêjö <<< Úol lkê é#wvêál co##woó6tóy lkêö tc 6tcl#êcctóy8ú #wlê .']


#### You can split every sentence into lists of words by white spaces.

In [5]:
# Tokenize on single spaces. Labeled rows stay [label, tokens]; each test
# sentence becomes a one-element list holding its token list.
train_split = [[row[0], row[1].split(' ')] for row in train]
dev_split = [[row[0], row[1].split(' ')] for row in dev]
test_split = [[sentence.split(' ')] for sentence in test]

### Main Code Body

You may choose to experiment with different methods using your program. However, you need to embed the training and inference processes here. We will use your predictions on the unlabeled test data for grading, and will check this part to understand how your method produced the predictions.

In [6]:

import numpy as np
import random
from statistics import mean
from random import shuffle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier as knn
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from keras.backend import clear_session
from keras.models import Sequential
from keras import layers
from keras.backend import clear_session

from warnings import filterwarnings
filterwarnings('ignore')


In [7]:
# Separate each labeled row ([label, sentence]) into inputs (X, the ciphertext
# sentences) and targets (Y, the binary labels).
trainL_Y = [row[0] for row in train]
trainL_X = [row[1] for row in train]

devL_Y = [row[0] for row in dev]
devL_X = [row[1] for row in dev]

# Test sentences carry no labels; take a shallow copy of the list.
testL_X = list(test)


In [8]:
# preprocessing ???

In [55]:
# Scratch call to random.randint(50, 100); the returned value is not assigned
# to anything and nothing below reads it.
random.randint(50, 100)

60

In [9]:
# Baseline: accuracy obtained by guessing a random 0/1 label for every dev row.

# One random draw per dev label, in order; count the draws that miss.
guess_errors = sum(1 for label in devL_Y if label != random.randint(0, 1))

accuracy = 1 - guess_errors / len(devL_Y)
print(accuracy)


0.512086827824371


In [10]:
# Data was already loaded for us
# Now we need to transform the dataset to make the cypher text lines compatible with our classifier; 
    # eg using tfidf or CountVectorizer (ie BOW)
# Will use a Classifier (eg Multinomial NB) for the binary sentence classification


In [11]:
def mnb_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=False):
    """Label sentences with a TF-IDF + Multinomial Naive Bayes pipeline.

    The smoothing parameter alpha is chosen by 5-fold cross-validated
    accuracy; a fresh pipeline with the winning alpha is then fitted and used
    to predict labels for ``testL_X``.

    Args:
        trainL_X: training sentences.
        trainL_Y: labels for ``trainL_X``.
        devL_X: dev sentences; only used when ``test == True``.
        devL_Y: labels for ``devL_X``; only used when ``test == True``.
        testL_X: sentences to predict labels for.
        test: when equal to True, tuning and fitting use train+dev;
            otherwise only the training set is used.

    Returns:
        The output of the fitted pipeline's ``predict`` on ``testL_X``.
    """

    def build_pipeline(alpha):
        # Unigram+bigram TF-IDF features (case preserved) feeding MultinomialNB.
        return Pipeline(steps=[
            ("vectorizer", TfidfVectorizer(encoding='utf-8', ngram_range=(1, 2), lowercase=False)),
            ("mnb", MultinomialNB(alpha=alpha)),
        ])

    # Decide once which data is used for both tuning and the final fit.
    if test == True:
        fit_X, fit_Y = trainL_X + devL_X, trainL_Y + devL_Y
    else:
        fit_X, fit_Y = trainL_X, trainL_Y

    # Mean 5-fold CV accuracy for every candidate alpha.
    candidate_alphas = [0.01, 0.1, 0.4, 0.8, 1, 2]
    scored = [
        (alpha, cross_val_score(build_pipeline(alpha), fit_X, fit_Y, cv=5, scoring="accuracy").mean())
        for alpha in candidate_alphas
    ]

    # Highest mean accuracy first; the stable sort keeps the earlier candidate on ties.
    best_alpha = sorted(scored, key=lambda pair: pair[1], reverse=True)[0][0]
    print("Best alpha:", best_alpha)

    # Refit from scratch with the selected alpha and label the requested sentences.
    final_model = build_pipeline(best_alpha)
    final_model.fit(fit_X, fit_Y)
    return final_model.predict(testL_X)
    
    


In [12]:
# # for dev set predictions -- using train set for training
# predL = mnb_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)


# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_NB = predL.copy()


In [13]:
# Naive Bayes predictions for the test set; test=True makes the function tune
# and fit on train+dev.
predL = mnb_classification(
    trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=True
)

# Keep a separate copy, since predL is reassigned by the later classifier cells.
predictionsL_NB = predL.copy()


Best alpha: 0.1


In [14]:
# def knn_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=False):
#     # KNN 

#     # hyper-parameter tuning (on the train set) for the smoothing parameter k
#     k_setL = list(range(10, 0, -1))

#     # using cross validation
#     cv_scoreL = []
#     for k in k_setL:
#         knn_classifier = Pipeline( steps = [("vectorizer", TfidfVectorizer(encoding='utf-8', binary=True, ngram_range=(1, 2))), ("knn", knn(n_neighbors=k, metric = "euclidean"))] )
#         # binary=True, smooth_idf=False, 

#         # using a 5-fold cross validation
#         if test == True:
#             cv_score = cross_val_score(knn_classifier, trainL_X+devL_X, trainL_Y+devL_X, cv=5, scoring="accuracy").mean()
#         else:
#             cv_score = cross_val_score(knn_classifier, trainL_X, trainL_Y, cv=5, scoring="accuracy").mean()

#         cv_scoreL.append((k, cv_score))
#     #     print(cv_score)

#     # We use the parameter with the highest CV accuracy to train our MNB classifier model
#     cv_scoreL.sort(reverse=True, key=lambda x:x[1])
#     best_k = cv_scoreL[0][0]

#     print("Best k", best_k)
    
    
#     knn_classifier = Pipeline( steps = [("vectorizer", TfidfVectorizer(encoding='utf-8', binary=True, ngram_range=(1, 2))), ("knn", knn(n_neighbors=best_k, metric = "euclidean"))] )

#     if test == True:
#         knn_classifier.fit(trainL_X+devL_X, trainL_Y+devL_Y)
#     else:
#         knn_classifier.fit(trainL_X, trainL_Y)

#     # Binary label predictions for the cyphertexts; tested on dev data
#     predictionsL = knn_classifier.predict(testL_X)
    
    
#     return predictionsL
    
    
    

In [15]:
# # for dev set predictions -- using train set for training
# predL = knn_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_KNN = predL.copy()


In [16]:
# # for test set predictions -- using train+dev sets for training
# predL = knn_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=True)


# predictionsL_KNN = predL.copy()


In [17]:
# the actual model runs better with smooth_idf=False
# however, including that in the CV gives a worse best_C
# look here

# !!! ???
# at cv=5, we get c=100 which is good
# at cv=10, we get c=30 which is bad

# same with using smooth_idf=False; get better accuracy without it

# test these things on randomized train+dev sets; overview function
# same with NB model


In [18]:
def logit_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=False):
    """Label sentences with a binary TF-IDF + logistic regression pipeline.

    The regularization parameter C is chosen by 5-fold cross-validated
    accuracy. The final pipeline additionally sets ``smooth_idf=False`` on the
    vectorizer, which the search pipelines do not.

    Args:
        trainL_X: training sentences.
        trainL_Y: labels for ``trainL_X``.
        devL_X: dev sentences; only used when ``test == True``.
        devL_Y: labels for ``devL_X``; only used when ``test == True``.
        testL_X: sentences to predict labels for.
        test: when equal to True, tuning and fitting use train+dev;
            otherwise only the training set is used.

    Returns:
        A list of predicted labels, one per entry of ``testL_X``.
    """

    def build_pipeline(C, **vectorizer_overrides):
        # Binary unigram+bigram TF-IDF (case preserved) feeding LogisticRegression.
        return Pipeline(steps=[
            ("vectorizer", TfidfVectorizer(encoding='utf-8', binary=True, lowercase=False,
                                           ngram_range=(1, 2), **vectorizer_overrides)),
            ("logit", LogisticRegression(C=C)),
        ])

    # Decide once which data is used for both tuning and the final fit.
    if test == True:
        fit_X, fit_Y = trainL_X + devL_X, trainL_Y + devL_Y
    else:
        fit_X, fit_Y = trainL_X, trainL_Y

    # Mean 5-fold CV accuracy for every candidate C.
    candidate_Cs = [1, 5, 10, 30, 50, 100, 200, 500]
    scored = [
        (C, cross_val_score(build_pipeline(C), fit_X, fit_Y, cv=5, scoring="accuracy").mean())
        for C in candidate_Cs
    ]

    # Highest mean accuracy first; the stable sort keeps the earlier candidate on ties.
    best_C = sorted(scored, key=lambda pair: pair[1], reverse=True)[0][0]
    print("Best C:", best_C)

    # The final model disables IDF smoothing (the search above leaves it on).
    final_model = build_pipeline(best_C, smooth_idf=False)
    final_model.fit(fit_X, fit_Y)

    return list(final_model.predict(testL_X))
    


In [19]:
# # for dev set predictions -- using train set for training
# predL = logit_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_Logit = predL.copy()


In [20]:
# Logistic regression predictions for the test set; test=True makes the
# function tune and fit on train+dev.
predL = logit_classification(
    trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=True
)

# Keep a separate copy, since predL is reassigned by the later classifier cells.
predictionsL_Logit = predL.copy()


Best C: 30


In [21]:
def svc_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=False):
    """Label sentences with a TF-IDF + RBF-kernel SVC pipeline.

    The penalty parameter C is chosen from a deliberately tiny grid ([1, 10])
    using 4-fold cross-validated accuracy; the author's note says a larger
    search takes hours on a standard device.

    Args:
        trainL_X: training sentences.
        trainL_Y: labels for ``trainL_X``.
        devL_X: dev sentences; only used when ``test == True``.
        devL_Y: labels for ``devL_X``; only used when ``test == True``.
        testL_X: sentences to predict labels for.
        test: when equal to True, tuning and fitting use train+dev;
            otherwise only the training set is used.

    Returns:
        The output of the fitted pipeline's ``predict`` on ``testL_X``.
    """

    def build_pipeline(C):
        # Unigram+bigram TF-IDF features feeding an RBF-kernel SVC.
        return Pipeline(steps=[
            ("vectorizer", TfidfVectorizer(encoding='utf-8', ngram_range=(1, 2))),
            ("svc", SVC(kernel="rbf", C=C)),
        ])

    # Decide once which data is used for both tuning and the final fit.
    if test == True:
        fit_X, fit_Y = trainL_X + devL_X, trainL_Y + devL_Y
    else:
        fit_X, fit_Y = trainL_X, trainL_Y

    # Mean 4-fold CV accuracy for every candidate C.
    candidate_Cs = [1, 10]
    scored = [
        (C, cross_val_score(build_pipeline(C), fit_X, fit_Y, cv=4, scoring="accuracy").mean())
        for C in candidate_Cs
    ]

    # Highest mean accuracy first; the stable sort keeps the earlier candidate on ties.
    best_C = sorted(scored, key=lambda pair: pair[1], reverse=True)[0][0]
    print("Best C:", best_C)

    # Refit from scratch with the selected C and label the requested sentences.
    final_model = build_pipeline(best_C)
    final_model.fit(fit_X, fit_Y)
    return final_model.predict(testL_X)

    

In [22]:
# # for dev set predictions -- using train set for training
# predL = svc_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_SVC = predL.copy()


In [23]:
# SVC predictions for the test set; test=True makes the function tune and fit
# on train+dev.
predL = svc_classification(
    trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=True
)

# Keep a separate copy, since predL is reassigned by the later classifier cells.
predictionsL_SVC = predL.copy()


Best C: 10


In [24]:
# Perceptron (ie FFNN)

# clear_session()

In [25]:
# batch_size=128 -> 0.8846
# sometimes 0.8806;  NN has randomness

# for predictionsL (for test set)
# rerun this 5 (???) times and get its majority voting

# do a grid cross_val
# optimizer: adam, sgd, adamaz, nadam, rmsprop
# loss:
# unit size
# activation 

# neural networks use a dev/validation set when fitting
# So, we cannot join the train+dev unlike the other classifiers


In [26]:
def ffnn_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X):
    """Label sentences with a one-hidden-layer feed-forward network on TF-IDF features.

    The vectorizer is fitted on ``trainL_X`` only. The network is trained on
    the training set with the dev set passed to Keras as validation data, and
    then predicts labels for ``testL_X``.

    Args:
        trainL_X: training sentences.
        trainL_Y: labels for ``trainL_X``.
        devL_X: dev sentences, used as validation data during fitting.
        devL_Y: labels for ``devL_X``.
        testL_X: sentences to predict labels for.

    Returns:
        A list with one rounded network output per entry of ``testL_X``.
    """
    tfidf = TfidfVectorizer(encoding='utf-8', binary=True, lowercase=False, ngram_range=(1, 2))
    tfidf.fit(trainL_X)

    # Vectorize all three splits with the train-fitted vocabulary and convert
    # each result with .toarray().
    train_vec, dev_vec, test_vec = (
        tfidf.transform(texts).toarray() for texts in (trainL_X, devL_X, testL_X)
    )

    network = Sequential()
    # Hidden layer: 10 ReLU units; input width equals the number of TF-IDF features.
    network.add(layers.Dense(10, input_dim=train_vec.shape[1], activation='relu'))
    # Output layer: a single sigmoid unit.
    network.add(layers.Dense(1, activation='sigmoid'))
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    network.fit(
        train_vec,
        np.array(trainL_Y),
        epochs=11,
        verbose=False,
        validation_data=(dev_vec, np.array(devL_Y)),
        batch_size=128,
    )

    # round() maps each sigmoid output to a binary label.
    predicted = [round(row[0]) for row in network.predict(test_vec)]

    # Clear the Keras backend session now that the predictions are extracted.
    clear_session()

    return predicted

    

In [27]:
# # for dev set predictions -- using train set for training
# predL = ffnn_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_NN = predL.copy()


In [28]:
# Neural-network predictions for the test set. The network trains on the train
# set only; the dev set is passed in separately as validation data, so
# train+dev cannot be merged here.
predL = ffnn_classification(
    trainL_X, trainL_Y, devL_X, devL_Y, testL_X
)

# Keep a separate copy, since predL is reassigned by the later classifier cells.
predictionsL_NN = predL.copy()


In [29]:
def ensemble_voting3(predL_classifier1, predL_classifier2, predL_classifier3):
    """Combine three sequences of 0/1 predictions by per-position majority vote.

    Positions are paired with ``zip``, so the result is no longer than the
    shortest input. Votes that are neither 0 nor 1 are not counted, and a
    position whose counted votes tie contributes nothing to the output.

    Args:
        predL_classifier1: predictions of the first classifier.
        predL_classifier2: predictions of the second classifier.
        predL_classifier3: predictions of the third classifier.

    Returns:
        A list of winning labels (0 or 1).
    """
    majority_labels = []

    for votes in zip(predL_classifier1, predL_classifier2, predL_classifier3):
        zeros = sum(1 for vote in votes if vote == 0)
        ones = sum(1 for vote in votes if vote == 1)

        if zeros > ones:
            majority_labels.append(0)
        elif ones > zeros:
            majority_labels.append(1)

    return majority_labels


In [33]:
def rforest_classification(trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=False):
    """Label sentences with a majority vote over three TF-IDF + random forest runs.

    Three pipelines are fitted independently because, per the author's note,
    run-to-run variability of the forest is significant. Their predictions for
    ``testL_X`` are merged with ``ensemble_voting3``.

    Args:
        trainL_X: training sentences.
        trainL_Y: labels for ``trainL_X``.
        devL_X: dev sentences; only used when ``test == True``.
        devL_Y: labels for ``devL_X``; only used when ``test == True``.
        testL_X: sentences to predict labels for.
        test: when equal to True, the forests are fitted on train+dev;
            otherwise only on the training set.

    Returns:
        The list of majority-vote labels produced by ``ensemble_voting3``.
    """
    # Decide once which data the three forests are fitted on.
    if test == True:
        fit_X, fit_Y = trainL_X + devL_X, trainL_Y + devL_Y
    else:
        fit_X, fit_Y = trainL_X, trainL_Y

    run_predictions = []
    for _ in range(3):
        # A brand-new pipeline per run: unigram+bigram TF-IDF feeding a
        # 700-tree forest with bootstrap sampling turned off.
        forest = Pipeline(steps=[
            ("vectorizer", TfidfVectorizer(encoding='utf-8', ngram_range=(1, 2))),
            ("forest", RandomForestClassifier(n_estimators=700, bootstrap=False)),
        ])
        forest.fit(fit_X, fit_Y)
        run_predictions.append(forest.predict(testL_X))

    return ensemble_voting3(*run_predictions)
    

    

In [34]:
# # for dev set predictions -- using train set for training
# predL = rforest_classification(trainL_X, trainL_Y, devL_X, devL_Y, devL_X)

# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, predL):
#     if label != pred:
#         errors += 1

# accuracy = 1-(errors/total)
# print(accuracy)

# predictionsL_RF = predL.copy()



In [35]:
# Random forest predictions for the test set; test=True makes the function fit
# on train+dev.
predL = rforest_classification(
    trainL_X, trainL_Y, devL_X, devL_Y, testL_X, test=True
)

# Keep a separate copy of the predictions under a classifier-specific name.
predictionsL_RF = predL.copy()



In [36]:
# Improving my predictor using ensemble learning: majority (hard) voting
# For each observation, I take the majority vote of (so far) 3 ~89.7% classifiers
# This "combination" of predictions hopefully yields better results; max is ~90% so far


In [37]:
# predictionsL_SVC
# predictionsL_Logit
# predictionsL_KNN
# predictionsL_NB
# predictionsL_NN

# predictionsL_KNN = predictionsL.copy()
    # only have an accuracy of ~87%


# Since it's a binary classification, aggregating 3 algorithms should be enough
# at least it's acceptable, until I get 2 more good (>89 ?) models


In [38]:
# def ensemble_voting4(predL_classifier1, predL_classifier2, predL_classifier3, predL_classifier4):

#     predictionsL_MajVote = []

#     for label1, label2, label3, label4 in zip(predL_classifier1, predL_classifier2, predL_classifier3, predL_classifier4):
#         zero = 0
#         one = 0

#         for label in (label1, label2, label3, label4):
#             if label == 0:
#                 zero += 1
#             elif label == 1:
#                 one += 1

#         if zero > one:
#             predictionsL_MajVote.append(0)
#         elif one > zero:
#             predictionsL_MajVote.append(1)
#         elif one == zero:
#             predictionsL_MajVote.append(label1)
#             # tie breaker goes to Logit as it has the highest accuracy
    
#     return predictionsL_MajVote
        

In [52]:
def ensemble_voting5(predL_classifier1, predL_classifier2, predL_classifier3, predL_classifier4, predL_classifier5, *, verbose=False):
    """Combine five sequences of 0/1 predictions by per-position majority vote.

    Positions are paired with ``zip``, so the result is no longer than the
    shortest input. Votes that are neither 0 nor 1 are not counted, and a
    position whose counted votes tie contributes nothing to the output.

    Args:
        predL_classifier1: predictions of the first classifier.
        predL_classifier2: predictions of the second classifier.
        predL_classifier3: predictions of the third classifier.
        predL_classifier4: predictions of the fourth classifier.
        predL_classifier5: predictions of the fifth classifier.
        verbose: keyword-only; when true, print the five votes of every
            position (one line per position). Off by default so that a call
            over the full test set does not flood the output.

    Returns:
        A list of winning labels (0 or 1).
    """
    predictionsL_MajVote = []

    for votes in zip(predL_classifier1, predL_classifier2, predL_classifier3,
                     predL_classifier4, predL_classifier5):
        zero = sum(1 for vote in votes if vote == 0)
        one = sum(1 for vote in votes if vote == 1)

        if zero > one:
            predictionsL_MajVote.append(0)
        elif one > zero:
            predictionsL_MajVote.append(1)

        # Optional per-row trace of the individual votes (debugging aid).
        if verbose:
            print(*votes)

    return predictionsL_MajVote
            

In [53]:
# ensemble_voting5(predictionsL_Logit, predictionsL_NB, predictionsL_SVC, predictionsL_RF, predictionsL_NN)


0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 1 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 0 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 1 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 0 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 1 1 0 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 1 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 1 0
0 0 0 0 0
1 0 0 0 1


1 1 1 0 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 1 1 0 1
1 1 1 1 0
0 1 0 0 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 1 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 1 0 0 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 0 1
1 1 1 1 1
1 1 1 1 1
0 0 0 1 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 0 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 0 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 1 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0


0 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 1 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 0 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 0 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 1 0 1 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
0 0 0 0 0
1 1 1 1 1
0 0 0 0 0
1 0 0 1 1
1 0 1 1 1
0 0 0 0 0
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1
1 1 1 1 1


[0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,


In [40]:
# majVote_predL = []
# majVote_predL = ensemble_voting5(predictionsL_Logit, predictionsL_NB, predictionsL_SVC, predictionsL_KNN, predictionsL_NN)
# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, majVote_predL):
#     if label != pred:
#         errors += 1
# accuracy = 1-(errors/total)
# print(accuracy)
        

In [45]:
# majVote_predL = []
# majVote_predL = ensemble_voting5(predictionsL_Logit, predictionsL_NB, predictionsL_SVC, predictionsL_RF, predictionsL_NN)
# total = len(devL_Y)
# errors = 0

# for label, pred in zip(devL_Y, majVote_predL):
#     if label != pred:
#         errors += 1
# accuracy = 1-(errors/total)
# print(accuracy)


In [42]:
# ensemble with KNN: 0.9003453379378392
# ensemble with RF: 0.900838677849038

In [43]:
# testing data 

In [46]:
# Final test-set predictions: majority vote over the five classifiers.
# 'results' must end up as a list of 2028 values, each 0 or 1.
results = ensemble_voting5(
    predictionsL_Logit,
    predictionsL_NB,
    predictionsL_SVC,
    predictionsL_RF,
    predictionsL_NN,
)

# predictionsL_SVC
# predictionsL_Logit
# predictionsL_RF
# predictionsL_NB
# predictionsL_NN



In [47]:
# Sanity check: print how many predictions were produced (2028 are required).
print(len(results))

2028


In [48]:
# test_pred_setsL = (predictionsL_Logit, predictionsL_NB, predictionsL_SVC, predictionsL_RF, predictionsL_NN)

# for pred1 in test_pred_setsL:
#     for pred2 in test_pred_setsL:
#         total = len(pred1)
#         errors = 0

#         for label, pred in zip(pred1, pred2):
#             if label != pred:
#                 errors += 1

#         accuracy = 1-(errors/total)
#         print(accuracy)
#     print()


1.0
0.9571005917159763
0.985207100591716
0.945759368836292
0.9644970414201184

0.9571005917159763
1.0
0.9610453648915187
0.9245562130177515
0.9580867850098619

0.985207100591716
0.9610453648915187
1.0
0.9497041420118343
0.965483234714004

0.945759368836292
0.9245562130177515
0.9497041420118343
1.0
0.9289940828402367

0.9644970414201184
0.9580867850098619
0.965483234714004
0.9289940828402367
1.0



### Output Prediction Result File

You will need to submit a prediction result file. It should have 2028 lines, every line should be either 0 or 1, which is your model's prediction on the respective test set instance.

In [49]:
# 'results' must hold one prediction for each of the 2028 test cases read from
# test_enc_unlabeled.tsv.
assert len(results) == 2028

In [50]:
# Make sure the results are integers 0 and 1, not floating-point numbers.
results = list(map(int, results))

In [51]:
# Write the predictions to 'upload_predictions.txt', one per line, for upload.
with open('upload_predictions.txt', 'w', encoding = 'utf-8') as fp:
    fp.writelines(str(x) + '\n' for x in results)