In [1]:
"""loads the data, pre-trainied embeddings, feature sets, and trains a voting classifier and subsequently tests the model
    on the held-out test data"""

from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn import metrics
import numpy as np

import gensim.models

# import from the scripts provided by the creator(s) of Twitter Word2vec model to read pre-trained embeddings
# source: https://www.fredericgodin.com/software/
import word2vecReaderUtils as utils
from word2vecReader import *

from load import parse_dataset

In [2]:
# to run the following, download the word2vec Twitter model and set its path in the body of the function  
def bisectioned_embeddings_avg(corpus, wvModel=None, emojiModel=None):
    """Embed every tweet as the concatenated mean embeddings of its two halves.

    Each tweet is tokenized and split at ``len(tokens) // 2``. For each half,
    the word embeddings and the emoji embeddings are averaged separately and
    the four means are concatenated in this fixed order:

        (first-half words, first-half emojis, second-half words, second-half emojis)

    A group without any known token contributes a zero vector of the
    matching size, so every returned vector has the same length.

    Args:
        corpus: iterable of tweet strings.
        wvModel: optional pre-loaded Twitter word2vec model (must expose
            ``layer1_size``, ``in`` and ``[]``). When None, the model is
            loaded from the path hard-coded below. Passing a loaded model
            avoids re-reading the large binary on every call.
        emojiModel: optional pre-loaded emoji2vec ``KeyedVectors``. When
            None, it is loaded from the path hard-coded below.

    Returns:
        A list with one 1-D numpy array per tweet, each of length
        ``2 * (word vector size + emoji vector size)``.
    """
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    if wvModel is None:
        # SET THE PATH TO TWITTER WORD2VEC MODEL
        wvModel = Word2Vec.load_word2vec_format('../install_dependencies/word2vec_twitter_model.bin', binary=True)
    wvModel_size = wvModel.layer1_size
    if emojiModel is None:
        # SET THE PATH TO TWITTER EMOJI2VEC MODEL
        emojiModel = gensim.models.KeyedVectors.load_word2vec_format('../install_dependencies/emoji2vec.bin', binary=True)
    emojiModel_size = emojiModel.vector_size

    meanVectors = []
    for tweet in corpus:
        tokens = tokenizer(tweet)
        middle = len(tokens) // 2
        # first half = tokens[:middle], second half = tokens[middle:]
        # (for an odd token count the second half gets the extra token)
        firstWords, firstEmojis = _half_embedding_means(
            tokens[:middle], wvModel, wvModel_size, emojiModel, emojiModel_size)
        secondWords, secondEmojis = _half_embedding_means(
            tokens[middle:], wvModel, wvModel_size, emojiModel, emojiModel_size)
        # concatenate all the vector averages; the order is part of the
        # feature layout the classifiers are trained on, so keep it stable
        meanVectors.append(np.concatenate((firstWords, firstEmojis, secondWords, secondEmojis)))

    return meanVectors


def _half_embedding_means(tokens, wvModel, wvModel_size, emojiModel, emojiModel_size):
    """Return (mean word vector, mean emoji vector) for one half of a tweet."""
    # don't look up embeddings for generic '@user'
    wordVectors = [wvModel[tok] for tok in tokens
                   if tok in wvModel and not tok.startswith('@')]
    emojiVectors = [emojiModel[tok] for tok in tokens if tok in emojiModel]
    return (_mean_or_zeros(wordVectors, wvModel_size),
            _mean_or_zeros(emojiVectors, emojiModel_size))


def _mean_or_zeros(vectors, size):
    """Return the element-wise mean of `vectors`, or `size` zeros when the list is empty."""
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    return np.zeros(size)

In [None]:
# Experiment settings

DATASET_FP = "../datasets/train/SemEval2018-T3-train-taskA_emoji.txt"
TASK = "A" # Define, A or B
if TASK.lower() not in ('a', 'b'):
    # fail before the expensive embedding / cross-validation steps instead of
    # hitting a NameError on `score` when the metrics are reported
    raise ValueError("TASK must be 'A' or 'B', got {!r}".format(TASK))
FNAME = './predictions-task' + TASK + '.txt'
EXTRA_FEATURES = 1 # we set this flag to 1 when we want to use handcrafted features in combination with embeddings 

K_FOLDS = 10 # 10-fold crossvalidation

random_state=11
# These CLF's are defined based on the code output of the original repository
CLF1 = SVC(C=1.0, 
           cache_size=200, 
           class_weight=None, 
           coef0=0.0,
           # the original repr showed None, which current scikit-learn rejects;
           # 'ovr' is the library default and only changes the shape of
           # decision_function, not predict / predict_proba used by soft voting
           decision_function_shape='ovr', 
           degree=3, 
           gamma='auto', 
           kernel='rbf',
           max_iter=-1, 
           probability=True, 
           random_state=random_state, 
           shrinking=True,
           tol=0.001, 
           verbose=False)
# Previous, less verbose code: SVC(random_state=random_state, probability=True)

CLF2 = LogisticRegression(C=1.0, 
                          class_weight=None, 
                          dual=False, 
                          fit_intercept=True,
                          intercept_scaling=1,
                          l1_ratio=None,
                          max_iter=100,
                          multi_class='auto', 
                          n_jobs=-1,
                          penalty='l2', 
                          random_state=random_state, 
                          solver='liblinear', 
                          tol=0.0001,
                          verbose=0, 
                          warm_start=False)
# Previous, less verbose code: LogisticRegression(random_state=random_state, n_jobs=-1) 

# soft voting averages the predict_proba outputs of both estimators
CLF = VotingClassifier(estimators=[('svm', CLF1), ('lr', CLF2)], 
                       voting='soft', 
                       n_jobs=-1, 
                       weights=None)
# Previous, less verbose code: VotingClassifier(estimators=[('svm', CLF1), ('lr', CLF2)], voting='soft', n_jobs=-1)

# Loading dataset 
corpus, y = parse_dataset(DATASET_FP)

X = bisectioned_embeddings_avg(corpus)

if EXTRA_FEATURES:
    # pass the paths to np.load so it opens and closes the files itself
    extraFeatures = np.load('train_feats_taskA.npy')
    # NOTE(review): allow_pickle=True executes pickled code on load; only use
    # an './indices' file from a trusted source
    indices = np.load('./indices', allow_pickle=True)

    # keep only the handcrafted feature columns selected by `indices`
    extraFeatures = [extraF[indices] for extraF in extraFeatures]

    # append the selected handcrafted features to each tweet's embedding vector
    for i, embedding in enumerate(X):
        X[i] = np.concatenate((embedding, extraFeatures[i]))

class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
print ("class counts:",class_counts)

# Returns an array of the same size as 'y' where each entry is a prediction obtained by cross validated
predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)

# Modify F1-score calculation depending on the task
if TASK.lower() == 'a':
    score = metrics.f1_score(y, predicted, pos_label=1)
    p = metrics.precision_score(y, predicted, pos_label=1)
    r = metrics.recall_score(y, predicted, pos_label=1)
    acc = metrics.accuracy_score(y, predicted)
elif TASK.lower() == 'b':
    # if you set average to None, it will return results for each class separately 
    score = metrics.f1_score(y, predicted, average=None)
    # macro-averaged F1; computed for inspection, not printed below
    score_ = metrics.f1_score(y, predicted, average='macro')
    p = metrics.precision_score(y, predicted, average="macro")
    r = metrics.recall_score(y, predicted, average="macro")
    acc = metrics.accuracy_score(y, predicted)
print ("F1-score Task", TASK, score)
print ("Precision Task", TASK, p)
print ("Recall Task", TASK, r)
print ("Accuracy Task", TASK, acc)

# open the output file only when there is something to write, and let the
# context manager close it even if a write fails; the loop variable is not
# named `p` so the precision value above stays intact
with open(FNAME, "w") as PREDICTIONSFILE:
    for prediction in predicted:
        PREDICTIONSFILE.write("{}\n".format(prediction))

In [None]:
# Train the voting classifier CLF on the full training features X and labels y.
# X and y must be the training data produced by the experiment cell; re-run
# that cell first if either name has been rebound since.
print("Fit on the whole Train ...")
CLF.fit(X, y)

In [None]:
# save the model 
# import pickle
# filename = 'finalized_model.sav'
# pickle.dump(CLF, open(filename, 'wb'))

## if later you want to load the model, execute the following
# loaded_model = pickle.load(open(filename, 'rb'))

In [None]:
print("Ready to TEST")

# keep the gold test labels in their own name so the training labels `y`
# are not clobbered (re-running the fit cell would otherwise pair the
# training features with test labels)
test_corpus, y_test = parse_dataset('../datasets/goldtest_TaskA/SemEval2018-T3_gold_test_taskA_emoji.txt')

X_test = bisectioned_embeddings_avg(test_corpus)
print("len(X_test)", len(X_test))

# mirror the training-time feature layout: handcrafted features (and the
# `indices` column selection) exist only when EXTRA_FEATURES is set
if EXTRA_FEATURES:
    # pass the path to np.load so it opens and closes the file itself
    test_extraFeatures = np.load('./test_feats.npy')
    test_extraFeatures = [extraF[indices] for extraF in test_extraFeatures]
    for i, embedding in enumerate(X_test):
        X_test[i] = np.concatenate((embedding, test_extraFeatures[i]))

print("test X dimension",len(X_test[0]))

y_test_predicted = CLF.predict(X_test)

# NOTE: with TASK = "A" this is the same file as FNAME, so the
# cross-validation predictions written earlier are overwritten here
with open('predictions-taskA.txt', 'w') as f:
    for yp in y_test_predicted:
        f.write(str(yp)+"\n")

score = metrics.f1_score(y_test, y_test_predicted, pos_label=1)
p = metrics.precision_score(y_test, y_test_predicted, pos_label=1)
r = metrics.recall_score(y_test, y_test_predicted, pos_label=1)
acc = metrics.accuracy_score(y_test, y_test_predicted)

print ("F1-score Task", TASK, score)
print ("Precision Task", TASK, p)
print ("Recall Task", TASK, r)
print ("Accuracy Task", TASK, acc)

In [None]:
# Persist the fitted voting classifier to disk.
import pickle
filename = 'finalized_model_crowd.sav'
# use a context manager so the file is flushed and closed deterministically
with open(filename, 'wb') as model_file:
    pickle.dump(CLF, model_file)