Baseline F1-score Task A: 0.626344086022
One way to improve baseline: using emoticon-feature (though according to Barbieri, Saggion 2014 this feature is more effective with Humour corpora, because "ironic authors usually avoid emoticons and leave words to be the central thing").

Opinion lexicon: https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon

In [1]:
from nltk.tokenize import TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.datasets import dump_svmlight_file
from sklearn import metrics
import numpy as np
import logging
import codecs
from scipy import sparse

In [2]:
logging.basicConfig(level=logging.INFO)

In [3]:
def emoticons_dict(fz):
    z = {}
    with open(fz, 'rt', encoding="utf-8") as data_in:
        for line in data_in:
            emot = line.split()[0]
            label = line.split()[1]
            z[emot] = label
    return z

In [4]:
def polarity(fk):
    arr = []
    with open(fk, 'rt', encoding="utf-8") as data_in:
        try:
            for line in data_in:
                line = line.strip("\n")
                if fk.endswith("positive-words.txt"):
                    arr.append(line)
                else:
                    arr.append(line)
        except:
            pass
    return arr

In [5]:
def parse_dataset(fp):
    y = []
    corpus = []
    with open(fp, 'rt', encoding="utf-8") as data_in:
        for line in data_in:
            if not line.lower().startswith("tweet index"):
                line = line.rstrip()
                label = int(line.split("\t")[1])
                tweet = line.split("\t")[2]
                words_tweet = tweet.split()
                for item in words_tweet:
                    for key, value in dic.items():
                        if item == key:
                            tweet = tweet.replace(item, value)
                corpus.append(tweet)
                y.append(label)
    return corpus, y

In [6]:
def featurize(corpus):
    tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True).tokenize
    vectorizer = TfidfVectorizer(strip_accents="unicode", analyzer="word", tokenizer=tokenizer, stop_words="english")
    X = vectorizer.fit_transform(corpus).toarray()
    i = 0
    while i < len(corpus):
        tweet = corpus[i].split()
        for word in tweet:
            if word in neg and "happy" in tweet:
                X[i] = X[i] * (-1)
            if word in pos and "sad" in tweet:
                X[i] = X[i] * (-1)
        i +=1
    X = sparse.csr_matrix(X)
    return X

In [7]:
if __name__ == "__main__":
    EMOTICS_FZ = "C:/Users/1/Desktop/emoticons.txt"
    dic = emoticons_dict(EMOTICS_FZ)
    
    LIST_POS = "C:/Users/1/Desktop/positive-words.txt"
    pos = polarity(LIST_POS)
    
    LIST_NEG = "C:/Users/1/Desktop/negative-words.txt"
    neg = polarity(LIST_NEG)
    
    DATASET_FP = "C:/Users/1/Desktop/taskA.txt"
    
    TASK = "A"
    FNAME = 'C:/Users/1/Desktop/predictions-task' + TASK + '.txt'
    PREDICTIONSFILE = open(FNAME, "w")
    
    K_FOLDS = 10
    CLF = LinearSVC()

    corpus, y = parse_dataset(DATASET_FP)
    X = featurize(corpus)

    class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
    print (class_counts)
    
    predicted = cross_val_predict(CLF, X, y, cv=K_FOLDS)
    
    if TASK.lower() == 'a':
        score = metrics.f1_score(y, predicted, pos_label=1)
    elif TASK.lower() == 'b':
        score = metrics.f1_score(y, predicted, average="macro")
    print ("F1-score Task", TASK, score)
    for p in predicted:
        PREDICTIONSFILE.write("{}\n".format(p))
    PREDICTIONSFILE.close()

[[0, 1923], [1, 1911]]
F1-score Task A 0.630283574104
