In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
import string
import nltk 
from wordsegment import load, segment
from nltk.corpus import sentiwordnet as swn
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

import re 
# NLTK resources needed further down in this notebook:
#   stopwords                  -> stop-word lists passed to CountVectorizer
#   sentiwordnet, wordnet      -> sentiwordnet() feature extractor
#   punkt                      -> word_tokenize()
#   averaged_perceptron_tagger -> pos_tag()
nltk.download('stopwords')
nltk.download('sentiwordnet')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/sentiwordnet.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [None]:
# Third-party packages imported by this notebook (wordsegment, vaderSentiment, afinn).
# NOTE(review): on a fresh runtime this cell has to run before the cells that import them.
! pip install wordsegment
! pip install vaderSentiment
! pip install Afinn

Collecting wordsegment
[?25l  Downloading https://files.pythonhosted.org/packages/cf/6c/e6f4734d6f7d28305f52ec81377d7ce7d1856b97b814278e9960183235ad/wordsegment-1.3.1-py2.py3-none-any.whl (4.8MB)
[K     |████████████████████████████████| 4.8MB 8.4MB/s 
[?25hInstalling collected packages: wordsegment
Successfully installed wordsegment-1.3.1
Collecting vaderSentiment
[?25l  Downloading https://files.pythonhosted.org/packages/76/fc/310e16254683c1ed35eeb97386986d6c00bc29df17ce280aed64d55537e9/vaderSentiment-3.3.2-py2.py3-none-any.whl (125kB)
[K     |████████████████████████████████| 133kB 4.5MB/s 
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Collecting Afinn
[?25l  Downloading https://files.pythonhosted.org/packages/86/e5/ffbb7ee3cca21ac6d310ac01944fb163c20030b45bda25421d725d8a859a/afinn-0.1.tar.gz (52kB)
[K     |████████████████████████████████| 61kB 3.4MB/s 
[?25hBuilding wheels for collected packages: Afinn
  Building wheel for Afinn 

**Feature Creation**

In [None]:
# VADER sentiment scores
def feature_extractor_vedar(sentences):
    """Score every sentence with VADER.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score.

    Returns
    -------
    numpy.ndarray
        One ``[pos, neg, neu, compound]`` row per sentence, taken from the
        dictionary returned by ``polarity_scores``.
    """
    analyzer = SentimentIntensityAnalyzer()
    score_keys = ('pos', 'neg', 'neu', 'compound')

    rows = []
    for text in sentences:
        scores = analyzer.polarity_scores(text)
        rows.append([scores[key] for key in score_keys])

    return np.array(rows)

In [None]:
# senti word net features
from nltk import sent_tokenize, word_tokenize, pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

def penn_to_wn(tag):
    """Map a Penn Treebank POS tag to the corresponding WordNet POS constant.

    Only the first character of ``tag`` matters: ``J`` -> adjective,
    ``N`` -> noun, ``R`` -> adverb, ``V`` -> verb. Any other tag (including
    an empty string) yields ``None``.
    """
    initial = tag[:1]
    if initial == 'J':
        return wn.ADJ
    if initial == 'N':
        return wn.NOUN
    if initial == 'R':
        return wn.ADV
    if initial == 'V':
        return wn.VERB
    return None

def sentiwordnet(sentences):
    """Sum SentiWordNet positive/negative scores over each sentence.

    Each sentence is tokenised and POS-tagged. Only tokens tagged as noun,
    adjective or adverb are scored (verbs and everything else are skipped).
    A token is lemmatised, and the scores of its first WordNet synset are
    added to the sentence totals.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score.

    Returns
    -------
    numpy.ndarray
        One ``[positive_total, negative_total]`` row per sentence.
    """
    rows = []
    for text in sentences:
        positive_total = 0
        negative_total = 0
        for token, penn_tag in pos_tag(word_tokenize(text)):
            wordnet_pos = penn_to_wn(penn_tag)
            if wordnet_pos in (wn.NOUN, wn.ADJ, wn.ADV):
                base_form = lemmatizer.lemmatize(token, pos=wordnet_pos)
                # An empty lemma is treated the same as "no synsets found".
                candidates = wn.synsets(base_form, pos=wordnet_pos) if base_form else []
                if candidates:
                    # Only the first listed sense is scored.
                    sense = swn.senti_synset(candidates[0].name())
                    positive_total += sense.pos_score()
                    negative_total += sense.neg_score()
        rows.append([positive_total, negative_total])
    return np.array(rows)

In [None]:
from afinn import Afinn

def affin_features(sentences):
    """Return the AFINN score of every sentence.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score.

    Returns
    -------
    numpy.ndarray
        One single-element ``[score]`` row per sentence.
    """
    scorer = Afinn()
    return np.array([[scorer.score(text)] for text in sentences])

In [None]:
def mpqalexicon(sentences, lexicon_path='/content/lexicons/lexicons/2. mpqa.txt'):
    """Count MPQA-lexicon polar words in each sentence.

    The lexicon is read as one ``word<TAB>label`` entry per line; lines that
    do not split into exactly two tab-separated fields are ignored.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        One ``[pos_count, neg_count]`` row per sentence.
    """
    polarity_by_word = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) == 2:
                polarity_by_word[fields[0]] = fields[1]

    rows = []
    for sent in sentences:
        pos_score = 0
        neg_score = 0
        for word in sent.split(' '):
            label = polarity_by_word.get(word)
            if label is None:
                continue
            # Any label other than 'negative' is counted as positive.
            if label == 'negative':
                neg_score += 1
            else:
                pos_score += 1
        rows.append([pos_score, neg_score])

    return np.array(rows)


In [None]:
def bingliu_lexicon(sentences, lexicon_path='/content/lexicons/lexicons/1. BingLiu.csv'):
    """Count Bing Liu lexicon polar words in each sentence.

    The file is loaded with ``pandas.read_csv`` (so its first line is taken
    as the header and never becomes a lexicon entry). The first column of
    each remaining row is expected to hold ``word<TAB>label``; rows whose
    first cell is not a string or does not split into exactly two fields are
    ignored.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        One ``[pos_count, neg_count]`` row per sentence.
    """
    polarity_by_word = {}
    for row in pd.read_csv(lexicon_path).to_numpy():
        cell = row[0]
        # Blank cells come back as NaN (a float) and cannot be split.
        if not isinstance(cell, str):
            continue
        fields = cell.split('\t')
        if len(fields) == 2:
            polarity_by_word[fields[0]] = fields[1]

    rows = []
    for sent in sentences:
        pos_score = 0
        neg_score = 0
        for word in sent.split(' '):
            label = polarity_by_word.get(word)
            if label is None:
                continue
            # Any label other than 'negative' is counted as positive.
            if label == 'negative':
                neg_score += 1
            else:
                pos_score += 1
        rows.append([pos_score, neg_score])

    return np.array(rows)


In [None]:
def sentiment_140_lexicon(sentences,
                          lexicon_path='/content/lexicons/lexicons/3. Sentiment140-Lexicon-v0.1/unigrams-pmilexicon.txt'):
    """Aggregate Sentiment140 unigram lexicon values over each sentence.

    The lexicon is read as ``word<TAB>v1<TAB>v2<TAB>v3`` lines; lines that do
    not split into exactly four fields are ignored. For every sentence the
    three numeric columns of all matching tokens are summed, column by column.

    NOTE(review): the meaning of the three columns is not visible from this
    notebook (presumably score, positive count, negative count - confirm
    against the lexicon's README).

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        One ``[sum_v1, sum_v2, sum_v3]`` row per sentence.
    """
    values_by_word = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) == 4:
                values_by_word[fields[0]] = fields[1:]

    rows = []
    for sent in sentences:
        totals = [0, 0, 0]
        for word in sent.split(' '):
            values = values_by_word.get(word)
            if values is not None:
                # Values are stored as text and converted only when matched.
                for column, value in enumerate(values):
                    totals[column] += float(value)
        rows.append(totals)

    return np.array(rows)

In [None]:
def nrc_hastag_senti(sentences,
                     lexicon_path='/content/lexicons/lexicons/7. NRC-Hashtag-Sentiment-Lexicon-v0.1/unigrams-pmilexicon.txt'):
    """Aggregate NRC Hashtag Sentiment unigram lexicon values over each sentence.

    The lexicon is read as ``word<TAB>score<TAB>pos<TAB>neg`` lines (column
    names as used by this notebook); lines that do not split into exactly
    four fields are ignored.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        One ``[total_score, pos_score, neg_score]`` row per sentence.
    """
    values_by_word = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) == 4:
                values_by_word[fields[0]] = fields[1:]

    rows = []
    for sent in sentences:
        total_score = 0
        pos_score = 0
        neg_score = 0
        for word in sent.split(' '):
            values = values_by_word.get(word)
            if values is not None:
                total_score += float(values[0])
                pos_score += float(values[1])
                neg_score += float(values[2])
        rows.append([total_score, pos_score, neg_score])

    return np.array(rows)


In [None]:
def nrc_word_emotion_lexicon(sentences,
                             lexicon_path='/content/lexicons/lexicons/8. NRC-word-emotion-lexicon.txt',
                             emotion='anger'):
    """Sum NRC Word-Emotion lexicon scores for one emotion over each sentence.

    The lexicon is read as ``word<TAB>emotion<TAB>score`` lines; lines that
    do not split into exactly three fields are ignored, as are entries for
    any emotion other than ``emotion``.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.
    emotion : str, optional
        Emotion label whose entries are kept. Defaults to ``'anger'``.

    Returns
    -------
    numpy.ndarray
        One single-element ``[total_score]`` row per sentence.
    """
    score_by_word = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) == 3:
                entry_word, entry_emotion, entry_score = fields
                if entry_emotion == emotion:
                    score_by_word[entry_word] = entry_score

    rows = []
    for sent in sentences:
        total_score = 0
        for word in sent.split(' '):
            if word in score_by_word:
                total_score += float(score_by_word[word])
        rows.append([total_score])

    return np.array(rows)

In [None]:
def nrc_10_expanded(sentences,
                    lexicon_path='/content/lexicons/lexicons/6. NRC-10-expanded.csv',
                    score_column=5):
    """Sum one NRC-10 Expanded lexicon column over each sentence.

    The file is loaded with ``pandas.read_csv``; lines that pandas cannot
    parse into the expected number of comma-separated fields are skipped.
    The first cell of each remaining row is expected to hold eleven
    tab-separated fields: the word followed by ten scores. Rows that do not
    split into exactly eleven fields are ignored.

    NOTE(review): ``score_column`` defaults to 5 because that index was
    hard-coded here; confirm that it is the column for the emotion being
    modelled.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.
    score_column : int, optional
        Index (within the eleven tab-separated fields) of the score to use.

    Returns
    -------
    numpy.ndarray
        One single-element ``[total_score]`` row per sentence.
    """
    try:
        # pandas >= 1.3 spelling; ``error_bad_lines`` was removed in pandas 2.0.
        table = pd.read_csv(lexicon_path, on_bad_lines='skip')
    except TypeError:
        # Older pandas releases only understand the original keyword.
        table = pd.read_csv(lexicon_path, error_bad_lines=False)

    score_by_word = {}
    for row in table.to_numpy():
        cell = row[0]
        # Blank cells come back as NaN (a float) and cannot be split.
        if not isinstance(cell, str):
            continue
        fields = cell.split('\t')
        if len(fields) == 11:
            score_by_word[fields[0]] = fields[score_column]

    rows = []
    for sent in sentences:
        total_score = 0
        for word in sent.split(' '):
            if word in score_by_word:
                total_score += float(score_by_word[word])
        rows.append([total_score])

    return np.array(rows)

In [None]:
def nrc_hashtag_emotion_lexicon(sentences,
                                lexicon_path='/content/lexicons/lexicons/5. NRC-Hashtag-Emotion-Lexicon-v0.2.txt',
                                emotion='anger',
                                header_lines=35):
    """Sum NRC Hashtag Emotion lexicon scores for one emotion over each sentence.

    The first ``header_lines`` lines of the file are discarded. The rest is
    read as ``emotion<TAB>word<TAB>score`` lines; lines that do not split
    into exactly three fields are ignored, as are entries for any emotion
    other than ``emotion``.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly (case-sensitive).
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.
    emotion : str, optional
        Emotion label whose entries are kept. Defaults to ``'anger'``.
    header_lines : int, optional
        Number of leading lines to skip. Defaults to 35.

    Returns
    -------
    numpy.ndarray
        One single-element ``[total_score]`` row per sentence.
    """
    score_by_word = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n')[header_lines:]:
            fields = line.split('\t')
            if len(fields) == 3 and fields[0] == emotion:
                score_by_word[fields[1]] = fields[2]

    rows = []
    for sent in sentences:
        total_score = 0
        for word in sent.split(' '):
            if word in score_by_word:
                total_score += float(score_by_word[word])
        rows.append([total_score])

    return np.array(rows)


In [None]:
def afinn_emoticon(sentences, lexicon_path='/content/lexicons/lexicons/9. AFINN-emoticon-8.txt'):
    """Sum AFINN emoticon lexicon scores over each sentence.

    The lexicon is read as ``token<TAB>score`` lines; lines that do not
    split into exactly two fields are ignored.

    Parameters
    ----------
    sentences : iterable of str
        Texts to score. Each text is split on single spaces and the tokens
        are matched against the lexicon exactly.
    lexicon_path : str, optional
        Location of the lexicon file. Defaults to the path used in this
        notebook.

    Returns
    -------
    numpy.ndarray
        One single-element ``[total_score]`` row per sentence.
    """
    score_by_token = {}
    # The context manager guarantees the file handle is released.
    with open(lexicon_path) as lexicon_file:
        for line in lexicon_file.read().split('\n'):
            fields = line.split('\t')
            if len(fields) == 2:
                score_by_token[fields[0]] = fields[1]

    rows = []
    for sent in sentences:
        total_score = 0
        for word in sent.split(' '):
            if word in score_by_token:
                total_score += float(score_by_token[word])
        rows.append([total_score])

    return np.array(rows)

In [None]:
# negation_features_ex(tokenised_data)
_NEGATION_KEYWORDS = frozenset({
    "don't", "never", "nothing", "nowhere", "noone", "none", "not",
    "hasn't", "hadn't", "can't", "couldn't", "shouldn't", "won't",
    "wouldn't", "doesn't", "didn't", "isn't", "aren't", "ain't",
})
# Punctuation that ends a clause and therefore a negation scope.
_CLAUSE_BOUNDARY = re.compile(r'([.:;!?])')
# Words that receive the _NEG suffix inside a negation scope.
_NEGATABLE_WORD = re.compile(r"\b([A-Za-z']+)\b")


def _tag_negated_clauses(sentence):
    """Append ``_NEG`` to each word between a negation keyword and the end of its clause."""
    # With a capturing group, re.split alternates clause text (even indices)
    # with the punctuation mark that ended it (odd indices).
    pieces = _CLAUSE_BOUNDARY.split(sentence)
    for index in range(0, len(pieces), 2):
        clause = pieces[index]
        for token in re.finditer(r'[^, ]+', clause):
            if token.group() in _NEGATION_KEYWORDS:
                head = clause[:token.end()]
                tail = clause[token.end():]
                pieces[index] = head + _NEGATABLE_WORD.sub(r'\1_NEG', tail)
                # Everything after the first keyword is already tagged.
                break
    return ''.join(pieces)


def negation_features_ex(tokenised_data):
    """Mark negated contexts and count the marked words per sentence.

    A negated context starts right after a negation keyword (matched
    case-sensitively as a whole comma/space-delimited token) and runs to the
    next ``. : ; ! ?`` or to the end of the sentence. Every alphabetic word
    inside that context gets a ``_NEG`` suffix. Each clause is handled
    independently, so several negations in one sentence are all tagged and
    the surrounding text is left untouched.

    Parameters
    ----------
    tokenised_data : iterable of str
        Sentences to process.

    Returns
    -------
    tuple
        ``(tagged_sentences, counts)`` where ``tagged_sentences`` is a list
        of the rewritten strings and ``counts`` is a ``numpy.ndarray`` with
        one single-element ``[number_of__NEG_markers]`` row per sentence.
    """
    negation_features = [_tag_negated_clauses(sentence) for sentence in tokenised_data]
    negation_feature_count = [[tagged.count('_NEG')] for tagged in negation_features]
    return negation_features, np.array(negation_feature_count)

In [None]:
def to_sentences(filename):
    """Split a tab-separated ratings file into a tweet-text file and a score list.

    Every line of ``filename`` with more than two tab-separated fields
    contributes its last field as the score and its third-from-last field as
    the tweet text. The tweets are written, one per line, to
    ``'text_' + filename``.

    Parameters
    ----------
    filename : str
        Input file, e.g. ``"joy-ratings-0to1train.txt"``. The output name is
        built by plain string concatenation, so ``filename`` is expected to
        be a bare file name in the working directory.

    Returns
    -------
    list of str
        The score field of every accepted line, in file order.
    """
    with open(filename) as source_file:
        lines = source_file.read().split('\n')

    score_arr = []
    tweets = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) > 2:
            score_arr.append(fields[-1])
            tweets.append(fields[-3] + '\n')

    # Closing the handle here flushes the text before an external tool reads it.
    with open('text_' + filename, 'w') as text_file:
        text_file.write(''.join(tweets))
    return score_arr

In [None]:
def pos_tagged_tokenised(filename):
    """Read the first column of every four-column line in a tab-separated file.

    Parameters
    ----------
    filename : str
        Path to the tagger output. Lines that do not split into exactly four
        tab-separated fields are ignored.

    Returns
    -------
    numpy.ndarray
        Array of strings holding the first field of each accepted line, in
        file order.
    """
    # The context manager guarantees the file handle is released.
    with open(filename, 'r') as tagged_file:
        lines = tagged_file.read().split('\n')

    tokenised_data = []
    for line in lines:
        fields = line.split('\t')
        if len(fields) == 4:
            tokenised_data.append(fields[0])
    return np.array(tokenised_data)

In [None]:
# Extract tweet text and scores from the training file, then run the ARK
# tweet tagger over the extracted text (written by to_sentences as 'text_<name>').
train_score = to_sentences('anger_train.txt')
! java -Xmx500m -jar ark-tweet-nlp-0.3.2.jar text_anger_train.txt > text_anger_train_pos_tagged.txt

# Same for the test file. Note that the tagged output name has no 'text_' prefix here.
test_score = to_sentences('anger_test.txt')
! java -Xmx500m -jar ark-tweet-nlp-0.3.2.jar text_anger_test.txt > anger_test_pos_tagged.txt

Detected text input format
Tokenized and tagged 857 tweets (14774 tokens) in 3.0 seconds: 286.0 tweets/sec, 4929.6 tokens/sec
Detected text input format
Tokenized and tagged 760 tweets (13677 tokens) in 2.8 seconds: 273.9 tweets/sec, 4928.6 tokens/sec


In [None]:
# Load the first column of the tagger output for both splits
# (file names match the redirect targets of the tagging cell).
train_tokenised_data = pos_tagged_tokenised('text_anger_train_pos_tagged.txt')
test_tokeinsed_data = pos_tagged_tokenised('anger_test_pos_tagged.txt')

In [None]:
# Unpack the lexicon archive into the current working directory.
# The lexicon loaders read from '/content/lexicons/lexicons/...', so this is
# presumably run with /content as the working directory - confirm.
import zipfile
filename = "lexicons.zip"
with zipfile.ZipFile(filename, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Scores were read as strings; convert them to floats, then hold out 20% of
# the training tweets (fixed seed) for validation.
train_score = np.array(train_score, 'float')
X_train, X_test, y_train, y_test = train_test_split(train_tokenised_data, train_score, test_size=0.2, random_state=42)
# Sanity check: sizes of the feature/target pairs must agree.
print(len(X_train), len(y_train), len(X_test), len(y_test))

685 685 172 172


In [None]:
def get_concatenated_features(tokeinsed_data):
    """Build the lexicon-based feature matrix for a collection of tweets.

    Only the extractors whose output is part of the returned matrix are run.
    The other extractors defined in this notebook (Sentiment140, SentiWordNet,
    NRC Hashtag Sentiment, NRC Hashtag Emotion and the negation counter) are
    not part of the current feature set and are therefore not evaluated.

    Parameters
    ----------
    tokeinsed_data : sequence of str
        Tweets to featurise.

    Returns
    -------
    numpy.ndarray
        One row per tweet; columns are, in order: VADER scores, MPQA polar
        word counts, Bing Liu polar word counts, AFINN score, NRC
        Word-Emotion score, NRC-10 Expanded score, AFINN emoticon score.
    """
    feature_blocks = (
        # VADER features
        feature_extractor_vedar(tokeinsed_data),
        # Polar word counts: MPQA subjectivity lexicon and Bing Liu lexicon
        mpqalexicon(tokeinsed_data),
        bingliu_lexicon(tokeinsed_data),
        # Aggregate polarity score: AFINN
        affin_features(tokeinsed_data),
        # Emotion word count: NRC Word-Emotion Association Lexicon
        nrc_word_emotion_lexicon(tokeinsed_data),
        # Aggregate emotion score: NRC-10 Expanded lexicon
        nrc_10_expanded(tokeinsed_data),
        # Emoticon score: AFINN emoticon lexicon
        afinn_emoticon(tokeinsed_data),
    )
    return np.concatenate(feature_blocks, axis=1)

**Training data: unigram and bigram features**

In [None]:
# Lexicon-based features for the training split.
rest_features = get_concatenated_features(X_train)
# Unigram bag-of-words; the vocabulary is learned from the training split only.
cv = CountVectorizer(analyzer='word', ngram_range=(1,1), 
                        stop_words = nltk.corpus.stopwords.words('english'))
# 1. uni ngram features
ngram_features = cv.fit_transform(X_train).toarray()

# Bigram bag-of-words, fitted the same way.
cv2 = CountVectorizer(analyzer='word', ngram_range=(2,2), 
                    stop_words = nltk.corpus.stopwords.words('english'))
# 1. bi ngram features
ngram_features_bi = cv2.fit_transform(X_train).toarray()

# Final training matrix: unigram counts | bigram counts | lexicon features.
train_features = np.concatenate((ngram_features, ngram_features_bi, rest_features), axis = 1)

b'Skipping line 42: expected 1 fields, saw 2\nSkipping line 49: expected 1 fields, saw 2\nSkipping line 59: expected 1 fields, saw 2\nSkipping line 69: expected 1 fields, saw 2\nSkipping line 3301: expected 1 fields, saw 2\nSkipping line 3400: expected 1 fields, saw 2\nSkipping line 3401: expected 1 fields, saw 2\nSkipping line 3402: expected 1 fields, saw 3\nSkipping line 3403: expected 1 fields, saw 4\nSkipping line 3404: expected 1 fields, saw 5\nSkipping line 3405: expected 1 fields, saw 6\nSkipping line 3406: expected 1 fields, saw 7\nSkipping line 3407: expected 1 fields, saw 2\nSkipping line 3408: expected 1 fields, saw 2\nSkipping line 3409: expected 1 fields, saw 2\nSkipping line 3410: expected 1 fields, saw 2\nSkipping line 3411: expected 1 fields, saw 2\nSkipping line 3412: expected 1 fields, saw 2\nSkipping line 3422: expected 1 fields, saw 2\nSkipping line 3637: expected 1 fields, saw 2\nSkipping line 3638: expected 1 fields, saw 3\nSkipping line 3639: expected 1 fields, s

**Test data: unigram and bigram features**

In [None]:
# Lexicon-based features for the held-out split (rebinds rest_features).
rest_features = get_concatenated_features(X_test)
# Reuse the vectorizers fitted on the training split (transform only).
# 1. uni ngram features
ngram_features = cv.transform(X_test).toarray()
# 1. bi ngram features
ngram_features_bi = cv2.transform(X_test).toarray()
# Same column layout as train_features.
test_features = np.concatenate((ngram_features, ngram_features_bi, rest_features), axis = 1)

b'Skipping line 42: expected 1 fields, saw 2\nSkipping line 49: expected 1 fields, saw 2\nSkipping line 59: expected 1 fields, saw 2\nSkipping line 69: expected 1 fields, saw 2\nSkipping line 3301: expected 1 fields, saw 2\nSkipping line 3400: expected 1 fields, saw 2\nSkipping line 3401: expected 1 fields, saw 2\nSkipping line 3402: expected 1 fields, saw 3\nSkipping line 3403: expected 1 fields, saw 4\nSkipping line 3404: expected 1 fields, saw 5\nSkipping line 3405: expected 1 fields, saw 6\nSkipping line 3406: expected 1 fields, saw 7\nSkipping line 3407: expected 1 fields, saw 2\nSkipping line 3408: expected 1 fields, saw 2\nSkipping line 3409: expected 1 fields, saw 2\nSkipping line 3410: expected 1 fields, saw 2\nSkipping line 3411: expected 1 fields, saw 2\nSkipping line 3412: expected 1 fields, saw 2\nSkipping line 3422: expected 1 fields, saw 2\nSkipping line 3637: expected 1 fields, saw 2\nSkipping line 3638: expected 1 fields, saw 3\nSkipping line 3639: expected 1 fields, s

**SVM Regression Training**

In [None]:
from sklearn.svm import SVR

# Linear-kernel support vector regressor fitted on the training split.
# NOTE(review): per the scikit-learn docs, `degree` and `gamma` only affect
# non-linear kernels, so they should have no effect here - confirm.
svm_model = SVR(kernel='linear', 
                degree = 3, max_iter = -1, gamma = 'auto')
svm_model.fit(train_features, y_train)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
    kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [None]:
# R^2 values noted down while experimenting with the feature set
# (presumably recorded as each feature group was added, in this order - TODO confirm):
# mpqla : 0.3542180321668468
# bingliu : 0.3620280056526838
# affin : 0.37217170874188976
# nrc word emotion : 0.37249692201319107
# nrc 10 expanded : 0.37656293113960404
# affin 0.37688994220118466
# (the last value equals the score produced by this cell; the label presumably
# refers to the AFINN emoticon features - confirm)
y_pred_svm = svm_model.predict(test_features)
r2_score(y_test, y_pred_svm)

0.37688994220118466

**Decision Tree Regression Training**

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth-limited regression tree (fixed seed) fitted on the training split.
decision_tree_model = DecisionTreeRegressor(max_depth= 5, 
                                            random_state= 48)
decision_tree_model.fit(train_features, y_train)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=5,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=48, splitter='best')

In [None]:
# R^2 of the decision tree on the held-out split.
y_pred_dtree = decision_tree_model.predict(test_features)
r2_score(y_test, y_pred_dtree)

0.22758606099567713

**MLP Regression Training**

In [None]:
from sklearn.neural_network import MLPRegressor

# SGD-trained multi-layer perceptron with early stopping on a 20% internal
# validation fraction. warm_start is left at its default (False) so that every
# call to fit() starts from freshly initialised weights; the model is fitted
# again later on a feature matrix built from a different vocabulary, and
# reusing weights shaped for the old matrix would not be valid there.
mlpRegressor_model = MLPRegressor(random_state=1, max_iter=350,
                                  early_stopping=True, learning_rate_init=0.1,
                                  solver='sgd', validation_fraction=0.2)
mlpRegressor_model.fit(train_features, y_train)

In [None]:
# R^2 of the MLP on the held-out split.
y_pred_mlp = mlpRegressor_model.predict(test_features)
r2_score(y_test, y_pred_mlp)
# Value observed on a recorded run:
# 0.3530634846189322

0.3530634846189322

**Submission Code**

In [None]:
def make_submission(filename, y_pred, source_filename='anger_test.txt'):
    """Write a submission file by replacing the last column of the test file.

    Every line of ``source_filename`` that has exactly four tab-separated
    fields gets its last field replaced by the next unused entry of
    ``y_pred``. All other lines are copied through unchanged and do not
    consume a prediction, so predictions stay aligned with the rows they
    belong to. Each input line (including a trailing empty one) is written
    followed by a newline.

    Parameters
    ----------
    filename : str
        Path of the submission file to create.
    y_pred : sequence
        Predictions, expected to hold one value per four-field line of the
        source file, in file order.
    source_filename : str, optional
        File whose rows are copied. Defaults to ``'anger_test.txt'``.

    Raises
    ------
    IndexError
        If ``y_pred`` has fewer entries than there are four-field lines.
    """
    with open(source_filename) as source_file:
        rows = source_file.read().split('\n')

    out_lines = []
    next_pred = 0
    for row in rows:
        fields = row.split('\t')
        if len(fields) == 4:
            fields[-1] = str(y_pred[next_pred])
            # Advance only when a prediction was actually used.
            next_pred += 1
        out_lines.append('\t'.join(fields) + '\n')

    # Closing the handle here flushes the file before an external tool reads it.
    with open(filename, 'w') as submission_file:
        submission_file.write(''.join(out_lines))

In [None]:
# Rebuild the feature matrix from the complete training set for the final
# submission. This rebinds cv, cv2, rest_features and train_features.
rest_features = get_concatenated_features(train_tokenised_data)
cv = CountVectorizer(analyzer='word', ngram_range=(1,1), 
                        stop_words = nltk.corpus.stopwords.words('english'))
# 1. uni ngram features
ngram_features = cv.fit_transform(train_tokenised_data).toarray()

cv2 = CountVectorizer(analyzer='word', ngram_range=(2,2), 
                    stop_words = nltk.corpus.stopwords.words('english'))
# 1. bi ngram features
ngram_features_bi = cv2.fit_transform(train_tokenised_data).toarray()

# Column layout: unigram counts | bigram counts | lexicon features.
train_features = np.concatenate( (ngram_features, ngram_features_bi, rest_features), axis = 1)

b'Skipping line 42: expected 1 fields, saw 2\nSkipping line 49: expected 1 fields, saw 2\nSkipping line 59: expected 1 fields, saw 2\nSkipping line 69: expected 1 fields, saw 2\nSkipping line 3301: expected 1 fields, saw 2\nSkipping line 3400: expected 1 fields, saw 2\nSkipping line 3401: expected 1 fields, saw 2\nSkipping line 3402: expected 1 fields, saw 3\nSkipping line 3403: expected 1 fields, saw 4\nSkipping line 3404: expected 1 fields, saw 5\nSkipping line 3405: expected 1 fields, saw 6\nSkipping line 3406: expected 1 fields, saw 7\nSkipping line 3407: expected 1 fields, saw 2\nSkipping line 3408: expected 1 fields, saw 2\nSkipping line 3409: expected 1 fields, saw 2\nSkipping line 3410: expected 1 fields, saw 2\nSkipping line 3411: expected 1 fields, saw 2\nSkipping line 3412: expected 1 fields, saw 2\nSkipping line 3422: expected 1 fields, saw 2\nSkipping line 3637: expected 1 fields, saw 2\nSkipping line 3638: expected 1 fields, saw 3\nSkipping line 3639: expected 1 fields, s

In [None]:
# Features for the official test tweets, using the vectorizers fitted on the
# complete training set. This rebinds rest_features and test_features.
rest_features = get_concatenated_features(test_tokeinsed_data)
# 1. uni ngram features
ngram_features = cv.transform(test_tokeinsed_data).toarray()
# 1. bi ngram features
ngram_features_bi = cv2.transform(test_tokeinsed_data).toarray()
test_features = np.concatenate( (ngram_features, ngram_features_bi, rest_features), axis = 1)

b'Skipping line 42: expected 1 fields, saw 2\nSkipping line 49: expected 1 fields, saw 2\nSkipping line 59: expected 1 fields, saw 2\nSkipping line 69: expected 1 fields, saw 2\nSkipping line 3301: expected 1 fields, saw 2\nSkipping line 3400: expected 1 fields, saw 2\nSkipping line 3401: expected 1 fields, saw 2\nSkipping line 3402: expected 1 fields, saw 3\nSkipping line 3403: expected 1 fields, saw 4\nSkipping line 3404: expected 1 fields, saw 5\nSkipping line 3405: expected 1 fields, saw 6\nSkipping line 3406: expected 1 fields, saw 7\nSkipping line 3407: expected 1 fields, saw 2\nSkipping line 3408: expected 1 fields, saw 2\nSkipping line 3409: expected 1 fields, saw 2\nSkipping line 3410: expected 1 fields, saw 2\nSkipping line 3411: expected 1 fields, saw 2\nSkipping line 3412: expected 1 fields, saw 2\nSkipping line 3422: expected 1 fields, saw 2\nSkipping line 3637: expected 1 fields, saw 2\nSkipping line 3638: expected 1 fields, saw 3\nSkipping line 3639: expected 1 fields, s

In [None]:
# Refit the SVR on the complete training set and predict the official test tweets.
svm_model.fit(train_features, train_score)
submission_svm_pred = svm_model.predict(test_features)

In [None]:
# Refit the decision tree on the complete training set and predict the official test tweets.
decision_tree_model.fit(train_features, train_score)
submission_dtree_pred = decision_tree_model.predict(test_features)

In [None]:
# Refit the MLP on the complete training set and predict the official test tweets.
mlpRegressor_model.fit(train_features, train_score)
submission_mlp_pred = mlpRegressor_model.predict(test_features)

In [None]:
# Fetch the EmoInt repository; its evaluate.py script is used below to score the submissions.
! git clone https://github.com/felipebravom/EmoInt.git

Cloning into 'EmoInt'...
remote: Enumerating objects: 264, done.[K
remote: Total 264 (delta 0), reused 0 (delta 0), pack-reused 264[K
Receiving objects: 100% (264/264), 1016.20 KiB | 2.00 MiB/s, done.
Resolving deltas: 100% (136/136), done.


In [None]:
# Write one submission file per model.
make_submission('svm_model_submission.txt', submission_svm_pred)
make_submission('decision_tree_model_submission.txt', submission_dtree_pred)
make_submission('mlp_submission.txt', submission_mlp_pred)

In [None]:
# Score the SVR submission with the EmoInt evaluation script against
# /content/anger-pred.txt (presumably the gold-labelled test file - confirm).
! python2 /content/EmoInt/evaluate.py 1 '/content/svm_model_submission.txt' '/content/anger-pred.txt'

Pearson correlation between /content/svm_model_submission.txt and /content/anger-pred.txt:	0.7852915877156502
Spearman correlation between /content/svm_model_submission.txt and /content/anger-pred.txt:	0.7748515715394922
Pearson correlation for gold scores in range 0.5-1 between /content/svm_model_submission.txt and /content/anger-pred.txt:	0.6436375012978239
Spearman correlation for gold scores in range 0.5-1 between /content/svm_model_submission.txt and /content/anger-pred.txt:	0.5805008849820491

Average Pearson correlation:	0.7852915877156502
Average Spearman correlation:	0.7748515715394922
Average Pearson correlation for gold scores in range 0.5-1:	0.6436375012978239
Average Spearman correlationfor gold scores in range 0.5-1:	0.5805008849820491


In [None]:
# Score the decision-tree submission with the EmoInt evaluation script against
# /content/anger-pred.txt (presumably the gold-labelled test file - confirm).
! python2 /content/EmoInt/evaluate.py 1 '/content/decision_tree_model_submission.txt' '/content/anger-pred.txt'

Pearson correlation between /content/decision_tree_model_submission.txt and /content/anger-pred.txt:	0.7029358766445881
Spearman correlation between /content/decision_tree_model_submission.txt and /content/anger-pred.txt:	0.7314147810017549
Pearson correlation for gold scores in range 0.5-1 between /content/decision_tree_model_submission.txt and /content/anger-pred.txt:	0.4950080818920411
Spearman correlation for gold scores in range 0.5-1 between /content/decision_tree_model_submission.txt and /content/anger-pred.txt:	0.45583600381925915

Average Pearson correlation:	0.7029358766445881
Average Spearman correlation:	0.7314147810017549
Average Pearson correlation for gold scores in range 0.5-1:	0.4950080818920411
Average Spearman correlationfor gold scores in range 0.5-1:	0.45583600381925915


In [None]:
# Score the MLP submission with the EmoInt evaluation script against
# /content/anger-pred.txt (presumably the gold-labelled test file - confirm).
! python2 /content/EmoInt/evaluate.py 1 '/content/mlp_submission.txt' '/content/anger-pred.txt'

Pearson correlation between /content/mlp_submission.txt and /content/anger-pred.txt:	0.7529335653765463
Spearman correlation between /content/mlp_submission.txt and /content/anger-pred.txt:	0.7382077278755588
Pearson correlation for gold scores in range 0.5-1 between /content/mlp_submission.txt and /content/anger-pred.txt:	0.660883611535093
Spearman correlation for gold scores in range 0.5-1 between /content/mlp_submission.txt and /content/anger-pred.txt:	0.5546694993659812

Average Pearson correlation:	0.7529335653765463
Average Spearman correlation:	0.7382077278755588
Average Pearson correlation for gold scores in range 0.5-1:	0.660883611535093
Average Spearman correlationfor gold scores in range 0.5-1:	0.5546694993659812
