In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import spacy

In [3]:
test_data = pd.read_csv('test-1.txt', sep='\t')
train_data = pd.read_csv('train-3.txt', sep='\t')

In [4]:
test_data.head(3)

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2018-En-01964,Gm and have a #Tuesday!,valence,0: neutral or mixed emotional state can be inf...
1,2018-En-01539,@realDonaldTrump But you have a lot of time fo...,valence,0: neutral or mixed emotional state can be inf...
2,2018-En-04235,I graduated yesterday and already had 8 family...,valence,0: neutral or mixed emotional state can be inf...


In [5]:
train_data.head(3)

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Class
0,2017-En-30153,@liamch88 yeah! :) playing well,valence,0: neutral or mixed emotional state can be inf...
1,2017-En-40929,At least I don't have a guy trying to discoura...,valence,0: neutral or mixed emotional state can be inf...
2,2017-En-22012,UPLIFT: If you're still discouraged it means y...,valence,0: neutral or mixed emotional state can be inf...


In [6]:
train_data.columns

Index(['ID', 'Tweet', 'Affect Dimension', 'Intensity Class'], dtype='object')

In [7]:
sentiment_mapping = {"0: neutral or mixed emotional state can be inferred": 'neutral',
                    "1: slightly positive emotional state can be inferred": 'neutral',
                    "2: moderately positive emotional state can be inferred": 'positive',
                    "3: very positive emotional state can be inferred": 'positive',
                    "-1: slightly negative emotional state can be inferred": 'neutral',
                    "-2: moderately negative emotional state can be inferred": 'negative',
                    "-3: very negative emotional state can be inferred": 'negative'
                }

In [8]:
train_data['Intensity'] = train_data['Intensity Class'].replace(sentiment_mapping)

In [9]:
test_data['Intensity'] = test_data['Intensity Class'].replace(sentiment_mapping)

In [10]:
train_data.drop(columns=['Intensity Class','ID','Affect Dimension'], inplace=True)
test_data.drop(columns=['Intensity Class','ID','Affect Dimension'], inplace=True)

In [11]:
train_data.head(10)

Unnamed: 0,Tweet,Intensity
0,@liamch88 yeah! :) playing well,neutral
1,At least I don't have a guy trying to discoura...,neutral
2,UPLIFT: If you're still discouraged it means y...,neutral
3,"...at your age, the heyday in the blood is tam...",neutral
4,i was so embarrassed when she saw us i was lik...,negative
5,Really planned on making videos this week. The...,negative
6,I hate having ideas but being too afraid to sh...,negative
7,"At the regular cheerfulness of any emotion, he...",neutral
8,A pessimist sees the difficulty in every oppor...,neutral
9,Just because I'm hurting \nDoesn't mean I'm hu...,negative


In [12]:
train_data['Tweet'] = train_data['Tweet'].str.lower()
test_data['Tweet'] = test_data['Tweet'].str.lower()

In [13]:
df_all = pd.concat([train_data, test_data], axis=0)

In [14]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

df_all['tokens'] = df_all['Tweet'].apply(word_tokenize)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shivamsinghrawat/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
df_all.head(10)

Unnamed: 0,Tweet,Intensity,tokens
0,@liamch88 yeah! :) playing well,neutral,"[@, liamch88, yeah, !, :, ), playing, well]"
1,at least i don't have a guy trying to discoura...,neutral,"[at, least, i, do, n't, have, a, guy, trying, ..."
2,uplift: if you're still discouraged it means y...,neutral,"[uplift, :, if, you, 're, still, discouraged, ..."
3,"...at your age, the heyday in the blood is tam...",neutral,"[..., at, your, age, ,, the, heyday, in, the, ..."
4,i was so embarrassed when she saw us i was lik...,negative,"[i, was, so, embarrassed, when, she, saw, us, ..."
5,really planned on making videos this week. the...,negative,"[really, planned, on, making, videos, this, we..."
6,i hate having ideas but being too afraid to sh...,negative,"[i, hate, having, ideas, but, being, too, afra..."
7,"at the regular cheerfulness of any emotion, he...",neutral,"[at, the, regular, cheerfulness, of, any, emot..."
8,a pessimist sees the difficulty in every oppor...,neutral,"[a, pessimist, sees, the, difficulty, in, ever..."
9,just because i'm hurting \ndoesn't mean i'm hu...,negative,"[just, because, i, 'm, hurting, \ndoes, n't, m..."


In [16]:
from gensim.models import Word2Vec

sentences = df_all['tokens'].tolist()

In [18]:
cbow_model = Word2Vec(
    sentences=sentences,
    vector_size=100,    
    window=5,          
    min_count=5,       
    sg=0,              
    workers=4 
)

skipgram_model = Word2Vec(
    sentences=sentences,
    vector_size=100,
    window=5,
    min_count=5,
    sg=1,              # Skip-gram (1)
    workers=4
)

In [19]:
# 5. Save models
cbow_model.save("word2vec_cbow.model")
skipgram_model.save("word2vec_skipgram.model")

In [20]:
test_words = ["people", "smile", "amazing", "time"]

def get_similar_words(model, word, n=20):
    try:
        return model.wv.most_similar(word, topn=n)
    except KeyError:
        return f"{word} not in vocabulary"

# CBOW Results
print("CBOW Similar Words:")
for word in test_words:
    print(f"\n{word}:")
    print(get_similar_words(cbow_model, word))

# Skip-gram Results
print("\nSkip-gram Similar Words:")
for word in test_words:
    print(f"\n{word}:")
    print(get_similar_words(skipgram_model, word))

CBOW Similar Words:

people:
[('and', 0.9996721744537354), ('she', 0.9996697306632996), ('for', 0.9996540546417236), ('up', 0.9996422529220581), ('to', 0.9996315240859985), ('an', 0.9996191263198853), ('is', 0.9996159672737122), ('by', 0.9996126294136047), ('my', 0.9996110200881958), ('the', 0.9996082186698914), ('are', 0.999606192111969), ('about', 0.999605119228363), ('has', 0.9996013641357422), ('when', 0.9995970129966736), ('they', 0.9995940923690796), ('her', 0.9995924830436707), ('can', 0.999592125415802), ('a', 0.9995900392532349), ('out', 0.9995893836021423), ('being', 0.9995850324630737)]

smile:
[('this', 0.9991061091423035), ('.', 0.9990772008895874), ('-', 0.9990461468696594), ('or', 0.9990441203117371), ("'", 0.999039888381958), ('?', 0.9990136027336121), ('happy', 0.9990071654319763), ('on', 0.9989997744560242), ('of', 0.9989823698997498), ('for', 0.9989822506904602), ('their', 0.9989784359931946), ('is', 0.9989719986915588), ('them', 0.9989688396453857), ('your', 0.99896

## Task 2 ##

In [17]:
from gensim.models import KeyedVectors
glove = KeyedVectors.load_word2vec_format("glove.6B.100d.100K-1.w2v.txt", binary=False)

In [20]:
test_words = ["people", "smile", "amazing", "time"]
for word in test_words:
    print(f"\n{word}:")
    print(glove.most_similar(word, topn=20))


people:
[('others', 0.8318450450897217), ('those', 0.8050370812416077), ('many', 0.7967616319656372), ('some', 0.7750677466392517), ('they', 0.7717718482017517), ('residents', 0.7647792100906372), ('them', 0.7625812888145447), ('than', 0.7592476010322571), ('all', 0.758562445640564), ('families', 0.7561567425727844), ('say', 0.755620002746582), ('there', 0.7546865344047546), ('lives', 0.7536002397537231), ('children', 0.753533124923706), ('citizens', 0.7532268166542053), ('least', 0.7485319972038269), ('more', 0.7447736263275146), ('so', 0.7420145273208618), ('have', 0.7417193651199341), ('americans', 0.7411874532699585)]

smile:
[('grin', 0.8655877709388733), ('smiles', 0.7909191846847534), ('eyes', 0.7565146088600159), ('smiling', 0.7020254135131836), ('laugh', 0.7009447813034058), ('tears', 0.6666248440742493), ('smirk', 0.6415873765945435), ('sight', 0.635871410369873), ('smiled', 0.6354667544364929), ('hug', 0.6348837614059448), ('eyed', 0.6320615410804749), ('touch', 0.611414670

In [19]:
glove.get_vector('king')

array([-0.32307 , -0.87616 ,  0.21977 ,  0.25268 ,  0.22976 ,  0.7388  ,
       -0.37954 , -0.35307 , -0.84369 , -1.1113  , -0.30266 ,  0.33178 ,
       -0.25113 ,  0.30448 , -0.077491, -0.89815 ,  0.092496, -1.1407  ,
       -0.58324 ,  0.66869 , -0.23122 , -0.95855 ,  0.28262 , -0.078848,
        0.75315 ,  0.26584 ,  0.3422  , -0.33949 ,  0.95608 ,  0.065641,
        0.45747 ,  0.39835 ,  0.57965 ,  0.39267 , -0.21851 ,  0.58795 ,
       -0.55999 ,  0.63368 , -0.043983, -0.68731 , -0.37841 ,  0.38026 ,
        0.61641 , -0.88269 , -0.12346 , -0.37928 , -0.38318 ,  0.23868 ,
        0.6685  , -0.43321 , -0.11065 ,  0.081723,  1.1569  ,  0.78958 ,
       -0.21223 , -2.3211  , -0.67806 ,  0.44561 ,  0.65707 ,  0.1045  ,
        0.46217 ,  0.19912 ,  0.25802 ,  0.057194,  0.53443 , -0.43133 ,
       -0.34311 ,  0.59789 , -0.58417 ,  0.068995,  0.23944 , -0.85181 ,
        0.30379 , -0.34177 , -0.25746 , -0.031101, -0.16285 ,  0.45169 ,
       -0.91627 ,  0.64521 ,  0.73281 , -0.22752 , 

## Task 3 ##

In [23]:
# Classification training input: df_train “Tweet” column;
# training labels: df_train “Intensity Class” column;
# test input: df_test “Tweet” column;
# test labels: df_test “Intensity Class” column.
# ● Use the token embeddings, which are generated by the embedding models you created
# (experiment with different kinds of embeddings you created: word2vec – skip-gram and
# word2vec-CBOW), as features to train logistic regression classifiers.
# ● Use the token embeddings generated by the pretrained GloVe model as features to train
# another logistic regression classifier.
# ● Compare the performance of different logistic regression models created in the previous two
# steps. Discuss your observations.
# ● Compare the models that use embeddings in this assignment and the models created in
# assignment 3 using count-based features. Discuss your observations.

In [22]:
X_train = train_data[['Tweet']]
y_train = train_data[['Intensity']]
X_test = test_data[['Tweet']]
y_test = test_data[['Intensity']]

print(train_data['Intensity'].value_counts())

Intensity
neutral     586
negative    378
positive    217
Name: count, dtype: int64


In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, auc


lg = LogisticRegression(random_state=0, solver='liblinear')

In [35]:
import numpy as np

def transform_data_for_word_model(model, data_df):
    v = model.wv.get_vector('king')
    X = np.zeros((len(data_df), v.shape[0]))
    n = 0
    for index, row in data_df.iterrows():
        tokens = row["Tweet"].split()
        vecs = []
        m = 0
        emptycount = 0
        for word in tokens:
            try:
                # throws KeyError if word not found
                vec = model.wv.get_vector(word)
                vecs.append(vec)
                m += 1
            except KeyError:
                pass
        if len(vecs) > 0:
            vecs = np.array(vecs)
            X[n] = vecs.mean(axis=0)
        else:
            emptycount += 1
        n+=1
    return X

In [36]:
xtrain_cbow = transform_data_for_word_model(cbow_model,X_train)
xtest_cbow = transform_data_for_word_model(cbow_model,X_test)

In [42]:
def log_reg(xtrain,y_train,xtest,y_test):
    lg = LogisticRegression(random_state=0, solver='liblinear')
    lg.fit(xtrain, y_train)
    predictions = lg.predict(xtest)

    print("Accuracy score: ", accuracy_score(y_test, predictions))
    print("Precision score: ", precision_score(y_test, predictions, average="weighted"))
    print("Recall score: ", recall_score(y_test, predictions, average = "weighted"))
    print("F1 score: ", f1_score(y_test, predictions, average = "weighted"))

In [43]:
log_reg(xtrain_cbow,y_train,xtest_cbow, y_test)

Accuracy score:  0.4791889007470651
Precision score:  0.2296220025991806
Recall score:  0.4791889007470651
F1 score:  0.3104701535864823


  y = column_or_1d(y, warn=True)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
xtrain_skip = transform_data_for_word_model(skipgram_model,X_train)
xtest_skip = transform_data_for_word_model(skipgram_model,X_test)

In [44]:
log_reg(xtrain_skip,y_train,xtest_skip, y_test)

Accuracy score:  0.4727854855923159
Precision score:  0.2593118343447321
Recall score:  0.4727854855923159
F1 score:  0.30991406707651653


  y = column_or_1d(y, warn=True)


In [49]:
def transform_data_for_glove(model, data_df):
    v = model.get_vector('king')
    X = np.zeros((len(data_df), v.shape[0]))
    n = 0
    for index, row in data_df.iterrows():
        tokens = row["Tweet"].split()
        vecs = []
        m = 0
        emptycount = 0
        for word in tokens:
            try:
                # throws KeyError if word not found
                vec = model.get_vector(word)
                vecs.append(vec)
                m += 1
            except KeyError:
                pass
        if len(vecs) > 0:
            vecs = np.array(vecs)
            X[n] = vecs.mean(axis=0)
        else:
            emptycount += 1
        n+=1
    return X


xtrain_glove = transform_data_for_glove(glove,X_train)
xtest_glove = transform_data_for_glove(glove,X_test)

In [50]:
log_reg(xtrain_glove,y_train,xtest_glove, y_test)

Accuracy score:  0.5208110992529349
Precision score:  0.5302978450289926
Recall score:  0.5208110992529349
F1 score:  0.5021804438200395


  y = column_or_1d(y, warn=True)
