In [4]:
from nltk.tokenize import TweetTokenizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec, FastText
from multiprocessing import cpu_count

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import numpy as np

import plotly.graph_objects as go
import plotly.express as px

from src.preprocessing import load_semeval_taskb

In [5]:
# Single NLTK TweetTokenizer instance, built with its default options and
# reused for every tweet when the `tweet_tokens` column is created.
ttokenizer = TweetTokenizer()

In [6]:
# Load the 'full' dataset via load_semeval_taskb and attach two lower-cased
# token columns, one per tokenizer, so the two tokenizations can be compared
# side by side in the preview.
full = load_semeval_taskb('full')
full['nltk_tokens'] = full['text'].map(
    lambda tweet: [token.strip().lower() for token in word_tokenize(tweet)]
)
full['tweet_tokens'] = full['text'].map(
    lambda tweet: [token.strip().lower() for token in ttokenizer.tokenize(tweet)]
)
full.head()

Unnamed: 0,example_id,label_id,text,split,nltk_tokens,tweet_tokens
0,train_1,1,Sweet United Nations video. Just in time for C...,train,"[sweet, united, nations, video, ., just, in, t...","[sweet, united, nations, video, ., just, in, t..."
1,train_2,1,@user We are rumored to have talked to Erv's a...,train,"[@, user, we, are, rumored, to, have, talked, ...","[@user, we, are, rumored, to, have, talked, to..."
2,train_3,1,Hey there! Nice to see you Minnesota/ND Winter...,train,"[hey, there, !, nice, to, see, you, minnesota/...","[hey, there, !, nice, to, see, you, minnesota,..."
3,train_4,0,3 episodes left I'm dying over here,train,"[3, episodes, left, i, 'm, dying, over, here]","[3, episodes, left, i'm, dying, over, here]"
4,train_5,2,I can't breathe! was chosen as the most notabl...,train,"[i, ca, n't, breathe, !, was, chosen, as, the,...","[i, can't, breathe, !, was, chosen, as, the, m..."


In [7]:
# Train FastText embeddings on the TweetTokenizer tokens and keep only the
# keyed vectors. Despite the name `w2v`, these are FastText vectors.
w2v = FastText(
    sentences=full['tweet_tokens'],
    vector_size=100,
    window=5,
    min_count=3,
    workers=cpu_count(),
).wv

In [8]:
# Represent each tweet by the mean of its token vectors, then project those
# tweet embeddings to 2-D with PCA for plotting.
#
# The tokens are taken from `tweet_tokens`, i.e. the same tokenization the
# embedding model was trained on. `nltk_tokens` splits handles and
# contractions differently (e.g. '@', 'user' instead of '@user'; 'ca', "n't"
# instead of "can't"), so its pieces do not match the trained vocabulary.
tweet_vectors = np.array(
    [w2v.get_mean_vector(tokens) for tokens in full['tweet_tokens']]
)

# random_state pins the projection in case scikit-learn selects a randomized
# SVD solver for this input size.
pca = PCA(n_components=2, random_state=0)
X_pca = pca.fit_transform(tweet_vectors)

In [9]:
# Scatter plot of the 2-D PCA projection: one marker per tweet, coloured by
# its label id using the qualitative Plotly palette.
#
# The colours are explicit colour strings, so no colorscale / colorbar is
# configured: Plotly only applies `colorscale` and `showscale` when
# `marker.color` is a numeric array. The trace uses plain 'markers' mode
# because no per-point `text` is supplied.

# Hover label: example id and label on the first line, tweet text below.
hover_labels = [
    f"{example_id} (label {label_id})<br>{text}"
    for example_id, text, label_id in zip(full.example_id, full.text, full.label_id)
]

fig = go.Figure(data=go.Scatter(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hovertext=hover_labels,
    mode='markers',
    marker=dict(
        size=4,
        color=[px.colors.qualitative.Plotly[label] for label in full.label_id],
    ),
))
# A figure should stand alone when the notebook is skimmed.
fig.update_layout(
    title='PCA projection of mean tweet embeddings, coloured by label',
    xaxis_title='PC 1',
    yaxis_title='PC 2',
)
fig.show()

In [10]:
# Flag tweets whose TweetTokenizer tokens include 'love', then show the share
# of flagged tweets within each label.
full['love'] = full['tweet_tokens'].map(lambda tokens: 'love' in tokens)
full['love'].groupby(full['label_id']).mean()

label_id
0    0.026371
1    0.113769
2    0.012469
3    0.007576
Name: love, dtype: float64

In [11]:
# Flag tweets whose TweetTokenizer tokens include 'fun', then show the share
# of flagged tweets within each label.
full['fun'] = full['tweet_tokens'].map(lambda tokens: 'fun' in tokens)
full['fun'].groupby(full['label_id']).mean()

label_id
0    0.005442
1    0.045249
2    0.000000
3    0.003788
Name: fun, dtype: float64

In [12]:
# Flag tweets whose TweetTokenizer tokens include 'christmas', then show the
# share of flagged tweets within each label.
full['christmas'] = full['tweet_tokens'].map(lambda tokens: 'christmas' in tokens)
full['christmas'].groupby(full['label_id']).mean()

label_id
0    0.021766
1    0.032967
2    0.019950
3    0.007576
Name: christmas, dtype: float64