In [30]:
import os
import warnings

import nltk
import re
import random
import networkx as nx
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as F

from utils import *

from tqdm import tqdm
from gensim.models.word2vec import Word2Vec
from nltk.corpus import stopwords

SEED = 777
random.seed(SEED)
np.random.seed(SEED)

warnings.filterwarnings("ignore")

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

In [None]:
# nltk.download('stopwords') 
# nltk.download('punkt') 

In [2]:
config = yaml_read('config.yaml')
config

{'dataset': './dataset.csv',
 'model': {'max_seq_len': 75,
  'n_label': 13,
  'walk_len': 10,
  'sg': 1,
  'vector_size': 10,
  'min_count': 5,
  'window': 2,
  'workers': 2,
  'seed': 0}}

In [3]:
data_dir = config['dataset']
df = pd.read_csv(data_dir)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


In [12]:
sentiment_lb = df.sentiment.value_counts().index.to_list()
sentiment_dict = {value: idx for idx, value in enumerate(sentiment_lb)}

def remove_stopwords(text, stopwords=stopwords.words("english")): 
    tokens = nltk.word_tokenize(text)
    tokens = [w.lower().strip() for w in tokens if not w.lower() in stopwords]
    return tokens

def clean_content(text):
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)     # remove urls
    text = re.sub(r'\@\w+|\#', '', text)                                        # remove at and hash tags
    text = text.encode('ascii', 'ignore').decode('ascii')                       # remove emojis
    text = re.sub(r'\s+', ' ', text).strip()                                    # remove extra spaces
    tokenized_text = remove_stopwords(text)                                     # remove stopwords
    return tokenized_text 

def process_label(sentiment, sentiment_dict=sentiment_dict): 
    label = sentiment_dict[sentiment]
    return label

In [13]:
df['clean_content'] = df.content.apply(lambda x: clean_content(x))
df['labels'] = df.sentiment.apply(lambda x: process_label(x))

df.head()

Unnamed: 0,tweet_id,sentiment,content,clean_content,labels
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...,"[know, listenin, bad, habit, earlier, started,...",9
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...,"[layin, n, bed, headache, ughhhh, ..., waitin,...",3
2,1956967696,sadness,Funeral ceremony...gloomy friday...,"[funeral, ceremony, ..., gloomy, friday, ...]",3
3,1956967789,enthusiasm,wants to hang out with friends SOON!,"[wants, hang, friends, soon, !]",10
4,1956968416,neutral,@dannycastillo We want to trade with someone w...,"[want, trade, someone, houston, tickets, ,, on...",0


In [14]:
def snt_enum(df, col='sentiment'):
    sentiment_list = df[f'{col}'].value_counts().index.to_list()
    snt2idx = {value: idx for idx, value in enumerate(sentiment_list)}
    idx2snt = {idx: value for idx, value in enumerate(sentiment_list)}
    return (snt2idx, idx2snt)

# Dataset

In [23]:
special_tokens = ['unk']

def get_vocab(content: list[list[str]], special_tokens=None): 
    vocab = []
    if special_tokens: 
        for token in special_tokens: 
            vocab.append(token)

    for row in content: 
        for word in row: 
            vocab.append(word)
        
    vocab = list(set(vocab))
    return vocab

content = df.clean_content.to_list()
vocab = get_vocab(content, special_tokens)
vocab[: 10]

['+uploaded',
 'yoyo',
 'cookoutofthecentury',
 'aot',
 'scarlett',
 'dealt',
 'now.i',
 'delete',
 'pretty',
 'stamford']

In [24]:
def get_token(vocab: list): 
    word2idx = {word: idx for idx, word in enumerate(vocab)}
    idx2word = {idx: word for word, idx in word2idx.items()}
    return (word2idx, idx2word)

word2idx, idx2word = get_token(vocab)
word2idx

{'+uploaded': 0,
 'yoyo': 1,
 'cookoutofthecentury': 2,
 'aot': 3,
 'scarlett': 4,
 'dealt': 5,
 'now.i': 6,
 'delete': 7,
 'pretty': 8,
 'stamford': 9,
 'gigwise': 10,
 'rooobbbbbiiieeeee': 11,
 'bmore': 12,
 "'little": 13,
 'fatass': 14,
 'alrer': 15,
 'exicted': 16,
 'sofas': 17,
 'reliable': 18,
 'fed': 19,
 'dunno': 20,
 "q'doba": 21,
 '2008': 22,
 'wks': 23,
 'arcadia': 24,
 'pero': 25,
 'leavers': 26,
 'derham': 27,
 'steals': 28,
 'rose': 29,
 'toget': 30,
 'geogeektv': 31,
 'crappier': 32,
 'ankit': 33,
 'wiii': 34,
 'logs': 35,
 'intrusive': 36,
 'shots': 37,
 'sbs': 38,
 'correspondents': 39,
 'haircuts': 40,
 'banned': 41,
 '36': 42,
 'workkk': 43,
 'bumukas': 44,
 'freddie': 45,
 'face|': 46,
 'wu': 47,
 'bcreative': 48,
 'hypervenilating': 49,
 'kaotic': 50,
 'digital': 51,
 'scotland': 52,
 'freekin': 53,
 'peroni': 54,
 'nakaka-hyper': 55,
 'freckle': 56,
 'cyalater': 57,
 'birdy': 58,
 'automobiles': 59,
 'bard': 60,
 'billing': 61,
 'wlw': 62,
 'solitude': 63,
 '1.1.s

In [25]:
vocab_len = len(vocab)
print(vocab_len)
labels = df.labels

32527


In [36]:
# Vectorize 
vector_content = []

for row in df.clean_content: 
    tokens = []
    for word in row: 
        idx = word2idx[word]
        tokens.append(idx)
    tokens = torch.LongTensor(tokens).to(device)
    vector_content.append(tokens)

vector_content

[tensor([11912, 10039,   425, 15510, 22716, 27862, 21963, 12688, 24681,  4854],
        device='cuda:0'),
 tensor([10892, 23499, 12886,  2279, 21056, 26764,  4073,  4134, 26764],
        device='cuda:0'),
 tensor([ 3734, 25937, 26764,  6256, 22415, 26764], device='cuda:0'),
 tensor([26712, 21751,  9633,  5326, 13181], device='cuda:0'),
 tensor([ 4377, 24158, 23061, 11615, 22680, 24254, 24752,  5629],
        device='cuda:0'),
 tensor([ 2151, 12411, 16690,  5374,  9797, 27844, 22316, 30798, 16690,  3784,
          9633], device='cuda:0'),
 tensor([16639, 24254, 24357, 13181, 21030, 23708, 16927,  4377,  5629, 27301,
          2044,  5629, 16429, 24254, 14841, 15837, 15171, 26712,  7089, 13181,
          4895, 13181], device='cuda:0'),
 tensor([31519,  5629], device='cuda:0'),
 tensor([26160,   891,  5629, 14569], device='cuda:0'),
 tensor([ 8176, 13833, 17619, 27301, 22415, 27844], device='cuda:0'),
 tensor([15186, 12183, 19764], device='cuda:0'),
 tensor([19977, 24321], device='cuda:0'

In [37]:
one_hot_content = []
for tensor in vector_content: 
    one_hot = F.one_hot(tensor, num_classes=vocab_len)
    one_hot_content.append(one_hot)

one_hot_content

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 4.00 GiB total capacity; 3.49 GiB already allocated; 0 bytes free; 3.52 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

# Model Implementation

In [None]:
def random_walk(start, length):
    walk = [str(start)] # starting node
    for i in range(length):
        neighbors = [node for node in G.neighbors(start)]
        next_node = np.random.choice(neighbors, 1)[0]
        walk.append(str(next_node))
        start = next_node
    return walk

In [18]:
texts = df.clean_content.to_list()
m_conf = config['model']
embedding_model = Word2Vec(texts,
                sg=m_conf['sg'], # Skip-gram
                vector_size=m_conf['max_seq_len'],
                min_count=m_conf['min_count'],
                window=m_conf['window'],
                workers=m_conf['workers'],
                seed=m_conf['seed'])

In [None]:
embedding_model.wv['oov']

In [29]:
def get_sentence_embedding(sentence, embedding_model=embedding_model): 
    embedding_vectors = []
    for word in sentence: 
        embedding_word = embedding_model.wv[word].tolist()
        embedding_vectors.append(embedding_word)

    return embedding_vectors

In [30]:
sample = df.iloc[1].clean_content
get_sentence_embedding(sample)

[[0.003538550343364477,
  -0.039276961237192154,
  -0.03537347912788391,
  0.018587317317724228,
  0.0003710710443556309,
  0.035546861588954926,
  0.03177529573440552,
  0.04119330272078514,
  -0.00612398236989975,
  0.006629781797528267,
  0.047561898827552795,
  0.08555041998624802,
  0.009939181618392467,
  -0.018305566161870956,
  0.014947901479899883,
  -0.07851515710353851,
  0.01988905295729637,
  0.002092757262289524,
  -0.05570698156952858,
  -0.04852515086531639,
  -0.04331056401133537,
  -0.018653076142072678,
  -0.05726311728358269,
  -0.016782527789473534,
  -0.019355641677975655,
  0.030038734897971153,
  0.004750273190438747,
  -0.059439681470394135,
  -0.033075690269470215,
  0.04902944341301918,
  -0.03916976600885391,
  -0.02622310258448124,
  -0.03355654701590538,
  -0.05663544684648514,
  -0.0030330284498631954,
  -0.004786680918186903,
  0.0180455781519413,
  0.03175083547830582,
  0.005862732417881489,
  0.059311747550964355,
  -0.016771439462900162,
  -0.0557851