In [1]:
import pandas as pd
import numpy as np
import torch
from nltk.util import ngrams
import datasets
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
data = pd.DataFrame()

In [16]:
with open("/home/thasin/class-projects/project-pytorch/lemmatized_tweets.pkl","rb") as file:
    data['text'] = pickle.load(file)

In [17]:
data.head()

Unnamed: 0,text
0,"[twitpic, com, zl, awww, bummer, shoulda, get,..."
1,"[upset, update, facebook, texte, cry, result, ..."
2,"[manage, save, rest, bound]"
3,"[body, feel, itchy, like, fire]"
4,"[behave, mad]"


In [18]:
str = list(data['text'].apply(lambda x: " ".join(x)))

In [19]:
len(str)

1600000

In [20]:
str = list(filter(lambda x: len(x) > 1,str))

In [21]:
str

['twitpic com zl awww bummer shoulda get david carr day',
 'upset update facebook texte cry result school today blah',
 'manage save rest bound',
 'body feel itchy like fire',
 'behave mad',
 'need hug',
 'yes rain bit bit lol fine thank',
 'nope',
 'spring break plain city snowing',
 'pierce ear',
 'think ua loss embarrass',
 'know talk anymore',
 'gun zac snyder doucheclown',
 'miss iamlilnicki premiere',
 'hollis death scene hurt severely watch film wry director cut',
 'file taxis',
 'drink forget table drink',
 'friend call asked meet mid valley today time sigh',
 'barista bake cake ate',
 'week go hope',
 'blagh class tomorrow',
 'hate wake people',
 'go cry sleep watch marley',
 'sad miss lilly',
 'ooooh lol leslie ok leslie mad',
 'meh lover exception track get depressed time',
 'hack account aim new',
 'think sleeping option tomorrow realize evaluation morning work afternoon',
 'miss',
 'ok sick spend hour sit shower sick stand hold puke like champ bed',
 'bed time come gmt',
 

In [28]:
first_100k_row = str[:100000]

In [29]:
len(first_100k_row)

100000

In [30]:
with open("cleaned_tweet.txt", "wt") as file:
    for row in first_100k_row:
        file.write(f"{row}\n")

<h5>Now took every tweet and make tuples of trigram</h5>

In [2]:
dset = datasets.load_dataset("text",data_files={"train":"cleaned_tweet.txt"})

In [3]:
dset

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 100000
    })
})

In [4]:
dset['train']['text'][0:10]

['twitpic com zl awww bummer shoulda get david carr day',
 'upset update facebook texte cry result school today blah',
 'manage save rest bound',
 'body feel itchy like fire',
 'behave mad',
 'need hug',
 'yes rain bit bit lol fine thank',
 'nope',
 'spring break plain city snowing',
 'pierce ear']

In [5]:
def tokenize_tweets(single_row):
    
    single_row["tokenized_tweets"] = single_row['text'].split()
    return single_row

In [6]:
dset = dset.map(tokenize_tweets)

In [7]:
dset

DatasetDict({
    train: Dataset({
        features: ['text', 'tokenized_tweets'],
        num_rows: 100000
    })
})

In [8]:
dset['train']['tokenized_tweets']

[['twitpic',
  'com',
  'zl',
  'awww',
  'bummer',
  'shoulda',
  'get',
  'david',
  'carr',
  'day'],
 ['upset',
  'update',
  'facebook',
  'texte',
  'cry',
  'result',
  'school',
  'today',
  'blah'],
 ['manage', 'save', 'rest', 'bound'],
 ['body', 'feel', 'itchy', 'like', 'fire'],
 ['behave', 'mad'],
 ['need', 'hug'],
 ['yes', 'rain', 'bit', 'bit', 'lol', 'fine', 'thank'],
 ['nope'],
 ['spring', 'break', 'plain', 'city', 'snowing'],
 ['pierce', 'ear'],
 ['think', 'ua', 'loss', 'embarrass'],
 ['know', 'talk', 'anymore'],
 ['gun', 'zac', 'snyder', 'doucheclown'],
 ['miss', 'iamlilnicki', 'premiere'],
 ['hollis',
  'death',
  'scene',
  'hurt',
  'severely',
  'watch',
  'film',
  'wry',
  'director',
  'cut'],
 ['file', 'taxis'],
 ['drink', 'forget', 'table', 'drink'],
 ['friend', 'call', 'asked', 'meet', 'mid', 'valley', 'today', 'time', 'sigh'],
 ['barista', 'bake', 'cake', 'ate'],
 ['week', 'go', 'hope'],
 ['blagh', 'class', 'tomorrow'],
 ['hate', 'wake', 'people'],
 ['go', 'c

In [None]:
# dset['train'] = dset['train'].map(tokenize_tweets)


In [10]:
def convert_to_trigram(single_row):
    
    single_row['tri_gram'] = list(ngrams(single_row['tokenized_tweets'],n = 3))
    return single_row

In [11]:
dset['train'] = dset['train'].map(convert_to_trigram)

In [12]:
dset['train']['tri_gram'][0]

[['twitpic', 'com', 'zl'],
 ['com', 'zl', 'awww'],
 ['zl', 'awww', 'bummer'],
 ['awww', 'bummer', 'shoulda'],
 ['bummer', 'shoulda', 'get'],
 ['shoulda', 'get', 'david'],
 ['get', 'david', 'carr'],
 ['david', 'carr', 'day']]

In [22]:
str[0]

'twitpic com zl awww bummer shoulda get david carr day'

In [23]:
vocabulary = set()

for raw_text in str[:100000]:
    vocabulary.update(raw_text.split())

In [24]:
len(vocabulary)

43953

In [25]:
vocab2idx = dict(zip(vocabulary,range(len(vocabulary))))

In [26]:
len(vocab2idx)

43953

In [27]:
def convert_to_bigrams(single_row):
    
    center_token_target_token_pair = list()
    
    for single_trigram in single_row['tri_gram']:
        bigram = list()
        
        bigram.append([vocab2idx[single_trigram[1]], vocab2idx[single_trigram[0]]])
        bigram.append([vocab2idx[single_trigram[1]], vocab2idx[single_trigram[2]]])
        
        center_token_target_token_pair.append(bigram)
        
        single_row['tri_gram'] = center_token_target_token_pair
        
    return single_row

In [28]:
dset['train'] = dset['train'].map(convert_to_bigrams)

Map: 100%|██████████| 100000/100000 [00:11<00:00, 8700.78 examples/s]


In [29]:
dset['train']['tri_gram'][0]

[[[15513, 19304], [15513, 5651]],
 [[5651, 15513], [5651, 27425]],
 [[27425, 5651], [27425, 38725]],
 [[38725, 27425], [38725, 11291]],
 [[11291, 38725], [11291, 36302]],
 [[36302, 11291], [36302, 11619]],
 [[11619, 36302], [11619, 35499]],
 [[35499, 11619], [35499, 31807]]]

In [30]:
input_target_token_pair = list()

for single_tweet_bigram in dset['train']['tri_gram']:

    for bigram_list in single_tweet_bigram:
        
        input_target_token_pair.append(bigram_list[0])
        input_target_token_pair.append(bigram_list[1])
        

In [31]:
input_target_token_pair[:4]

[[15513, 19304], [15513, 5651], [5651, 15513], [5651, 27425]]

In [36]:
class SkipGramDataset(torch.utils.data.Dataset):
    
    def __init__(self,input_target_pair):
        self.data = input_target_pair
        
    def __getitem__(self,index):
        return self.data[index] 
    
    def __len__(self):
        return len(self.data)
      

In [37]:
training_data_obj = SkipGramDataset(input_target_token_pair)

In [38]:
import os

In [43]:
training_data_generator = torch.utils.data.DataLoader(training_data_obj, batch_size = 32, num_workers = os.cpu_count())

In [46]:
training_data_generator

<torch.utils.data.dataloader.DataLoader at 0x7f899347a8e0>

In [47]:
class Word2VecSkipGramNN(torch.nn.Module):
    
    def __init__(self,vocabulary_size,topic_vector_dim):
        super().__init__()
        
        self.hidden_layer = torch.nn.Embedding(num_embeddings=vocabulary_size, embedding_dim=topic_vector_dim)
        #linear layer
        self.output_layer = torch.nn.Linear(in_features=topic_vector_dim, out_features=vocabulary_size)
        
        self.output_layer_activation = torch.nn.Softmax()
        
    def forward(self,center_token):
        embedding_layer_out = self.hidden_layer(center_token)
        linear_layer_out = self.output_layer(embedding_layer_out)
        
        nn_out = self.output_layer_activation(linear_layer_out)   
        
        return nn_out  # (64,43953)     

In [48]:
our_word2vec_skip_gram_nw = Word2VecSkipGramNN(len(vocab2idx), 64)