# Imports and downloads

In [1]:
import torch
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math 
import string
import re
from sklearn.pipeline import Pipeline
from nltk.tokenize import wordpunct_tokenize
import nltk

In [2]:
torch.cuda.empty_cache()


In [3]:
torch.cuda.memory_allocated()

0

In [4]:
torch.cuda.memory_reserved()

0

In [5]:
# Fetch every NLTK resource the notebook reads from.
for package in ("brown", "movie_reviews", "stopwords", "sentence_polarity", "gutenberg"):
    nltk.download(package)

# Kept as the cell's final expression so its return value is still displayed.
nltk.download("words")


[nltk_data] Downloading package brown to /home/sebas/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package movie_reviews to
[nltk_data]     /home/sebas/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sebas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package sentence_polarity to
[nltk_data]     /home/sebas/nltk_data...
[nltk_data]   Package sentence_polarity is already up-to-date!
[nltk_data] Downloading package gutenberg to /home/sebas/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!
[nltk_data] Downloading package words to /home/sebas/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [6]:
from nltk.corpus import stopwords
from nltk.corpus import words as nltk_vocabulary
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import brown, movie_reviews


# Creating the corpus for word2vec training

In [7]:
# English stop words, used later to filter the corpus.
stops = set(stopwords.words('english'))
# NOTE: this rebinds the name of the imported `words` corpus reader to a plain
# word list, so the import cell has to be re-run before this cell is re-run.
nltk_vocabulary = nltk_vocabulary.words()

# Resolve the corpus folders through NLTK's data search path instead of a
# machine-specific absolute path.
root = nltk.data.find('corpora/sentence_polarity')
# Raw string: "\." inside a normal string literal is an invalid escape sequence.
sentence_polarity = PlaintextCorpusReader(root, r".*\.txt")

root = nltk.data.find('corpora/gutenberg')
gutenberg_text = PlaintextCorpusReader(root, "melville-moby_dick.txt")
gutenberg_text_2 = PlaintextCorpusReader(root, "whitman-leaves.txt")
gutenberg_text_3 = PlaintextCorpusReader(root, "bible-kjv.txt")

In [8]:
# Concatenate the tokens of every source, in a fixed order, into one flat list.
corpus = []
for reader in (
    brown,
    movie_reviews,
    sentence_polarity,
    gutenberg_text,
    gutenberg_text_2,
    gutenberg_text_3,
):
    corpus.extend(reader.words())

In [9]:
len(corpus)

4414840

## Text processing

In [10]:
corpus_aux = [x.lower() for x in corpus]

In [11]:
# Keep only content tokens. `in string.punctuation` is a substring test on the
# punctuation string, so the multi-character quote/dash tokens are listed explicitly.
corpus = [
    token
    for token in corpus_aux
    if token not in stops
    and token not in string.punctuation
    and token not in ("''", "``", "--")
    and not token.isnumeric()
]


In [12]:
len(corpus)

1902441

In [13]:
%pprint

Pretty printing has been turned OFF


In [14]:
corpus[:100]

['fulton', 'county', 'grand', 'jury', 'said', 'friday', 'investigation', "atlanta's", 'recent', 'primary', 'election', 'produced', 'evidence', 'irregularities', 'took', 'place', 'jury', 'said', 'term-end', 'presentments', 'city', 'executive', 'committee', 'over-all', 'charge', 'election', 'deserves', 'praise', 'thanks', 'city', 'atlanta', 'manner', 'election', 'conducted', 'september-october', 'term', 'jury', 'charged', 'fulton', 'superior', 'court', 'judge', 'durwood', 'pye', 'investigate', 'reports', 'possible', 'irregularities', 'hard-fought', 'primary', 'mayor-nominate', 'ivan', 'allen', 'jr.', 'relative', 'handful', 'reports', 'received', 'jury', 'said', 'considering', 'widespread', 'interest', 'election', 'number', 'voters', 'size', 'city', 'jury', 'said', 'find', 'many', "georgia's", 'registration', 'election', 'laws', 'outmoded', 'inadequate', 'often', 'ambiguous', 'recommended', 'fulton', 'legislators', 'act', 'laws', 'studied', 'revised', 'end', 'modernizing', 'improving', 'g

In [15]:
%pprint

Pretty printing has been turned ON


# Creating necessary structures for Word2Vec training

In [16]:
corpus_series = pd.Series(corpus)

In [17]:
#corpus_series.unique()

In [18]:
word_counts = corpus_series.value_counts()
word_counts

one             13159
film            11217
shall           10507
unto             9023
lord             8212
                ...  
rilly               1
unprocurable        1
moire               1
doled               1
chrysoprasus        1
Name: count, Length: 80357, dtype: int64

In [19]:
# Words must occur strictly more than `min_freq` times to enter the vocabulary.
min_freq = 20
is_frequent = word_counts.gt(min_freq)
relevant_words = word_counts.loc[is_frequent]
#list(relevant_words.index)[:20]

In [20]:
# The vocabulary is the index of `relevant_words`; each word's id is its
# position in that index.
words = pd.Series(relevant_words.index, name='index')
indices = pd.Series(range(len(words)), name='level_0')

# Two-way lookup between integer ids and words.
ind_to_words_dict = {ind: word for ind, word in zip(indices, words)}
words_to_ind_dict = {word: ind for ind, word in ind_to_words_dict.items()}
dict_len = len(words_to_ind_dict)
dict_len

11216

In [21]:
#words

In [22]:
# One boolean per corpus token: True when the token is in the vocabulary.
set_words = set(words.values)
corpus_valids = list(map(set_words.__contains__, corpus))
# Number of corpus tokens covered by the vocabulary.
sum(corpus_valids)

1649424

# Creating the training dataset with context-target pairs and negative sampling

In [23]:
def get_context_pairs(i, words, word):
    """Collect the in-vocabulary context words around corpus position ``i``.

    Args:
        i: Position of the centre word in the corpus; used to index the
            module-level ``corpus_valids`` list.
        words: The window of tokens centred on position ``i``.
        word: The centre word (not used by the function).

    Returns:
        A pair ``(positive_exs, valids)``: the context tokens whose corpus
        position is flagged valid, and their offsets relative to the centre.
    """
    # Offset assigned to the first element of the window.
    first_offset = int(-((len(words) - 1) / 2))

    positive_exs = []
    valids = []
    for position, candidate in enumerate(words):
        offset = first_offset + position
        # Skip the centre word itself and any out-of-vocabulary neighbour.
        if offset == 0 or not corpus_valids[i + offset]:
            continue
        positive_exs.append(candidate)
        valids.append(offset)

    return positive_exs, valids

In [24]:
context_window = 5
n_negatives = 10  # negative samples drawn for every positive (target, context) pair
training_set = list()

# Seed the global NumPy RNG so the sampled negatives are reproducible.
np.random.seed(0)

last_index = len(corpus) - 1
for i, word in enumerate(corpus):
    if (i % 500000) == 0:
        print(i)
    # Skip positions whose window would run past either end of the corpus,
    # and centre words that are not in the vocabulary.
    if i - context_window < 0 or i + context_window > last_index:
        continue
    if not corpus_valids[i]:
        continue

    con_words = corpus[i - context_window : i + context_window + 1]
    # Only the offsets of the in-vocabulary neighbours are needed here.
    _, valids = get_context_pairs(i, con_words, word)

    target_ind = words_to_ind_dict[word]
    context_inds = [words_to_ind_dict[corpus[i + val]] for val in valids]
    # Ids that must never be drawn as negatives for this centre word.
    forbidden_ints = set(context_inds)

    for val, context_ind in zip(valids, context_inds):
        training_set.append((target_ind, context_ind, 1.))
        # Close neighbours are also added with the roles swapped.
        if abs(val) < 3:
            training_set.append((context_ind, target_ind, 1.))

        # Rejection-sample negatives uniformly over the vocabulary.
        n_sampled = 0
        while n_sampled < n_negatives:
            a = np.random.randint(0, dict_len)
            if a not in forbidden_ints:
                n_sampled += 1
                training_set.append((target_ind, a, 0.))



0
500000
1000000
1500000


In [25]:
ind_to_words_dict[training_set[74512][0]], ind_to_words_dict[training_set[74512][1]], training_set[74512][2]

('house', 'saying', 0.0)

In [26]:
len(training_set)

164541480

# Creating model and pytorch tensors for training and testing

In [27]:
# Device used by the following cells when they call `.to(device)`; the name is
# reassigned in a later cell before the model is created.
device = "cpu"
# Earlier automatic device selection, currently disabled:
#(
#    "cuda:0"
#    if torch.cuda.is_available()
#    else "mps"
#    if torch.backends.mps.is_available()
#    else "cpu"
#)
print(f"Using {device} device")

Using cpu device


In [28]:
class word2vec(torch.nn.Module):
    """Two-table embedding model that scores (target, context) id pairs.

    Args:
        vocab_size: Number of rows in each embedding table. Defaults to the
            module-level ``dict_len`` when omitted.
        emb_dim: Width of each embedding vector (default 200).
    """

    def __init__(self, vocab_size=None, emb_dim=200):
        super().__init__()
        if vocab_size is None:
            vocab_size = dict_len
        self.emb_targ = torch.nn.Embedding(vocab_size, emb_dim)
        self.emb_con = torch.nn.Embedding(vocab_size, emb_dim)

    def forward(self, x1, x2):
        """Return one raw score (logit) per pair.

        Args:
            x1: 1-D integer tensor of target word ids.
            x2: 1-D integer tensor of context word ids, same length as ``x1``.

        Returns:
            1-D tensor holding the dot product of each target embedding with
            the matching context embedding.
        """
        # Follow the device the weights actually live on, rather than a
        # global variable that may have been changed since the model was built.
        model_device = self.emb_targ.weight.device
        word_emb = self.emb_targ(x1.to(model_device)).unsqueeze(1)       # (B, 1, D)
        context_emb = self.emb_con(x2.to(model_device)).unsqueeze(2)     # (B, D, 1)
        # Batched (1 x D) @ (D x 1) product = per-pair dot product.
        out = word_emb.bmm(context_emb)
        return torch.flatten(out)

In [29]:
#training_set[:10]

In [None]:
# Convert the list of (target, context, label) tuples to an array once,
# instead of rebuilding it for every column.
training_arr = np.asarray(training_set, dtype=np.float64)

# Column 0: target word ids.
X1 = torch.from_numpy(training_arr[:, 0].astype(np.int32)).to(device)

print("done")

# Column 1: context word ids.
X2 = torch.from_numpy(training_arr[:, 1].astype(np.int32)).to(device)

print("done")

# Column 2: labels (1.0 for observed pairs, 0.0 for sampled negatives).
Y = torch.from_numpy(training_arr[:, 2].astype(np.float32)).to(device)

print("done")

# Release the intermediate array; only the three tensors are used from here on.
del training_arr


In [None]:
# Draw one random permutation and apply it to all three tensors so that
# targets, contexts and labels stay aligned.
indices_ = torch.randperm(len(X1))

X1, X2, Y = (tensor[indices_] for tensor in (X1, X2, Y))

In [None]:
# Cut each tensor into consecutive mini-batches of 4096 rows (the last may be shorter).
batch_rows = 4096
X1_batches, X2_batches, Y_batches = (
    torch.split(tensor, batch_rows, dim=0) for tensor in (X1, X2, Y)
)

len(X1_batches)

In [None]:
len(Y_batches)

In [None]:
X1_batches[0].size()

In [None]:
# Hold out the last 3000 mini-batches for testing; everything before is training data.
n_test_batches = 3000

train_X1, test_X1 = X1_batches[:-n_test_batches], X1_batches[-n_test_batches:]
train_X2, test_X2 = X2_batches[:-n_test_batches], X2_batches[-n_test_batches:]
train_Y, test_Y = Y_batches[:-n_test_batches], Y_batches[-n_test_batches:]

In [None]:
X1_batches[1]

In [None]:
# Train on the GPU when one is available; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [None]:
w2v = word2vec().to(device)

# Training the model

In [None]:
import torch.optim as optim

# The model returns raw scores, so the loss applies the sigmoid itself.
criterion = torch.nn.BCEWithLogitsLoss()
# NOTE(review): the trailing `momentum=0.9` fragment is commented out and has no effect;
# presumably a leftover from a different optimizer setup.
optimizer = optim.Adam(w2v.parameters(), lr=0.002)#, momentum=0.9)

In [None]:
losses = []
log_every = 2000  # number of mini-batches between two loss reports

for epoch in range(10):  # full passes over the training batches

    running_loss = 0.0

    batches = zip(train_X1, train_X2, train_Y)
    for i, (x1, x2, y) in enumerate(batches):
        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        # Forward pass: one logit per (target, context) pair.
        logits = w2v(x1.to(device), x2.to(device)).type(torch.float32)
        targets = torch.flatten(y.to(device)).type(torch.float32)

        # Backward pass and parameter update.
        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        # Report and record the mean loss of the last `log_every` mini-batches.
        running_loss += loss.item()
        if (i + 1) % log_every == 0:
            mean_loss = running_loss / log_every
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {mean_loss:.3f}')
            losses.append(mean_loss)
            running_loss = 0.0

print('Finished Training')

In [None]:
plt.plot(range(len(losses)), losses)

# Testing the model

In [None]:
test_X1_ = torch.cat(test_X1)
test_X2_ = torch.cat(test_X2)
test_Y_ = torch.cat(test_Y)

In [None]:
test_Y_.size()

## Testing on target-context pairs

In [None]:
correct = 0
total = 0
n_eval = 1000000   # upper bound on the number of held-out pairs evaluated
eval_batch = 4096  # pairs scored per forward pass

eval_X1 = test_X1_[:n_eval]
eval_X2 = test_X2_[:n_eval]
eval_Y = test_Y_[:n_eval]

# No gradients are needed for evaluation.
with torch.no_grad():
    # Score the pairs in batches instead of one forward pass per pair.
    for start in range(0, len(eval_Y), eval_batch):
        stop = start + eval_batch
        outputs = w2v(eval_X1[start:stop], eval_X2[start:stop])
        # A positive logit means the pair is predicted to be a true context pair.
        predicted = (outputs > 0).float()
        # Labels must sit on the same device as the predictions to be compared.
        y = eval_Y[start:stop].to(outputs.device).float()
        total += y.numel()
        correct += (predicted == y).sum().item()

In [None]:
correct/total

In [None]:
Embeds = w2v.emb_con.weight.cpu().detach().numpy()

## Testing on common sense closest words

In [None]:
words_to_ind_dict['green']

In [None]:
word1 = 'bad'

In [None]:
#Embeds[913]

In [None]:
diffs = list()
val = Embeds[words_to_ind_dict[word1]]/abs(Embeds[words_to_ind_dict[word1]]).sum()
for i,word2 in enumerate(list(words_to_ind_dict.keys())):
    a = abs(Embeds[words_to_ind_dict[word2]]).sum()
    if a != 0:
        diff_vec = abs(val - Embeds[words_to_ind_dict[word2]]/a).sum()
        diffs.append([i, diff_vec])
best_ws = sorted(diffs, key=lambda x: x[1])

In [None]:
[list(words_to_ind_dict.keys())[x[0]] for x in best_ws[0:6]]

In [None]:
# Copy the target-embedding table to NumPy and scale every row to unit L2 norm.
embeddings_ = w2v.emb_targ.weight.cpu().detach().numpy()
norms = (embeddings_ ** 2).sum(axis=1, keepdims=True) ** (1 / 2)
embeddings_ = embeddings_ / norms

def get_similar_words(word, n, embeddings=None, word_to_ind=None, ind_to_word=None):
    """Return the ``n`` words whose embeddings are most similar to ``word``.

    Similarity is the dot product between rows of ``embeddings``.

    Args:
        word: Query word.
        n: Number of neighbours to return.
        embeddings: 2-D array with one row per word id. Defaults to the
            module-level ``embeddings_``.
        word_to_ind: Mapping word -> id. Defaults to ``words_to_ind_dict``.
        ind_to_word: Mapping id -> word. Defaults to ``ind_to_words_dict``.

    Returns:
        A dict mapping each neighbour word to its similarity, ordered from
        most to least similar, or ``None`` (after printing a message) when
        ``word`` is not in the vocabulary.
    """
    if embeddings is None:
        embeddings = embeddings_
    if word_to_ind is None:
        word_to_ind = words_to_ind_dict
    if ind_to_word is None:
        ind_to_word = ind_to_words_dict

    # Id 0 is a regular vocabulary entry, so "unknown" has to be detected by
    # absence from the mapping, not by comparing the id with 0.
    word_id = word_to_ind.get(word)
    if word_id is None:
        print("Out of vocabulary word")
        return

    dists = np.matmul(embeddings, embeddings[word_id])
    ranked_ids = np.argsort(-dists)
    # Drop the query word by id rather than assuming it is ranked first.
    topN_ids = ranked_ids[ranked_ids != word_id][:n]

    topN_dict = {}
    for sim_word_id in topN_ids:
        topN_dict[ind_to_word[int(sim_word_id)]] = dists[sim_word_id]
    return topN_dict


get_similar_words('great', 10)

In [None]:
#((embeddings_[words_to_ind_dict['weak']] - embeddings_[words_to_ind_dict['strong']]) ** 2).sum() ** (1 / 2)

In [None]:
# Analogy query vector: weaker - weak + strong, built from the unit-length
# rows of `embeddings_`.
val = (embeddings_[words_to_ind_dict['weaker']] - embeddings_[words_to_ind_dict['weak']]) + embeddings_[words_to_ind_dict['strong']]
val = val / np.linalg.norm(val)

# Compare against the same embedding table the query was built from
# (previously the candidates came from a different, differently normalised table).
# With unit-length rows the dot product is the cosine similarity.
sims = np.matmul(embeddings_, val)
diffs = [[i, 1.0 - sim] for i, sim in enumerate(sims)]
best_ws = sorted(diffs, key=lambda x: x[1])

# Word ids are positions in the vocabulary, so the id -> word map applies directly.
[ind_to_words_dict[x[0]] for x in best_ws[0:6]]

In [None]:
Embeds = pd.DataFrame(Embeds)

In [None]:
#Embeds

In [None]:
# Single-column frame (column label 0) listing the vocabulary words in id order.
vocabulary_words = pd.Series(list(words_to_ind_dict.keys()), dtype=object)
words_dict_df = pd.DataFrame({0: vocabulary_words})

In [None]:
#words_dict_df

# Saving embeddings and dictionary

In [None]:
from numpy import savetxt

Embeds.to_csv("Embeddings.csv")
words_dict_df.to_csv("words_dict.csv")

In [None]:
#pd.read_csv("Embeddings.csv", index_col=0)
#pd.read_csv("words_dict.csv", index_col=0)

In [None]:
def get_CO_matrix_t(all_sentences, window_size=8, word_to_ind=None):
    """Build a distance-weighted forward co-occurrence matrix.

    For every in-vocabulary word at position ``i`` and every in-vocabulary
    word at position ``i + k`` (``1 <= k <= window_size``) of the same
    sentence, ``window_size + 1 - k`` is added to
    ``CO_matrix[id(following word)][id(current word)]``.

    Args:
        all_sentences: Iterable of strings; each is split on single spaces.
        window_size: How many following tokens are considered (default 8).
        word_to_ind: Mapping word -> integer id. Defaults to the module-level
            ``words_to_ind_dict`` (with the matrix sized by ``dict_len``).

    Returns:
        A square float NumPy array of co-occurrence weights.
    """
    if word_to_ind is None:
        word_to_ind = words_to_ind_dict
        n_words = dict_len
    else:
        n_words = max(word_to_ind.values(), default=-1) + 1

    CO_matrix = np.zeros((n_words, n_words))
    for sentence in all_sentences:
        # Resolve ids once per sentence. None marks an out-of-vocabulary token,
        # which still occupies a position inside the window.
        token_inds = [word_to_ind.get(token) for token in sentence.split(' ')]
        for i, ind_curr_word in enumerate(token_inds):
            if ind_curr_word is None:
                continue
            following = token_inds[i + 1 : i + 1 + window_size]
            for k, ind_next_word in enumerate(following, start=1):
                if ind_next_word is not None:
                    # Nearer neighbours receive a larger weight.
                    CO_matrix[ind_next_word][ind_curr_word] += window_size + 1 - k

    return CO_matrix