In [1]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

In [6]:
import re, string 
import pandas as pd   
from collections import defaultdict
import spacy
from sklearn.manifold import TSNE
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
%matplotlib inline

# Load the BBC articles. The file is expected to hold exactly one column,
# which is given the name 'news_article' (a different column count raises).
df = pd.read_csv('bbc_data.csv').set_axis(['news_article'], axis='columns')
def clean_text(text):
    '''Normalize one raw article into a space-joined string of content tokens.

    Steps, in order: lowercase the text, delete spans in square brackets,
    delete every ASCII punctuation character, delete every word that
    contains a digit, tokenize with ``word_tokenize`` and drop the tokens
    found in ``STOPWORDS``.

    Args:
        text: Raw article text. Non-string values (e.g. NaN for an empty
            CSV cell) are treated as missing.

    Returns:
        The cleaned text as a single string. Missing input, or text with two
        or fewer tokens (counted before stop-word removal), yields an empty
        string so that later string processing never receives ``None``.
    '''
    # Missing cells arrive as None/NaN, which have no .lower().
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    tokens = word_tokenize(text)
    # Texts of two tokens or fewer are too short to keep.
    if len(tokens) <= 2:
        return ''
    return ' '.join(word for word in tokens if word not in STOPWORDS)

# Clean every article and keep the result as a one-column frame that
# carries over the 'news_article' column name.
df_clean = df['news_article'].apply(clean_text).to_frame()
print(df_clean)

                                           news_article
0     claxton hunting first major medal british hurd...
1     osullivan could run worlds sonia osullivan ind...
2     greene sets sights world title maurice greene ...
3     iaaf launches fight drugs iaaf athletics world...
4     dibaba breaks world record ethiopias tirunesh ...
...                                                 ...
2220  trial begins spains top banker trial emilio bo...
2221  uk economy ends year spurt uk economy grew est...
2222  healthsouth exboss goes trial former head us m...
2223  euro firms miss optimism large companies aroun...
2224  lacroix label bought us firm luxury goods grou...

[2225 rows x 1 columns]


In [12]:
# Tokenize each cleaned article. Entries that are None/NaN are skipped
# because word_tokenize only accepts strings.
tokenized_corpus = df_clean['news_article'].dropna().apply(word_tokenize)

# dict.fromkeys de-duplicates while keeping first-seen order, using hash
# lookups instead of scanning a growing list for every token.
vocabulary = list(dict.fromkeys(
    token for sentence in tokenized_corpus for token in sentence
))

# Lookup tables in both directions: token -> index and index -> token.
word2idx = {w: idx for (idx, w) in enumerate(vocabulary)}
idx2word = {idx: w for (idx, w) in enumerate(vocabulary)}

vocabulary_size = len(vocabulary)

In [13]:
# Display the token -> index lookup built in the previous cell.
word2idx

{'claxton': 0,
 'hunting': 1,
 'first': 2,
 'major': 3,
 'medal': 4,
 'british': 5,
 'hurdler': 6,
 'sarah': 7,
 'confident': 8,
 'win': 9,
 'next': 10,
 'months': 11,
 'european': 12,
 'indoor': 13,
 'championships': 14,
 'madrid': 15,
 'already': 16,
 'smashed': 17,
 'record': 18,
 'hurdles': 19,
 'twice': 20,
 'season': 21,
 'setting': 22,
 'new': 23,
 'mark': 24,
 'seconds': 25,
 'aaas': 26,
 'title': 27,
 'quite': 28,
 'said': 29,
 'take': 30,
 'race': 31,
 'comes': 32,
 'long': 33,
 'keep': 34,
 'training': 35,
 'much': 36,
 'think': 37,
 'chance': 38,
 'national': 39,
 'past': 40,
 'three': 41,
 'years': 42,
 'struggled': 43,
 'translate': 44,
 'domestic': 45,
 'success': 46,
 'international': 47,
 'stage': 48,
 'scotlandborn': 49,
 'athlete': 50,
 'owns': 51,
 'equal': 52,
 'fifthfastest': 53,
 'time': 54,
 'world': 55,
 'year': 56,
 'last': 57,
 'weeks': 58,
 'birmingham': 59,
 'grand': 60,
 'prix': 61,
 'left': 62,
 'favourite': 63,
 'russian': 64,
 'irina': 65,
 'shevchenko'

In [14]:
window_size = 3
idx_pairs = []
# Build (center, context) index pairs, one article at a time.
for sentence in tokenized_corpus:
    indices = [word2idx[word] for word in sentence]
    n_tokens = len(indices)
    # Every position takes a turn as the center word.
    for center_word_pos, center_word_idx in enumerate(indices):
        # Clamp the window so it never reaches outside the article.
        first = max(center_word_pos - window_size, 0)
        last = min(center_word_pos + window_size, n_tokens - 1)
        for context_word_pos in range(first, last + 1):
            # A word is not its own context.
            if context_word_pos != center_word_pos:
                idx_pairs.append((center_word_idx, indices[context_word_pos]))

idx_pairs = np.array(idx_pairs)  # array form: later cells slice it and iterate its rows

In [15]:
# Peek at the first ten (center, context) index pairs.
idx_pairs[:10]

array([[0, 1],
       [0, 2],
       [0, 3],
       [0, 4],
       [1, 0],
       [1, 2],
       [1, 3],
       [1, 4],
       [1, 5],
       [2, 0]])

In [16]:
def get_input_layer(word_idx):
    """Return the one-hot float32 vector for ``word_idx``.

    The result is a 1-D tensor of length ``vocabulary_size`` that is zero
    everywhere except for a 1.0 at position ``word_idx``.
    """
    one_hot = torch.zeros(vocabulary_size, dtype=torch.float32)
    one_hot[word_idx] = 1.0
    return one_hot
  
  # The input layer is just the center word, one-hot encoded: a 1-D tensor of shape [vocabulary_size].

In [17]:
embedding_dims = 5
# Leaf tensors created with requires_grad=True are tracked by autograd
# directly; the deprecated Variable wrapper is not needed.
# W1 @ one_hot(center) picks out the W1 column for the center word.
W1 = torch.randn(embedding_dims, vocabulary_size, dtype=torch.float32, requires_grad=True)
# W2 @ embedding yields one score per vocabulary word.
W2 = torch.randn(vocabulary_size, embedding_dims, dtype=torch.float32, requires_grad=True)
num_epochs = 1010
learning_rate = 0.001

# Plain SGD with one update per (center, context) pair.
# NOTE: the inner loop runs in Python once per pair, so a full epoch over a
# large idx_pairs array is slow.
for epo in range(num_epochs):
    loss_val = 0
    for data, target in idx_pairs:
        x = get_input_layer(data)
        y_true = torch.tensor([int(target)], dtype=torch.long)

        z1 = torch.matmul(W1, x)
        z2 = torch.matmul(W2, z1)

        log_softmax = F.log_softmax(z2, dim=0)

        # nll_loss expects a (batch, classes) input, hence the view to (1, V).
        loss = F.nll_loss(log_softmax.view(1, -1), y_true)
        loss_val += loss.item()
        loss.backward()

        # Update the weights outside the autograd graph, then clear the
        # gradients so they do not accumulate into the next step.
        with torch.no_grad():
            W1 -= learning_rate * W1.grad
            W2 -= learning_rate * W2.grad
            W1.grad.zero_()
            W2.grad.zero_()
    if epo % 10 == 0:
        print(f'Loss at epo {epo}: {loss_val/len(idx_pairs)}')

KeyboardInterrupt: 