In [174]:
import torch
from torch.autograd import Variable
import numpy as np
import torch.functional as F
import torch.nn.functional as F

import re, string 
import pandas as pd   
from collections import defaultdict
import spacy
import nltk

from sklearn.manifold import TSNE
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords

from time import time  
from matplotlib import pyplot as plt
%matplotlib inline

In [175]:
# Fetch the NLTK resources used by this notebook, then cache the English
# stopword list under a module-level constant.
for resource in ('popular', 'stopwords'):
    nltk.download(resource)

STOPWORDS = stopwords.words('english')

[nltk_data] Downloading collection 'popular'
[nltk_data]    | 
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package gazetteers to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package gazetteers is already up-to-date!
[nltk_data]    | Downloading package genesis to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package genesis is already up-to-date!
[nltk_data]    | Downloading package gutenberg to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package gutenberg is already up-to-date!
[nltk_data]    | Downloading package inaugural to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package inaugural is already up-to-date!
[nltk_data]    | Downloading package movie_reviews to
[nltk_data]    |     /Users/mirko/nltk_data...
[nltk_data]    |   Package movie_reviews is already up-to

In [176]:
# Read the raw articles from the CSV file in the working directory.
dataset = pd.read_csv(
    'bbc_data.csv',
)

def clean_text(text, stopwords=None):
    """Normalise raw text into a lowercase, punctuation-free, stopword-free string.

    Steps, in order: lowercase the text; remove square-bracketed spans
    (``[...]``); remove every character in ``string.punctuation``; remove any
    token that contains a digit; drop stopwords and collapse whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean. Non-string values (for example ``None`` or a float
        NaN) are treated as empty text.
    stopwords : iterable of str, optional
        Words to drop from the result. When omitted, the notebook-level
        ``STOPWORDS`` list is used.

    Returns
    -------
    str
        The cleaned text. An empty string is returned when ``text`` is not a
        string or when at most two characters remain after stripping, so
        callers always receive a ``str`` (never ``None``).
    """
    # Guard against non-string cells, which have no ``.lower()``.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Too little content left to be useful; return an empty string rather than
    # falling through to an implicit None, which breaks later string handling.
    if len(text) <= 2:
        return ''

    if stopwords is None:
        stopwords = STOPWORDS
    # A set gives constant-time membership checks for every word.
    blocked = frozenset(stopwords)

    return ' '.join(word for word in text.split() if word not in blocked)

# Clean every article and keep the result as a one-column DataFrame that
# retains the original column name and index.
clean_dataset = dataset['news_article'].apply(clean_text).to_frame()
clean_dataset.head()

Unnamed: 0,news_article
0,claxton hunting first major medal british hurd...
1,osullivan could run worlds sonia osullivan ind...
2,greene sets sights world title maurice greene ...
3,iaaf launches fight drugs iaaf athletics world...
4,dibaba breaks world record ethiopias tirunesh ...


In [None]:
# Load the spaCy English pipeline with the 'ner' and 'parser' components
# disabled; the cells below only read token lemmas.
nlp = spacy.load(
    'en',
    disable=['ner', 'parser'],
)

def lemmatizer(text):
    """Return ``text`` with each spaCy token replaced by its lemma.

    The text is run through the notebook-level ``nlp`` pipeline and the
    ``lemma_`` of every token is joined with single spaces.
    """
    return " ".join(token.lemma_ for token in nlp(text))

# Lemmatise the cleaned articles column-wise. Series.apply passes each cell
# straight to `lemmatizer`, avoiding the per-row Series that
# DataFrame.apply(axis=1) builds just to read a single column.
clean_dataset["text_lemmatize"] = clean_dataset["news_article"].apply(lemmatizer)
clean_dataset.head()

In [None]:
# Strip the '-PRON-' placeholder from the lemmatised text. regex=False makes
# this a literal replacement regardless of the pandas version's default.
clean_dataset['text_lemmatize_clean'] = clean_dataset['text_lemmatize'].str.replace('-PRON-', '', regex=False)

In [None]:
# Tokenise every lemmatised article on whitespace.
sentences = [article.split() for article in clean_dataset['text_lemmatize_clean']]

# Tally how often each token occurs across the whole corpus.
word_freq = defaultdict(int)
for token in (tok for tokens in sentences for tok in tokens):
    word_freq[token] += 1

# Vocabulary size (number of distinct tokens).
len(word_freq)

In [None]:
# Configure the Word2Vec model: ignore words seen fewer than 200 times, use a
# 5-word context window and 100-dimensional vectors, train on 4 worker threads.
w2v_model = Word2Vec(min_count=200, window=5, size=100, workers=4)

# Scan the corpus once to build the vocabulary before training.
w2v_model.build_vocab(sentences)

# Train for the model's configured number of epochs. `epochs` is the supported
# attribute; `iter` is only a deprecated alias for it.
w2v_model.train(sentences, total_examples=w2v_model.corpus_count, epochs=w2v_model.epochs)

# Normalise the vectors in place (replace=True discards the raw vectors).
w2v_model.init_sims(replace=True)

In [None]:
# Ten nearest neighbours of "people" in the embedding space.
w2v_model.wv.most_similar(positive=['people'], topn=10)

In [None]:
# Ten nearest neighbours of "different" in the embedding space.
w2v_model.wv.most_similar(positive=['different'], topn=10)

In [None]:
# Collect every in-vocabulary word together with its embedding vector.
# Vectors are read through `.wv`, matching the other cells; indexing the
# model object directly is deprecated.
labels = list(w2v_model.wv.vocab)
tokens = [w2v_model.wv[word] for word in labels]

# Project the embeddings down to two dimensions; random_state fixes the layout.
tsne_model = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
new_values = tsne_model.fit_transform(tokens)

# Split the projected points into separate coordinate lists.
x = [value[0] for value in new_values]
y = [value[1] for value in new_values]

plt.figure(figsize=(15, 15))
plt.title('t-SNE projection of Word2Vec embeddings')
# One scatter call per word keeps the per-point colour cycling of the plot.
for label, x_coord, y_coord in zip(labels, x, y):
    plt.scatter(x_coord, y_coord)
    plt.annotate(label, xy=(x_coord, y_coord), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')

plt.show()