# Creating summed word2vec and doc2vec representations

Uses data from every news dump created so far

In [1]:
import re
import json
import torch
import pickle
import nltk

import numpy as np
import pandas as pd

from os import listdir
from importlib import reload
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ozzy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ozzy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Load the news corpus, clean and prepare sentences

In [3]:
# Retrieve a list of Json corpus files so far
files = [x for x in listdir("./output") if x.endswith(".json") and ("corpus" in x)]

In [4]:
# Load all of the news corpus files
articles = []
for file in files:
    with open("./output/"+file, "r") as f:
        dump = json.load(f)
        articles = articles + list(dump)

In [5]:
articles[0:2]

[{'date': 'Wed, 29 May 2019 17:27:58 GMT',
  'link': 'https://www.bbc.co.uk/news/world-us-canada-48450534',
  'retrieval_timestamp': '2019-05-29 21:02:30.743862',
  'source_url': 'http://feeds.bbci.co.uk/news/world/rss.xml',
  'summary': 'The special counsel said legal guidelines meant he was unable to charge a sitting president.',
  'title': 'Robert Mueller: Charging Trump was not an option'},
 {'date': 'Wed, 29 May 2019 14:45:39 GMT',
  'link': 'https://www.bbc.co.uk/news/world-asia-48441604',
  'retrieval_timestamp': '2019-05-29 21:02:30.743862',
  'source_url': 'http://feeds.bbci.co.uk/news/world/rss.xml',
  'summary': 'The schoolgirl was set on fire after filing a sexual harassment complaint against her principal.',
  'title': 'Nusrat Jahan Rafi: 16 charged in Bangladesh for burning girl alive'}]

In [6]:
def clean_text(article_text):
    """ Utility function for cleaning up text for me.  There's probably better ways to prepare data. """
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  # Gets rid of numbers
    article_text = re.sub(r'\s+', ' ', article_text)         # Replaces all forms of white space with single space
    #article_text = re.sub(r'"', '', article_text)            # Removes quotation marks
    
    return(article_text)

In [7]:
clean_articles = []
source_urls = []

for article in articles:
    clean_articles.append(clean_text(article['title'] + ". " + article['summary']))
    source_urls.append(article['source_url'])

## Set up InferSent Word2Vec Model Malarky

In [None]:
from embedding_models import InferSentModel

In [None]:
infersent = InferSentModel(sentences=clean_articles, labels=clean_articles)

In [None]:
embeddings = infersent.get_embeddings()

In [None]:
# Save the embeddings for later, so I don't have to regenerate them every time
with open("./output/sentence_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

## Set up Summed Word Vectors Model Malarky

In [8]:
from embedding_models import GloveWordModel

In [9]:
glove = GloveWordModel(sentences=clean_articles, labels=clean_articles)

[nltk_data] Downloading package stopwords to /home/ozzy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
embeddings = glove.get_embeddings()

In [11]:
# Save the embeddings for later, so I don't have to regenerate them every time
with open("./output/word_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

## Experiment with interesting story extraction!

Possibly less of an issue for other model methods:  An unexpected consequence of using the InferSent2 model is that it produces really quite big vectors of length 4096.  Possibly a PCA step to reduce this would be wise?

In [12]:
with open("./output/word_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

In [28]:
labels = list(embeddings.keys())
embeddings_array = np.asarray(list(embeddings.values()))
embeddings_array.shape

(8136, 100)

In [29]:
labels[0:10]

['Robert Mueller: Charging Trump was not an option. The special counsel said legal guidelines meant he was unable to charge a sitting president.',
 'Nusrat Jahan Rafi: 16 charged in Bangladesh for burning girl alive. The schoolgirl was set on fire after filing a sexual harassment complaint against her principal.',
 'Tankers almost certainly damaged by Iranian naval mines, US says. National Security Adviser John Bolton blames Iran for attacks off the UAE, but provides no evidence.',
 'Huawei: US blacklist will harm billions of consumers. Huawei says the US is "using the strength of an entire nation to come after a private company".',
 'Growing crops in the shadow of Fukushima. Eight years on from the nuclear disaster, some have chosen to return to the small town of Okuma in Japan.',
 'Niki Lauda: F1 stars attend Mass for late racing legend in Austria. They were among thousands paying tributes to the late Austrian driver at a Mass in Vienna.',
 'Hells Angels bikers banned by Netherlands 

In [34]:
# Reducing the embedding's dimensionality to see if that helps
pca = PCA(n_components=50)
embeddings_reduced = pca.fit_transform(embeddings_array)
embeddings_reduced.shape

(8136, 50)

In [35]:
# Spawn an isolation forest.  I want it to tell me what it's doing, and use a few hundred estimators
# given the large parameter space over which it's forced to operate.
isoforest = IsolationForest(n_estimators=400, verbose=1)
isoforest.fit(embeddings_reduced)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    3.3s finished


IsolationForest(behaviour='old', bootstrap=False, contamination='legacy',
                max_features=1.0, max_samples='auto', n_estimators=400,
                n_jobs=None, random_state=None, verbose=1, warm_start=False)

In [37]:
results = pd.DataFrame({"labels":labels,
                        "anomaly_score": isoforest.score_samples(embeddings_reduced),
                        "anomaly_predicted": isoforest.predict(embeddings_reduced),
                        })



In [38]:
# Anomalous news, most weird by embedding first
results[results['anomaly_predicted']==-1]\
    [['labels', 'anomaly_score', 'anomaly_predicted']].\
    drop_duplicates().\
    sort_values("anomaly_score", ascending=True).\
    head(n=20)

Unnamed: 0,labels,anomaly_score,anomaly_predicted
4293,"More than 2,000 infected with Ebola in Congo a...",-0.594878,-1
1387,Some of the best restaurants and pubs in Sussex.,-0.594132,-1
7960,Eid date: When is Eid al Fitr in USA? Has the ...,-0.587328,-1
275,A coup against corruption in Romania.,-0.586002,-1
6829,Heavy rainfall and thunderstorms predicted thi...,-0.578392,-1
7483,Qatar PM to attend Saudi Arabia summit - Al Ja...,-0.576391,-1
6454,Weather warning issued as Sussex braces itself...,-0.571338,-1
226,Women Acquitting Themselves Well.,-0.570077,-1
7959,Eid Mubarak: Islamic prayer times - Eid al Fit...,-0.568927,-1
6399,Defending champions USA beat Chile 3-0. Defend...,-0.567695,-1


In [39]:
# News stories, non-anomalous
results[results['anomaly_predicted']==1]\
    [['labels', 'anomaly_score', 'anomaly_predicted']].\
    drop_duplicates().\
    sort_values("anomaly_score", ascending=False).\
    head(n=20)

Unnamed: 0,labels,anomaly_score,anomaly_predicted
1103,Two women 'beat up' a Burger King restaurant m...,-0.362615,1
3265,"Brit stag-do tourist, 35, DIES after being bru...",-0.363042,1
4390,'Historic Partners': Anti-Trump Protests Fail ...,-0.363966,1
4509,London mayor trolls Trump: He doesn’t deserve ...,-0.364107,1
6577,Young daughter breaks down in public appeal fo...,-0.364857,1
4520,UK mobilizes to make sure Trump knows he ‘is n...,-0.365121,1
6127,Teen ‘killed her best friend after man she met...,-0.365357,1
750,"Shocking moment Brit stag-do tourist, 35, is b...",-0.365611,1
3268,Husband turns up at a police station with his ...,-0.366069,1
2360,Brits express anxiety that Trump will cause ma...,-0.36617,1


## Using similarity to extract relevant articles

In [None]:
labels = ["a", "b", "c"]
sss = np.array([[1, 2, 3],
                [4, 5, 6],
                [7, 8, 9]])