# Creating summed word2vec and doc2vec representations

Uses data from every news dump created so far

In [50]:
import re
import json
import torch
import pickle
import nltk

import numpy as np
import pandas as pd

from os import listdir
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [51]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ozzy/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/ozzy/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Load the news corpus, clean and prepare sentences

In [2]:
# Retrieve a list of Json corpus files so far
files = [x for x in listdir("./output") if x.endswith(".json") and ("corpus" in x)]

In [3]:
# Load all of the news corpus files
articles = []
for file in files:
    with open("./output/"+file, "r") as f:
        dump = json.load(f)
        articles = articles + list(dump)

In [4]:
articles[0:2]

[{'date': 'Wed, 29 May 2019 17:27:58 GMT',
  'link': 'https://www.bbc.co.uk/news/world-us-canada-48450534',
  'retrieval_timestamp': '2019-05-29 21:02:30.743862',
  'source_url': 'http://feeds.bbci.co.uk/news/world/rss.xml',
  'summary': 'The special counsel said legal guidelines meant he was unable to charge a sitting president.',
  'title': 'Robert Mueller: Charging Trump was not an option'},
 {'date': 'Wed, 29 May 2019 14:45:39 GMT',
  'link': 'https://www.bbc.co.uk/news/world-asia-48441604',
  'retrieval_timestamp': '2019-05-29 21:02:30.743862',
  'source_url': 'http://feeds.bbci.co.uk/news/world/rss.xml',
  'summary': 'The schoolgirl was set on fire after filing a sexual harassment complaint against her principal.',
  'title': 'Nusrat Jahan Rafi: 16 charged in Bangladesh for burning girl alive'}]

In [5]:
def clean_text(article_text):
    """ Utility function for cleaning up text for me.  There's probably better ways to prepare data. """
    article_text = re.sub(r'\[[0-9]*\]', ' ', article_text)  # Gets rid of numbers
    article_text = re.sub(r'\s+', ' ', article_text)         # Replaces all forms of white space with single space
    #article_text = re.sub(r'"', '', article_text)            # Removes quotation marks
    
    return(article_text)

In [15]:
clean_articles = []
source_urls = []

for article in articles:
    clean_articles.append(clean_text(article['title'] + ". " + article['summary']))
    source_urls.append(article['source_url'])

In [7]:
clean_articles[0:10]

['Robert Mueller: Charging Trump was not an option. The special counsel said legal guidelines meant he was unable to charge a sitting president.',
 'Nusrat Jahan Rafi: 16 charged in Bangladesh for burning girl alive. The schoolgirl was set on fire after filing a sexual harassment complaint against her principal.',
 'Tankers almost certainly damaged by Iranian naval mines, US says. National Security Adviser John Bolton blames Iran for attacks off the UAE, but provides no evidence.',
 'Huawei: US blacklist will harm billions of consumers. Huawei says the US is "using the strength of an entire nation to come after a private company".',
 'Growing crops in the shadow of Fukushima. Eight years on from the nuclear disaster, some have chosen to return to the small town of Okuma in Japan.',
 'Niki Lauda: F1 stars attend Mass for late racing legend in Austria. They were among thousands paying tributes to the late Austrian driver at a Mass in Vienna.',
 'Hells Angels bikers banned by Netherlands 

## Set up InferSent Word2Vec Model Malarky

In [39]:
from InferSent.models import InferSent
V = 2
MODEL_PATH = './InferSent/encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

In [40]:
W2V_PATH = './InferSent/dataset/fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [41]:
infersent.build_vocab(clean_articles, tokenize=True)

Found 25599(/27624) words with w2v vectors
Vocab size : 25599


In [49]:
embeddings = infersent.encode(clean_articles, tokenize=True)

In [50]:
# Save the embeddings for later, so I don't have to regenerate them every time
with open("./output/sentence_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

## Experiment with interesting story extraction!

Possibly less of an issue for other model methods:  An unexpected consequence of using the InferSent2 model is that it produces really quite big vectors of length 4096.  Possibly a PCA step to reduce this would be wise?

In [10]:
with open("./output/sentence_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

In [11]:
embeddings.shape

(15544, 4096)

In [44]:
# Reducing the embedding's dimensionality to see if that helps
pca = PCA(n_components=50)
embeddings_reduced = pca.fit_transform(embeddings)
embeddings_reduced.shape

# Spawn an isolation forest.  I want it to tell me what it's doing, and use a few hundred estimators
# given the large parameter space over which it's forced to operate.
isoforest = IsolationForest(n_estimators=300, verbose=1)
isoforest.fit(embeddings_reduced)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.6s finished


IsolationForest(behaviour='old', bootstrap=False, contamination='legacy',
                max_features=1.0, max_samples='auto', n_estimators=300,
                n_jobs=None, random_state=None, verbose=1, warm_start=False)

In [46]:
results = pd.DataFrame(articles)
results['anomaly_score'] = isoforest.score_samples(embeddings_reduced)
results['anomaly_predicted'] = isoforest.predict(embeddings_reduced)
results['clean_text'] = clean_articles



In [47]:
# Anomalous news, most weird by embedding first
results[results['anomaly_predicted']==-1]\
    [['clean_text', 'anomaly_score', 'anomaly_predicted']].\
    drop_duplicates().\
    sort_values("anomaly_score", ascending=True).\
    head(n=20)

Unnamed: 0,clean_text,anomaly_score,anomaly_predicted
3111,LICENSING ACT 2003 APPLICATION FOR A PREMISES ...,-0.574346,-1
1423,Win a family VIP ticket to Kynren.,-0.562457,-1
11201,Binyavanga Wainaina obituary.,-0.545477,-1
14552,Eid al Fitr moon sighting: Has the Shawwal moo...,-0.545039,-1
2912,Cambridgeshire County Council plans plastic-fr...,-0.543537,-1
2194,Swedish bid to extradite Julian Assange is dea...,-0.539314,-1
8060,Russian Su-27 Fighter Intercepts U.S. Air Forc...,-0.533968,-1
2037,Julian Assange extradition to Sweden in doubt ...,-0.533444,-1
14553,Eid Mubarak: Islamic prayer times - Eid al Fit...,-0.533055,-1
3479,Bali volcano spews ash in new eruption.,-0.532623,-1


In [48]:
# News stories, non-anomalous
results[results['anomaly_predicted']==1]\
    [['clean_text', 'anomaly_score', 'anomaly_predicted']].\
    drop_duplicates().\
    sort_values("anomaly_score", ascending=False).\
    head(n=20)

Unnamed: 0,clean_text,anomaly_score,anomaly_predicted
8891,Six Flags worker was told to go home over pier...,-0.370994,1
335,The ‘#MeToo hurricane’: Rape case divides a Ru...,-0.371247,1
15106,Jersey man jailed for life after murdering his...,-0.371736,1
608,How innocent fall while playing uncovered litt...,-0.373064,1
2612,This blast from the past shows how much people...,-0.374038,1
7322,Pictures from downtown LA capture the problem ...,-0.375823,1
2675,"Woman, 78, is raped in Queens as police offer ...",-0.375954,1
2184,Sick crimes of one of world's worst killers wh...,-0.376238,1
7344,Love Island race row as Anton Danyluk is pictu...,-0.376265,1
8980,"Software engineer, 27, successfully sues for s...",-0.376606,1


## Using similarity to extract relevant articles