# Creating summed word2vec and doc2vec representations

Uses data from every news dump created so far.

In [1]:
import re
import json
import torch
import pickle
import nltk

import numpy as np
import pandas as pd

from os import listdir
from importlib import reload
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from gensim.models import Doc2Vec
from sklearn.ensemble import IsolationForest
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Fetch the NLTK 'punkt' and 'stopwords' packages; NLTK reports when a package is already up to date.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/ozzy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ozzy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [15]:
def clean_text(article_text):
    """Normalise raw article text.

    Two regex substitutions are applied in order:

    1. Bracketed digit runs such as ``[12]`` (and the empty ``[]``) are
       replaced by a single space. Digits outside brackets are left alone.
    2. Every run of whitespace (spaces, tabs, newlines) is collapsed to a
       single space.

    The result is not stripped, so leading or trailing whitespace survives
    as one space. Quotation marks are kept.
    """
    substitutions = (
        (r'\[[0-9]*\]', ' '),   # bracketed numbers, e.g. citation markers
        (r'\s+', ' '),          # any run of whitespace
    )
    for pattern, replacement in substitutions:
        article_text = re.sub(pattern, replacement, article_text)
    return article_text

## Load the news corpus, clean and prepare sentences

In [11]:
# Read the RSS feed list and rename its "url" column to "source_url",
# the column name used when joining it onto the articles.
source_list = (
    pd.read_csv("rss_urls.csv")
    .rename(columns={"url": "source_url"})
)

In [3]:
# Retrieve a list of JSON corpus files so far.
# listdir returns names in arbitrary order, so sort them to keep the article
# order (and therefore the DataFrame row indices) reproducible between runs.
files = sorted(x for x in listdir("./output") if x.endswith(".json") and ("corpus" in x))

In [4]:
# Load all of the news corpus files into one flat list of article records.
articles = []
for file_name in files:
    # JSON text is UTF-8 by specification; do not rely on the platform default encoding.
    with open("./output/" + file_name, "r", encoding="utf-8") as f:
        dump = json.load(f)
    # extend() appends in place; `articles = articles + list(dump)` re-copied
    # the whole accumulated list for every file.
    articles.extend(dump)

In [19]:
# Tabulate the article records loaded from the corpus dumps.
data = pd.DataFrame(articles)

In [20]:
# Attach the feed metadata from source_list to each article, matching on
# source_url; how="left" keeps every article row even without a match.
data = data.merge(source_list, how="left", on="source_url")

In [30]:
# Build the cleaned "<title>. <summary>" text for every article.
# Missing titles/summaries are treated as empty strings: str.join raises
# TypeError on NaN/None, which would abort the whole apply.
data['clean_text'] = (
    data[['title', 'summary']]
    .fillna('')
    .apply(lambda row: clean_text('. '.join(row)), axis=1)
)

In [33]:
# Sanity check: (rows, columns) of the assembled article table.
data.shape

(15544, 8)

In [40]:
# Read the stored sentence embeddings. The pickle holds a mapping: its keys
# become the "clean_text" column and its values the "embeddings" column.
# NOTE(review): pickle.load can execute arbitrary code — only open files this project produced.
with open("./output/sentence_embeddings.pkl", "rb") as f:
    embeddings = pickle.load(f)

embeddings_df = pd.DataFrame(
    {
        "clean_text": [*embeddings.keys()],
        "embeddings": [*embeddings.values()],
    }
)

Unnamed: 0,clean_text,embeddings
0,Robert Mueller: Charging Trump was not an opti...,"[0.07501952, 0.008237855, 0.04390721, 0.004687..."
1,Nusrat Jahan Rafi: 16 charged in Bangladesh fo...,"[0.011178329, 0.053766962, 0.08121284, 0.04253..."
2,Tankers almost certainly damaged by Iranian na...,"[0.06743715, 0.08942383, 0.07022511, 0.0506297..."
3,Huawei: US blacklist will harm billions of con...,"[0.05050419, 0.03701058, 0.03597484, 0.0102925..."
4,Growing crops in the shadow of Fukushima. Eigh...,"[0.025309468, 0.06599146, 0.050482944, 0.0, 0...."


In [41]:
# Look up each article's embedding through its clean_text key; how="left"
# keeps every article row, leaving NaN where no embedding was stored.
data = data.merge(embeddings_df, how="left", on="clean_text")
data.head()

Unnamed: 0,date,link,retrieval_timestamp,source_url,summary,title,type,clean_text,embeddings
0,"Wed, 29 May 2019 17:27:58 GMT",https://www.bbc.co.uk/news/world-us-canada-484...,2019-05-29 21:02:30.743862,http://feeds.bbci.co.uk/news/world/rss.xml,The special counsel said legal guidelines mean...,Robert Mueller: Charging Trump was not an option,world,Robert Mueller: Charging Trump was not an opti...,"[0.07501952, 0.008237855, 0.04390721, 0.004687..."
1,"Wed, 29 May 2019 14:45:39 GMT",https://www.bbc.co.uk/news/world-asia-48441604,2019-05-29 21:02:30.743862,http://feeds.bbci.co.uk/news/world/rss.xml,The schoolgirl was set on fire after filing a ...,Nusrat Jahan Rafi: 16 charged in Bangladesh fo...,world,Nusrat Jahan Rafi: 16 charged in Bangladesh fo...,"[0.011178329, 0.053766962, 0.08121284, 0.04253..."
2,"Wed, 29 May 2019 10:33:22 GMT",https://www.bbc.co.uk/news/world-middle-east-4...,2019-05-29 21:02:30.743862,http://feeds.bbci.co.uk/news/world/rss.xml,National Security Adviser John Bolton blames I...,Tankers almost certainly damaged by Iranian na...,world,Tankers almost certainly damaged by Iranian na...,"[0.06743715, 0.08942383, 0.07022511, 0.0506297..."
3,"Wed, 29 May 2019 07:49:20 GMT",https://www.bbc.co.uk/news/business-48441814,2019-05-29 21:02:30.743862,http://feeds.bbci.co.uk/news/world/rss.xml,"Huawei says the US is ""using the strength of a...",Huawei: US blacklist will harm billions of con...,world,Huawei: US blacklist will harm billions of con...,"[0.05050419, 0.03701058, 0.03597484, 0.0102925..."
4,"Tue, 28 May 2019 23:02:53 GMT",https://www.bbc.co.uk/news/world-asia-48433222,2019-05-29 21:02:30.743862,http://feeds.bbci.co.uk/news/world/rss.xml,"Eight years on from the nuclear disaster, some...",Growing crops in the shadow of Fukushima,world,Growing crops in the shadow of Fukushima. Eigh...,"[0.025309468, 0.06599146, 0.050482944, 0.0, 0...."


## Experiment with interesting story extraction!

In this case we try to extract interesting stories by using cosine similarity and the PageRank algorithm to find the stories that are least similar to those in the sensible "world" RSS feeds.

In [75]:
# Get the "average" array for "world" stories, shaped (1, n_features) because
# cosine_similarity expects 2-D inputs.
# Rows without an embedding (NaN from the left merge on clean_text) are dropped
# first so the stacked array stays rectangular.
world_embeddings = data.loc[data['type'] == "world", 'embeddings'].dropna().drop_duplicates()
world_array = np.mean(np.asarray(list(world_embeddings)), axis=0).reshape(1, -1)


def _world_similarity(embedding):
    """Return the cosine similarity between one story embedding and world_array.

    Returns NaN when the story has no embedding (None or NaN), instead of
    failing on the missing ``reshape`` attribute.
    """
    if embedding is None or (isinstance(embedding, float) and np.isnan(embedding)):
        return np.nan
    # cosine_similarity returns a (1, 1) array here; index the scalar explicitly,
    # since float() on a 1-element 1-D array is deprecated in recent NumPy.
    return float(cosine_similarity(world_array, embedding.reshape(1, -1))[0, 0])


data['world_similarity'] = data['embeddings'].apply(_world_similarity)

In [79]:
# Show the distinct stories ordered from least to most similar to the "world" average.
(
    data[['clean_text', 'world_similarity']]
    .sort_values("world_similarity", ascending=True)
    .drop_duplicates()
)

Unnamed: 0,clean_text,world_similarity
4953,Picture Of The Day.,0.492969
15262,"Muslims, Islam and Ramadan.",0.513591
343,Correction: Trump-Japan story. Correction: Tru...,0.530321
13692,Super cars: Mooragh Park.,0.560429
8042,The Hypersonic Arms Race.,0.567377
11201,Binyavanga Wainaina obituary.,0.579315
1275,Automatic number plate recognition.,0.583151
14216,Countering a Resurgent Russia.,0.589116
4950,Sending A Message To Iran.,0.610975
4396,MLC resigns government department position.,0.622134


In [None]:
# Save the scored articles for inspection outside the notebook (the DataFrame index is written as the first column).
# NOTE(review): the 'embeddings' column holds array objects, which CSV can only store as text — confirm that is intended.
data.to_csv("temp_cosine_results.csv")