# All the News 
## Data Analysis for Political Data Science
by Nico Hertel

Data is obtained from https://www.kaggle.com/snapcrack/all-the-news

## Training word embeddings
Preparing the texts for training

In [103]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time
import numpy as np
import gensim
import pickle
from nltk import word_tokenize
%matplotlib inline

In [77]:
# Reading the Data
articles1 = pd.read_csv('articles1.csv')
articles2 = pd.read_csv('articles2.csv')
articles3 = pd.read_csv('articles3.csv')
articles = pd.concat([articles1, articles3, articles3])
articles.info()
del(articles1, articles2, articles3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 135142 entries, 0 to 42570
Data columns (total 10 columns):
Unnamed: 0     135142 non-null int64
id             135142 non-null int64
title          135140 non-null object
publication    135142 non-null object
author         126892 non-null object
date           135112 non-null object
year           135112 non-null float64
month          135112 non-null float64
url            85142 non-null object
content        135142 non-null object
dtypes: float64(2), int64(2), object(6)
memory usage: 11.3+ MB


In [94]:
# Dict for storing the texts:
texts = dict()
papers = list(articles['publication'].unique())

In [95]:
papers.remove('Business Insider')
papers.remove('Guardian')
papers.remove('Atlantic')
papers

['New York Times',
 'Breitbart',
 'CNN',
 'NPR',
 'Reuters',
 'Vox',
 'Washington Post']

In [109]:
# Extract text from articles
for paper in papers:
    text = []
    content = articles[articles['publication'] == paper]['content'].values
    for article in content:
        token = word_tokenize(article)
        token = [word.lower() for word in token if word.isalpha()]
        text.append(token)
    texts[paper] = text

In [112]:
savename = time.strftime("Models/%d%m%y_%H%M%S_")
pickle.dump(model, open(savename + 'texts.pickle', 'wb'))

## Creating the Models

In [126]:
models = dict()
savename = time.strftime("Models/%d%m%y_%H%M%S_")

In [127]:
for paper in papers:
    print('Creating model for %s' % paper)
    model = gensim.models.Word2Vec(texts[paper], size=150, window=10, min_count=5)
    print('Finished creating model, moving to training')
    model.train(token, total_examples=len(token), epochs=10)
    print('Finished training model on %s' % paper)
    models[paper] = model
    model.wv.save_word2vec_format(savename + paper + '.txt', binary=False)
print('Finished Training')

Creating model for New York Times
Finished creating model, moving to training
Finished training model on New York Times
Creating model for Breitbart
Finished creating model, moving to training
Finished training model on Breitbart
Creating model for CNN
Finished creating model, moving to training
Finished training model on CNN
Creating model for NPR
Finished creating model, moving to training
Finished training model on NPR
Creating model for Reuters
Finished creating model, moving to training
Finished training model on Reuters
Creating model for Vox
Finished creating model, moving to training
Finished training model on Vox
Creating model for Washington Post
Finished creating model, moving to training
Finished training model on Washington Post
Finished Training


## Playing around with the models
Having seven trained word2vec models, it't time to start playing around and exploring the passabilities. Lets start by looking how the current president of the United States, Donald Trup, is embedded in the different news organisations

In [185]:
# Loading the Models
models = dict()
papers = ['New York Times',
     'Breitbart',
     'CNN',
     'NPR',
     'Reuters',
     'Vox',
     'Washington Post']
savename = 'Models/141118_121502_'
for paper in papers:
    models[paper] = gensim.models.KeyedVectors.load_word2vec_format(savename + paper + '.txt',
                                                                    binary=False)

In [154]:
def extract_words(most_similar):
    '''
    Extracts only the words from the most similar function of gensim

    Parameters
    ----------
    most_similar : list
        Sequence of (word, similarity) (Returned by gensim.wv.most_similar)

    Returns
    -------
    words : list
        List of the 'word's in most_similar
    '''
    words = []
    for entry in most_similar:
        words.append(entry[0])
    return words

In [192]:
def show_context(word, models, topn=10):
    '''
    Creates a DataFrame of the context of the passed word

    Parameters
    ----------
    word : str
        The word to be analyzed
    models : dict
        A dictionary containing trained Word2Vec models
    topn : int (default 10)
        How many similar words should be returned

    Returns
    -------
    context : DataFrame
        A collection of the closesed words in a DataFrame
    '''
    topn = int(topn)
    context = pd.DataFrame(columns=list(models.keys()))
    for paper in models:
        if word not in models[paper].vocab:
            print('Error! Word %s not in vocabulary of %s' % (word, paper))
        else:
            words = extract_words(models[paper].most_similar(positive=word, topn=n))
            context[paper] = words
    return context

In [193]:
# Looking for context of 'donald'
show_context('donald', models)

Unnamed: 0,New York Times,Breitbart,CNN,NPR,Reuters,Vox,Washington Post
0,melania,tower,melania,melania,alassane,tower,melania
1,presumptive,ivanka,ebell,colluding,merk,melania,tower
2,presidency,melania,unfit,thundered,petro,cruz,dionne
3,pence,steaks,unqualified,barasso,joko,ivanka,cruz
4,ivanka,winery,surrogates,ivana,barack,birtherism,presumptive
5,turnberry,supporters,crooked,steaks,maithripala,alterman,speakerryan
6,cruz,hopeful,tweeting,birtherism,bashar,nieto,ivanka
7,tower,egocentric,teleprompters,phony,muhammadu,musket,dana
8,supporter,cruz,ivanka,ivanka,tayyip,presumptive,stahl
9,ivana,romney,loyalty,presidency,nicolas,supporters,riffed


In [194]:
# Looking for context of 'trump'
show_context('trump', models)

Unnamed: 0,New York Times,Breitbart,CNN,NPR,Reuters,Vox,Washington Post
0,candidacy,cruz,rhetoric,rubio,obama,cruz,candidacy
1,pence,kasich,pence,obama,sanders,rubio,rubio
2,sanders,candidacy,kaine,conway,clinton,candidacy,candidate
3,nominee,romney,cruz,spicer,rubio,obama,priebus
4,rumsfeld,rubio,obama,pence,cruz,he,romney
5,obama,he,conway,sanders,rhetoric,bannon,sanders
6,clinton,gop,kasich,tusk,nominee,sanders,he
7,romney,realdonaldtrump,spicer,carson,candidate,rhetoric,rhetoric
8,bannon,fiorina,rubio,clinton,nomination,santorum,conway
9,candidate,sanders,candidacy,cruz,conway,presidency,cruz


In [195]:
# Looking for context of 'election'
show_context('election', models)

Unnamed: 0,New York Times,Breitbart,CNN,NPR,Reuters,Vox,Washington Post
0,elections,elections,elections,elections,elections,elections,elections
1,race,primary,electorate,primaries,race,primaries,race
2,debates,primaries,candidate,campaign,candidate,primary,primary
3,primaries,race,debates,candidate,vote,victory,contest
4,primary,contest,primaries,primary,primaries,electorate,primaries
5,polls,electoral,primary,voter,presidency,candidate,electoral
6,vote,victory,polls,race,referendum,caucuses,electorate
7,candidate,landslide,outcome,caucuses,inauguration,presidency,candidate
8,conventions,candidate,vote,referendum,ballot,polling,nomination
9,campaign,debates,race,vote,electoral,polls,caucuses
