In [1]:
from GoogleNews import GoogleNews

googlenews = GoogleNews()

googlenews = GoogleNews(lang='en')
googlenews = GoogleNews(encode='utf-8')



In [2]:
googlenews.get_news('senate')

In [3]:
articles = googlenews.get_links()
if len(articles)>25:
    articles = articles[:25]

In [4]:
# newspaper parser
from newspaper import Article
from newspaper import Config

# set up newspaper parser
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'
config = Config()
config.browser_user_agent = user_agent

In [5]:
def clean_text(input_string):
    """ clean the text parsed from the news articles

    :param input_string: raw plain text from article
    :return: clean_string: cleaned article text
    """
    clean_string = (input_string.translate(str.maketrans('', '', string.punctuation))).lower()
    clean_string = ' '.join(clean_string.split())
    return clean_string

In [6]:
import requests
import string

In [7]:

data = {}

for google_news_article in articles:
    google_news_article = 'http://' + google_news_article
    r = requests.get(google_news_article)
    article_url = r.url

    try:
        html = Article(article_url, config=config)
        html.download()
        html.parse()
        website_text = clean_text(html.text)
        data[article_url] = website_text
    except Exception as e:
        print(e)
        print("parse failure: ", article_url)
        website_text = float('NaN')
    
    print(f'finished with {article_url}')

print(data)  

finished with https://www.vogue.com/article/7-things-you-need-to-know-about-the-senate-runoff-races-in-georgia
finished with https://www.nbcnews.com/news/latino/what-do-u-s-senate-runoffs-georgia-have-do-puerto-n1250033
finished with https://www.brookings.edu/blog/fixgov/2020/11/16/who-will-hold-the-most-power-in-the-next-senate/
finished with https://www.nytimes.com/2020/12/06/us/politics/anti-vax-scientist-senate-hearing.html
finished with https://www.bostonglobe.com/2020/12/06/business/trump-mcconnell-are-expected-back-relief-bill-senator-says/
finished with https://www.nbcnews.com/think/opinion/georgia-s-senate-runoffs-show-democrats-need-new-message-socialism-ncna1250075
finished with https://www.npr.org/2020/11/07/932068951/senate-control-likely-decided-by-fate-of-2-georgia-runoff-races
finished with https://www.npr.org/2020/11/09/932921474/a-vexing-decision-calif-governor-mulls-who-will-replace-harris-in-senate
finished with https://www.foxnews.com/politics/georgia-runoffs-danie

In [8]:
import pandas as pd

In [9]:
df = pd.DataFrame(data.items(), columns=['url','website_text'])
df

Unnamed: 0,url,website_text
0,https://www.vogue.com/article/7-things-you-nee...,for the first time in us history two runoff ra...
1,https://www.nbcnews.com/news/latino/what-do-u-...,as georgias gop sen david perdue was forced in...
2,https://www.brookings.edu/blog/fixgov/2020/11/...,in conventional wisdom the two georgia senate ...
3,https://www.nytimes.com/2020/12/06/us/politics...,washington — a doctor who is skeptical of coro...
4,https://www.bostonglobe.com/2020/12/06/busines...,house speaker nancy pelosi and senate democrat...
5,https://www.nbcnews.com/think/opinion/georgia-...,the word socialism has been used for well over...
6,https://www.npr.org/2020/11/07/932068951/senat...,senate control likely decided by fate of 2 geo...
7,https://www.npr.org/2020/11/09/932921474/a-vex...,a vexing decision calif governor mulls who wil...
8,https://www.foxnews.com/politics/georgia-runof...,fresh off a senate campaign that fell short to...
9,https://www.nytimes.com/2020/12/05/us/senate-l...,“it makes it difficult to work in the middle w...


In [10]:
df.dropna(inplace=True)
df = df[df['website_text'].str.len() > 50]
df.shape

(24, 2)

In [11]:
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords

In [12]:

stop_words=stopwords.words('english')+list(string.punctuation) + ['“','”','–','—','’']

word_frequencies = []

for index, row in df.iterrows():
    #print(index, row['subreddit'])
    cleaneddoc=[word.lower() for word in word_tokenize(row['website_text']) if word not in stop_words]
    word_frequencies.append(FreqDist(cleaneddoc))
    #print([FreqDist(cleaneddoc)])

df['word_frequencies'] = word_frequencies

In [13]:
import gensim
import os
import pickle
import time

import numpy as np

In [14]:
filename = '../models/GoogleNews-vectors-negative300.bin.gz'
model = gensim.models.KeyedVectors.load_word2vec_format(filename, binary=True)

In [15]:
all_vectors=np.zeros((len(df),300))
total_zeros=0
# iterate through each row (document) in the dataframe
for index, row in df.iterrows(): 
    total_words_from_doc=0
    document_vector=np.zeros((300,))
    for word in row['word_frequencies'].keys():
        frequency = row['word_frequencies'][word]
        try:
            document_vector+=model.wv[word]*(frequency)
            total_words_from_doc+=frequency
        except:
            pass
            print(word)
    if total_words_from_doc>0:
        all_vectors[index]=document_vector/total_words_from_doc
    else:
        total_zeros+=1
        all_vectors[index]=document_vector
all_vectors

presidentelect
20
days—with
him—pit
ossoff
raphael
perdue
loeffler
5050
president–elect
5248
covid
14
2020
2017
2019
isakson
50
49
twoman
freeforall
ebenezer
33
bwx
covid19
sprecher
closeddoor
senate—like
—that
wnba
coowns
1996
cleland
saxby
chambliss
1992—and
17
insideradvantagefox
48
maine—where
wellfinanced
challenger—as
16
trumpian
abrams
gwinnett
razorthin
2018
800000
10
journalconstitution
georgias
perdue
ossoff
presidentelect
loeffler
ossoffand
raphael
nrsc
lindgren
11
24
2016
2020
rky
selfdetermination
ricos
alexandria
ocasiocortez
nydia
velázquez
4850
ricans
jenniffer
gonzálezcolón
prostatehood
pierluisi
federico
jesús
fdj
selfinterest
dduluth
2002
georgians
covid
97000
2017
miryam
lipper
ossoffs
nrscs
loefflers
centerright
fraga
haney
lópez
lyndon
delano
nonamerican
instagram
bidenharris
democrats—jon
ossoff
warnock—are
opponents—current
loeffler
perdue—and
georgians
5050
vote—unless
manchin
2005
2010
2018
trumpsupported
2017
—prominently
pandemicrelated
racism—not
2022
manch

array([[ 0.01951621,  0.0120675 , -0.00056542, ..., -0.03807266,
         0.01168425,  0.00249121],
       [ 0.01499967, -0.01366007,  0.02617706, ..., -0.03895775,
         0.03714606,  0.03346597],
       [ 0.03712238,  0.02816779,  0.00612958, ..., -0.02762088,
         0.03963205,  0.02147878],
       ...,
       [ 0.03021083,  0.01680851,  0.00604944, ..., -0.0663718 ,
         0.00231446,  0.02389476],
       [ 0.01705205,  0.01051885, -0.00369874, ..., -0.04075443,
         0.01420798,  0.01002707],
       [ 0.01708962,  0.01412981,  0.01626718, ..., -0.03643869,
         0.00794441,  0.00710348]])

In [16]:
all_vectors.shape

(24, 300)

In [17]:
import pickle
import xgboost


In [18]:
classifier=pickle.load(open('../models/xgboost_trained.pickle','rb'))

In [19]:
predictions = classifier.predict_proba(all_vectors)

In [24]:
df['prediction'] = predictions[:,0]

In [25]:
df

Unnamed: 0,url,website_text,word_frequencies,prediction
0,https://www.vogue.com/article/7-things-you-nee...,for the first time in us history two runoff ra...,"{'first': 5, 'time': 4, 'us': 4, 'history': 1,...",0.873749
1,https://www.nbcnews.com/news/latino/what-do-u-...,as georgias gop sen david perdue was forced in...,"{'georgias': 1, 'gop': 3, 'sen': 2, 'david': 1...",0.972076
2,https://www.brookings.edu/blog/fixgov/2020/11/...,in conventional wisdom the two georgia senate ...,"{'conventional': 1, 'wisdom': 1, 'two': 9, 'ge...",0.998997
3,https://www.nytimes.com/2020/12/06/us/politics...,washington — a doctor who is skeptical of coro...,"{'washington': 1, 'doctor': 1, 'skeptical': 1,...",0.072228
4,https://www.bostonglobe.com/2020/12/06/busines...,house speaker nancy pelosi and senate democrat...,"{'house': 3, 'speaker': 1, 'nancy': 1, 'pelosi...",0.999189
5,https://www.nbcnews.com/think/opinion/georgia-...,the word socialism has been used for well over...,"{'word': 1, 'socialism': 14, 'used': 2, 'well'...",0.679324
6,https://www.npr.org/2020/11/07/932068951/senat...,senate control likely decided by fate of 2 geo...,"{'senate': 15, 'control': 4, 'likely': 1, 'dec...",0.924901
7,https://www.npr.org/2020/11/09/932921474/a-vex...,a vexing decision calif governor mulls who wil...,"{'vexing': 3, 'decision': 4, 'calif': 1, 'gove...",0.693541
8,https://www.foxnews.com/politics/georgia-runof...,fresh off a senate campaign that fell short to...,"{'fresh': 1, 'senate': 5, 'campaign': 2, 'fell...",0.923061
9,https://www.nytimes.com/2020/12/05/us/senate-l...,“it makes it difficult to work in the middle w...,"{'makes': 1, 'difficult': 1, 'work': 2, 'middl...",0.998234


In [34]:
df[df['prediction'] > 0.5].loc[0].url

'https://www.vogue.com/article/7-things-you-need-to-know-about-the-senate-runoff-races-in-georgia'

In [35]:
df[df['prediction'] < 0.5].loc[0].url

KeyError: 0

In [45]:
df[df['prediction'] < 0.5]['url'].iloc[0]

'https://www.nytimes.com/2020/12/06/us/politics/anti-vax-scientist-senate-hearing.html'