In [22]:
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns

In [3]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"
tf.logging.set_verbosity(tf.logging.ERROR)
np.random.seed(6)

In [39]:
def open_csvs(paths):
    final = pd.read_csv(paths[0], lineterminator='\n')
    final['source'] = re.findall(r".*/(.*)\.csv", paths[0])[0]
    for i in range(1, len(paths)):
        tmp = pd.read_csv(paths[i], lineterminator='\n')
        tmp['source'] = re.findall(r".*/(.*)\.csv", paths[i])[0]
        final = pd.concat([final, tmp], axis=0, ignore_index=True)
    del tmp
    final = final.drop(labels='Unnamed: 0', axis=1)
    return final

In [40]:
paths = ['clean-news-data/wapo.csv', 'clean-news-data/chicago.csv', 'clean-news-data/fox.csv', 'clean-news-data/breitbart.csv']
data = open_csvs(paths)

In [43]:
data.head()

Unnamed: 0,source,text,title,url
0,wapo,Eric Reid appeared to be one of the most relia...,NFL players’ union files grievance against Ben...,https://www.washingtonpost.com/news/sports/wp/...
1,wapo,Desktop notifications are on | Turn off Get ...,Business - The Washington Post,https://www.washingtonpost.com/business/
2,wapo,Perspective Interpretation of the news based ...,New OPM chief presses for civil service overha...,https://www.washingtonpost.com/news/powerpost/...
3,wapo,SAN FRANCISCO – Google on Tuesday plans to wad...,"At Google, ‘responsibility’ upstages new techn...",https://www.washingtonpost.com/news/the-switch...
4,wapo,"Stand-up comics, famously, do not like heckler...",A woman had a seizure at Ken Jeong’s comedy sh...,https://www.washingtonpost.com/news/arts-and-e...


In [51]:
text = data['text']

In [53]:
text.values

array(['Eric Reid appeared to be one of the most reliable defensive backs on this year’s NFL free agent market, at least on paper. But he’s still looking for a job, and his continued unemployment has prompted a second grievance filed against the NFL. The NFL Players Association filed a noninjury grievance related to Reid’s free agency Monday, the union said in a statement. It marks the union’s most formal defense yet for players who have protested\xa0racial injustice and police brutality during the playing of the national anthem. The grievance is related to questions the safety has been asked about his plans to continue kneeling during the anthem next season. “According to our information, a club appears to have based its decision not to sign a player based on the player’s statement that he would challenge the implementation of a club’s policy prohibiting demonstration, which is contrary to League policy,” the NFLPA said in a statement. The grievance is connected to Reid’s only known f

In [54]:
documents = text.values

In [80]:
skip = []
for i in range(len(documents)):
    try:
        documents[i] = documents[i].replace('\n', ' ') 
        documents[i] = documents[i].replace('\r', ' ') 
        if documents[i] == np.nan:
            skip.append(i)
    except:
        skip.append(i)
docs = np.delete(documents, skip, axis=0)

## Universal Sentence Encoder embeddings

In [81]:
embed = hub.Module(module_url)
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    embeddings = session.run(embed(docs))

Exception ignored in: <bound method BaseSession.__del__ of <tensorflow.python.client.session.Session object at 0x1229f4278>>
Traceback (most recent call last):
  File "/Users/alexchan/anaconda3/envs/MLKart/lib/python3.6/site-packages/tensorflow/python/client/session.py", line 707, in __del__
    tf_session.TF_DeleteSession(self._session)
KeyboardInterrupt: 


In [66]:
np.savetxt("news_embeds.tsv", embeddings, delimiter="\t")

In [68]:
titles = data['title']
titles = titles.values
titles = np.delete(titles, skip, axis=0)
np.savetxt("news_titles.tsv", titles, fmt='%s')

In [70]:
source = data['source']
source = source.values
source = np.delete(source, skip, axis=0)
np.savetxt("news_source.tsv", source, fmt='%s')

In [71]:
documents.shape

(20006,)

## LDA

In [82]:
import csv
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation, NMF
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [83]:
def preprocess(sent):
    string = ''
    for word in sent.split(' '): 
        word = word.lower()
        if word.find('https') == -1 and word.find('amp') == -1:
            curr = ps.stem(word)
            string += curr + ' '
    return string

In [84]:
ps = PorterStemmer()

In [87]:
no_features = 1000

tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english', preprocessor=preprocess)
tfidf = tfidf_vectorizer.fit_transform(docs)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()

# LDA can only use raw term counts for LDA because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english', preprocessor=preprocess)
tf = tf_vectorizer.fit_transform(docs)
tf_feature_names = tf_vectorizer.get_feature_names()

In [88]:
no_topics = 20

nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

lda = LatentDirichletAllocation(n_topics=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0).fit(tf)



In [89]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(nmf, tfidf_feature_names, no_top_words)

Topic 0:
wa like thi don just know peopl want think make
Topic 1:
click video sign help comment post florida watch featur car
Topic 2:
trump presid donald white hi hous president said obama ha
Topic 3:
breitbart facebook comment com articl page count reflect visit thi
Topic 4:
percent compani said ha data year million market new health
Topic 5:
iran nuclear deal iranian agreement saudi weapon obama program syria
Topic 6:
fox news podcast host radio hour com free everi chri
Topic 7:
wa hi polic said report sexual year old offic told
Topic 8:
desktop turn alert break washington post news newslett pleas check
Topic 9:
game team season hi play player coach nfl draft win
Topic 10:
clinton fbi comey hillari investig depart mueller email justic russia
Topic 11:
china korea north chines kim trade korean south war state
Topic 12:
chicago post free book recent add limit park 00 offer
Topic 13:
climat ice research scientist carbon sea ocean studi warm water
Topic 14:
school student gun teacher un

In [90]:
display_topics(lda, tf_feature_names, no_top_words)

Topic 0:
school tax student pay program educ health cut plan fund
Topic 1:
trump presid hi donald white wa hous news said ha
Topic 2:
thi wa think peopl know becaus like just don say
Topic 3:
compani ha said use tech data facebook new user thi
Topic 4:
team hi player nfl draft season coach sport return round
Topic 5:
turn email washington post check desktop break pleas news alert
Topic 6:
ha gun immigr american nation support govern group union illeg
Topic 7:
wa hi said thi like just ha time make want
Topic 8:
research climat thi said studi ha chang sea new scientist
Topic 9:
iran china israel trump presid deal nuclear state north said
Topic 10:
wa hi said year old told ha day man new
Topic 11:
said state feder depart administr ha court law wa obama
Topic 12:
energi oil food water ga carbon use thi make clean
Topic 13:
percent year 000 million new ha rate price thi american
Topic 14:
news fox post host video free report sign thi 2018
Topic 15:
republican democrat senat trump vote state

## Sentiment Analysis

In [None]:
import nltk 
import pandas as pd 
import string

from nltk.sentiment.vader import SentimentIntensityAnalyzer

messages = documents
sid = SentimentIntensityAnalyzer()

summary = {"positive":0,"neutral":0,"negative":0}
for x in messages: 
    ss = sid.polarity_scores(x)
    if ss["compound"] == 0.0: 
        summary["neutral"] +=1
    elif ss["compound"] > 0.0:
        summary["positive"] +=1
    else:
        summary["negative"] +=1
print(summary)

In [None]:
import nltk
nltk.download('vader_lexicon')