## Imports

In [1]:
import pandas as pd
import numpy as np
import re

pd.set_option('display.max_columns', None)

In [2]:
df_comments = pd.read_csv('fb_news_comments_1000K_hashed.csv')
df_posts = pd.read_csv('fb_news_posts_20K.csv')

## Cleaning + making new columns

In [3]:
#Clean date
df_posts['date'] = df_posts['created_time'].str[0:10]
df_posts['date'] = pd.to_datetime(df_posts['date'])

#Clean time
df_posts['time'] = df_posts['created_time'].str[11:19]
df_posts['time'] = pd.to_datetime(df_posts['time'],format="%H:%M:%S")

In [4]:
#Percent angry reacts
df_posts['percent_angry'] = (df_posts['react_angry'] / (df_posts['react_angry'] + df_posts['react_haha'] + df_posts['react_like'] + df_posts['react_love'] + df_posts['react_sad'] + df_posts['react_wow'])) * 100

#Number all reacts
df_posts['all_reacts'] = (df_posts['react_angry'] + df_posts['react_haha'] + df_posts['react_like'] + df_posts['react_love'] + df_posts['react_sad'] + df_posts['react_wow'])

#Make df of posts above 50 reacts (decent exposure) and descriptions
df_posts_big = df_posts[df_posts['all_reacts'] > 50]
df_posts_big = df_posts_big[df_posts_big['message'].notna()]

#Make angry_label column
df_posts_big['angry_label'] = (df_posts_big['percent_angry'] > 30).astype(int)

## Exploration

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))

matrix = vectorizer.fit_transform(df_posts_big.message)
words_df = pd.DataFrame(matrix.toarray(),
                        columns=vectorizer.get_feature_names())
words_df

In [None]:
len(df_posts_big[df_posts_big['percent_angry'] > 30])

### Message + angriness

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df_posts_big['angry_label'] = le.fit_transform(df_posts_big.percent_angry > 30)
df_posts_big[df_posts_big['angry_label'] == 1].head(1)

In [None]:
X = words_df
y = df_posts_big.angry_label

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
# LinearSVC: 0.84

# from sklearn.svm import LinearSVC
# clf = LinearSVC(max_iter=10000)
# clf.fit(X_train, y_train)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)
#100

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

In [None]:
import eli5

feature_names = list(words_df.columns)
eli5.show_weights(clf, feature_names=feature_names)

### Comments and angriness

In [9]:
#Turn stuff into strings
df_comments['from_name'] = df_comments['from_name'].astype(str)
df_comments['message'] = df_comments['message'].astype(str)

#Stick comments together and group by post id to allow for join with main csv
df_comments['post_id_for_merge'] = df_comments['post_name'].str.split('_').str[-1].astype(str)
df_aggregated_comments = df_comments.groupby('post_id_for_merge')['message'].agg('|'.join)
df_aggregated_comments = df_aggregated_comments.to_frame()

#Make column to join by in main df
df_posts_big['post_id_for_merge'] = df_posts['post_id'].str.split('_').str[-1].astype(str)

#Merge
df_new = df_posts_big.merge(df_aggregated_comments,right_index=True,left_on="post_id_for_merge")

In [None]:
df_new.head(2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))

matrix = vectorizer.fit_transform(df_new.message_y)

In [None]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
df_new['angry_label'] = le.fit_transform(df_new.percent_angry > 30)
df_new[df_new['angry_label'] == 1].head(2)

In [None]:
X = matrix
y = df_new.angry_label

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

In [None]:
# 100% accuracy!! Need to check if something is going wrong...
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

In [None]:
import eli5

feature_names = vectorizer.get_feature_names()
eli5.show_weights(clf, feature_names=feature_names)

## Anger in certain outlets outlet

In [None]:
# Guardian = 10513336322
# Fox News = 15704546335
# MSNBC = 273864989376427
# BBC = 228735667216

In [28]:
df_new['page_id'] = df_new['page_id'].astype(str).str.strip()
df_fox = df_new[df_new['page_id'] == "15704546335"]
df_msnbc = df_new[df_new['page_id'] == "273864989376427"]

### FOX NEWS POSTS RESULTS

In [11]:
df_fox1 = df_fox[df_fox.message_x.notnull()]

In [12]:
#Vectorize words
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))
matrix = vectorizer.fit_transform(df_fox1.message_x)

#Set up x and y values
X = matrix
y = df_fox1.angry_label

#Training split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Random forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

RandomForestClassifier()

In [13]:
clf.score(X_test, y_test)

0.9841269841269841

In [14]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not angry,Predicted angry
Is not angry,50,0
Is angry,1,12


In [15]:
import eli5

feature_names = vectorizer.get_feature_names()
eli5.show_weights(clf, feature_names=feature_names)

Weight,Feature
0.0402  ± 0.0819,police
0.0395  ± 0.0756,government
0.0378  ± 0.0868,hawaii
0.0373  ± 0.0583,state
0.0304  ± 0.0665,calling
0.0288  ± 0.0513,live
0.0243  ± 0.0633,trump
0.0238  ± 0.0621,people
0.0232  ± 0.0520,speech
0.0229  ± 0.0736,york


### FOX NEWS COMMENTS RESULTS

In [16]:
#Vectorize words
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))
matrix = vectorizer.fit_transform(df_fox.message_y)

#Set up x and y values
X = matrix
y = df_fox.angry_label

#Training split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Random forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

RandomForestClassifier()

In [17]:
clf.score(X_test, y_test)

1.0

In [18]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not angry,Predicted angry
Is not angry,51,0
Is angry,0,12


In [19]:
import eli5

feature_names = vectorizer.get_feature_names()
eli5.show_weights(clf, feature_names=feature_names)

Weight,Feature
0.0070  ± 0.0596,punishment
0.0049  ± 0.0562,executive orders
0.0040  ± 0.0468,ban
0.0039  ± 0.0422,foul
0.0037  ± 0.0441,penalties
0.0037  ± 0.0448,politically correct
0.0036  ± 0.0316,gitmo
0.0036  ± 0.0339,em
0.0034  ± 0.0399,majority people
0.0034  ± 0.0341,george lopez


### MSNBC NEWS POSTS RESULTS

In [31]:
df_msnbc1 = df_msnbc[df_msnbc.message_x.notnull()]

In [32]:
#Vectorize words
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))
matrix = vectorizer.fit_transform(df_msnbc1.message_x)

#Set up x and y values
X = matrix
y = df_msnbc1.angry_label

#Training split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Random forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

RandomForestClassifier()

In [33]:
clf.score(X_test, y_test)

0.9838709677419355

In [34]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not angry,Predicted angry
Is not angry,44,0
Is angry,1,17


In [35]:
import eli5

feature_names = vectorizer.get_feature_names()
eli5.show_weights(clf, feature_names=feature_names)

Weight,Feature
0.0510  ± 0.0928,putin
0.0364  ± 0.0661,new
0.0312  ± 0.0646,president trump
0.0283  ± 0.0770,russian president
0.0276  ± 0.0467,president
0.0257  ± 0.0735,health
0.0246  ± 0.0639,health care
0.0240  ± 0.0709,live
0.0228  ± 0.0707,melania
0.0215  ± 0.0764,president vladimir


### MSNBC NEWS COMMENTS RESULTS

In [37]:
df_msnbc2 = df_msnbc[df_msnbc.message_y.notnull()]

In [41]:
#Vectorize words
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(stop_words='english', min_df=5, max_df=0.5, ngram_range=(1, 2))
matrix = vectorizer.fit_transform(df_msnbc2.message_y)

#Set up x and y values
X = matrix
y = df_msnbc2.angry_label

#Training split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

#Random forest
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X, y)

RandomForestClassifier()

In [42]:
clf.score(X_test, y_test)

1.0

In [43]:
from sklearn.metrics import confusion_matrix

y_true = y_test
y_pred = clf.predict(X_test)
matrix = confusion_matrix(y_true, y_pred)

label_names = pd.Series(['not angry', 'angry'])
pd.DataFrame(matrix,
     columns='Predicted ' + label_names,
     index='Is ' + label_names)

Unnamed: 0,Predicted not angry,Predicted angry
Is not angry,51,0
Is angry,0,11


In [44]:
import eli5

feature_names = vectorizer.get_feature_names()
eli5.show_weights(clf, feature_names=feature_names)

Weight,Feature
0.0053  ± 0.0455,jersey
0.0042  ± 0.0414,inappropriate
0.0038  ± 0.0392,facilities
0.0036  ± 0.0411,disgusting
0.0034  ± 0.0344,lots people
0.0030  ± 0.0380,hand
0.0030  ± 0.0341,22 million
0.0029  ± 0.0290,closed
0.0029  ± 0.0389,looked
0.0029  ± 0.0296,believe putin
