# Text NN classifier for Political Bias

In [1]:
%load_ext autoreload

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data
from torch.autograd import Variable
from sklearn.utils import shuffle
from torchsample.initializers import XavierUniform, Uniform
from torchsample.modules import ModuleTrainer
from torchsample.metrics import CategoricalAccuracy

%aimport torchsample.modules



In [2]:
use_cuda = False
batch_size = 64

In [3]:
df = pd.read_csv("data/source/newsclust.csv")
df = df.query("site != 'cbn.com'")
print(len(df))
df = df.sample(30000)

103262


Compute the bias for each one of the articles, based on the publication's known bias

In [4]:
from bias import Bias

num_classes = 7
df['bias'] = df.apply(lambda row: Bias.get_bias_for_domain(row['site']).value, axis=1)
df.head(2)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias
69135,69135,2017-02-04T09:57:00.000+02:00,motherjones.com,US Customs Agents Just Gave Airlines the Green...,US Customs Agents Just Gave Airlines the Green...,http://www.motherjones.com/politics/2017/02/us...,1
80070,80070,2016-07-06T05:24:00.000+03:00,rightwingnews.com,Print this article Font size - 16 + 1 Subscrib...,BREAKING: FBI Director Facing Congressional Pr...,http://rightwingnews.com/top-news/breaking-fbi...,6


Tokenize the text of the articles, create a vocabulary of words

In [5]:
import spacy
nlp = spacy.load('en')
nlp.pipeline = []

def tokenize_text(texts):
    docs = [doc for doc in nlp.pipe(texts, batch_size=500, n_threads=8)]
    return docs

def is_invalid_token(token):
    return token.is_punct or token.is_space or token.like_url or token.like_num or token.is_digit

def get_words_for_docs(docs):
    return [get_words_ids(doc) for doc in docs]

def get_words_ids(doc):
    return [token.orth for token in doc if not is_invalid_token(token)]

In [6]:
df['docs'] = tokenize_text(df['text'])

In [7]:
df['words'] = get_words_for_docs(df['docs'])
df.head(1)

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words
69135,69135,2017-02-04T09:57:00.000+02:00,motherjones.com,US Customs Agents Just Gave Airlines the Green...,US Customs Agents Just Gave Airlines the Green...,http://www.motherjones.com/politics/2017/02/us...,1,"(US, Customs, Agents, Just, Gave, Airlines, th...","[884, 643643, 382300, 985, 488862, 383694, 501..."


Keep the most common words as vocabulary, replace the words out of vocabulary by the least frequently used word

In [8]:
from collections import Counter

vocab_size = 5000

def flatten(l):
    flat_list = [item for sublist in l for item in sublist]
    return flat_list

word_freq = Counter(flatten(df['words']))
common_words = word_freq.most_common(vocab_size)
word_freq.most_common(5)

[(501, 763159), (504, 422005), (510, 382775), (512, 338685), (506, 337060)]

In [9]:
print(nlp.vocab.strings[501], nlp.vocab.strings[510], nlp.vocab.strings[512], nlp.vocab.strings[506])

the of and a


In [10]:
vocab = {word_id: i for (i, (word_id, freq)) in enumerate(common_words)}
print(len(vocab), vocab[501], vocab[504], vocab[510])
oov_word = vocab_size - 1
oov_word

5000 0 1 2


4999

In [11]:
def get_words_ids_if_common(words):
    return [vocab.get(word, oov_word) for word in words]

df['words_with_oov'] = df.apply(lambda row: get_words_ids_if_common(row['words']), axis=1)
df['words_with_oov'].head()

69135    [339, 4999, 4999, 805, 4999, 3513, 0, 1659, 49...
80070    [1446, 29, 590, 4999, 1543, 2134, 1744, 142, 4...
72555    [556, 4999, 4, 519, 315, 210, 4999, 11, 2768, ...
43318    [4999, 2395, 565, 4999, 4999, 1, 1047, 4999, 9...
83414    [443, 716, 368, 4999, 4999, 1, 4999, 4999, 499...
Name: words_with_oov, dtype: object

Look at the distribution of words. The longest article has 15K words, the shortest 21.

In [12]:
lens = np.array(list(map(len, df['words'])))
(lens.max(), lens.min(), lens.mean())

(23168, 16, 538.72820000000002)

Split the data for training and validation

In [13]:
TEST_DOMAINS =  {
    'bloomberg.com',
     'breitbart.com',
     'c-span.org',
     'chicagotribune.com',
     'chron.com'
}
df_train =  df.loc[~df['site'].isin(TEST_DOMAINS)]
df_test =  df.loc[df['site'].isin(TEST_DOMAINS)]
print(len(df_train))
print(len(df_test))

27391
2609


Enforce all texts to have the same length, truncating or padding if necessary

In [14]:
seq_len = 1000

from keras.preprocessing import sequence

trn = sequence.pad_sequences(df_train['words_with_oov'], maxlen=seq_len, value=0)
test = sequence.pad_sequences(df_test['words_with_oov'], maxlen=seq_len, value=0)

trn_tensor = torch.from_numpy(trn).long()
test_tensor = torch.from_numpy(test).long()

trn

Using TensorFlow backend.


array([[   0,    0,    0, ..., 3041, 2039,  177],
       [   0,    0,    0, ...,  235,  101,   85],
       [   0,    0,    0, ...,  842,    0, 2215],
       ..., 
       [   0,    0,    0, ..., 3111, 4999, 1206],
       [   0,    0,    0, ..., 1808, 1134,   20],
       [  19,   11, 4999, ..., 1664, 4167,   15]], dtype=int32)

Prepare the tensor data for pytorch

In [15]:
labels_train_tensor = torch.from_numpy(np.array(df_train['bias']))
labels_test_tensor = torch.from_numpy(np.array(df_test['bias']))
labels_train_tensor[:3]


 1
 6
 1
[torch.LongTensor of size 3]

## Single Hidden Layer NN classifier

In [43]:
import torch.nn as nn
import torch.nn.functional as F

class SingleHiddenLayerModule(nn.Module):
    def __init__(self):
        super().__init__()
        num_dimensions = 32
        self.embedding = nn.Embedding(vocab_size, num_dimensions)
        self.fc1 = nn.Linear(seq_len * num_dimensions, 100)
        self.dropout = nn.Dropout(0.7)
        self.fc2 = nn.Linear(100, num_classes)
        self.init()
        self.classifier = True

    def forward(self, words_ids):
        x = self.embedding(words_ids) # x => torch.Size([64, 1000, 32])
        x = x.view(x.size(0), -1) # x => torch.Size([64, 16000])
        x = self.fc1(x)
        if self.classifier:
            x = F.relu(x, True)
            x = self.dropout(x)
            x = self.fc2(x)
        return x
    
    def init(self):
        torch.nn.init.constant(self.fc1.bias, val=0.0)
        torch.nn.init.constant(self.fc2.bias, val=0.0)

In [48]:
%autoreload 2

model = SingleHiddenLayerModule()

def make_trainer(model):
    criterion = nn.CrossEntropyLoss()
    if(use_cuda):
        model.cuda()
        criterion.cuda()
    trainer = ModuleTrainer(model)
    trainer.set_optimizer(optim.Adam, lr=1e-3)
    trainer.set_loss(criterion)
    trainer.set_initializers([Uniform(module_filter="embedding*", a=-0.05, b=0.05), XavierUniform(module_filter="fc*")])
    trainer.set_metrics([CategoricalAccuracy()])

    return trainer
    
trainer = make_trainer(model)
model

SingleHiddenLayerModule (
  (embedding): Embedding(5000, 32)
  (fc1): Linear (32000 -> 100)
  (dropout): Dropout (p = 0.7)
  (fc2): Linear (100 -> 7)
)

In [49]:
model.classifier = True
trainer.fit(trn_tensor, labels_train_tensor, val_data=(test_tensor, labels_test_tensor), 
            nb_epoch=1, batch_size=batch_size, shuffle=True)

Epoch 1/1: 429 batches [00:34, 12.50 batches/s, acc=44.05, loss=1.4276, val_acc=45.52, val_loss=1.3247]


In [51]:
precision = sum(torch.max(trainer.predict(test_tensor).data, 1)[1] == labels_test_tensor).numpy() * 1.0 / len(labels_test_tensor)
print("Val acc", 100.0 * precision)

Val acc [ 5.21272518]


Manually verify some articles. The bias for these articles should match the predicted bias below.

In [52]:
df_sample = df_test.sample(5)
df_sample

Unnamed: 0.1,Unnamed: 0,date,site,text,title,url,bias,docs,words,words_with_oov
80793,80793,2016-07-19T06:34:00.000+03:00,chicagotribune.com,"Even at convention, Trump finds it hard to ste...","Even at convention, Trump finds it hard to ste...",http://www.chicagotribune.com/news/nationworld...,2,"(Even, at, convention, ,, Trump, finds, it, ha...","[1285, 584, 7553, 576246, 5370, 519, 954, 504,...","[830, 15, 1525, 30, 3442, 14, 451, 1, 1112, 30..."
36451,36451,2016-02-23T09:12:00.000+02:00,chron.com,Close Image 1 of 4 A man holds his smart phone...,Chinese phones go global after pushing aside A...,http://www.chron.com/business/technology/artic...,2,"(Close, Image, 1, of, 4, A, man, holds, his, s...","[544954, 580713, 510, 688, 852, 4138, 553, 174...","[2998, 546, 2, 37, 210, 2360, 24, 2612, 965, 1..."
91691,91691,2015-08-28T03:00:00.000+03:00,bloomberg.com,Type of Data * Update Needed * \nAll data chan...,AMANA INC (2402:Tokyo): Ownership & Shareholde...,http://www.bloomberg.com/research/stocks/owner...,2,"(Type, of, Data, *, Update, Needed, *, \n, All...","[755732, 510, 663156, 156974, 593302, 1125, 16...","[4999, 2, 4500, 4999, 4999, 426, 459, 994, 162..."
92490,92490,2015-08-28T03:00:00.000+03:00,bloomberg.com,Financial Statements Ratios Pensions & Options...,BEIJING CHIEFTAIN CONTROL -A (300430:Shenzhen)...,http://www.bloomberg.com/research/stocks/finan...,2,"(Financial, Statements, Ratios, Pensions, &, O...","[36837, 665381, 487729, 164756, 435163, 224463...","[1692, 4999, 4999, 4999, 4999, 4735, 4999, 169..."
92081,92081,2015-08-17T01:30:00.000+03:00,chron.com,gallery_overlay_open_thumbs|article-gallery-64...,21st Dream Cruise makes its way up Woodward - ...,http://www.chron.com/cars/article/21st-Dream-C...,2,"(gallery_overlay_open_thumbs|article, -, galle...","[794738, 794740, 794742, 580713, 510, 749899, ...","[4999, 4999, 4999, 546, 2, 4999, 1473, 300, 39..."


In [53]:
model.classifier = True
predictor = ModuleTrainer(model)
sample = sequence.pad_sequences(df_sample['words_with_oov'], maxlen=seq_len, value=0)
sample_tensor = torch.from_numpy(sample).long()
print(sample_tensor)
torch.max(predictor.predict(sample_tensor), 1)[1]
# trainer.predict(sample_tensor)


   24   814     4  ...    248    25  2072
 4999     5   461  ...    253  4490   177
    0     0     0  ...   1224    85    15
    0     0     0  ...   4999  4816  4999
 4999  4999   737  ...    138    54  4999
[torch.LongTensor of size 5x1000]



Variable containing:
 1
 2
 2
 4
 2
[torch.LongTensor of size 5x1]

## Site vectors

In [25]:
model.classifier = False
article_vectors_var = trainer.predict(trn_tensor)
article_vectors = article_vectors_var.data.numpy()
article_vectors_var

Variable containing:
 0.9039  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  4.0931  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  1.7655  ...   0.0000  0.0000  0.0000
          ...             ⋱             ...          
 0.0000  0.0649  0.0000  ...   0.0000  0.0000  0.3370
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  1.1352  ...   0.0000  0.0000  2.7516
[torch.FloatTensor of size 27391x100]

In [26]:
site_vectors = []
labels = []
for site in df_train['site'].unique():
    site_indexes = np.where(df_train["site"] == site)[0]
    if len(site_indexes) > 0:
        site_vector = np.mean(article_vectors[site_indexes, :], axis=0)
        site_vectors.append(site_vector)
        labels.append(site)
site_vectors = np.array(site_vectors)
site_vectors.shape

(57, 100)

In [27]:
from sklearn.manifold import TSNE

tsne_model = TSNE(n_components=2, random_state=0)
# np.set_printoptions(suppress=True)
site_vectors_2_dim = tsne_model.fit_transform(site_vectors)
X_proj = site_vectors_2_dim[:, 0]
Y_proj = site_vectors_2_dim[:, 1]

In [28]:
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

from plotly.graph_objs import Bar, Scatter, Figure, Layout, XAxis, YAxis

In [29]:
trace = Scatter(x=X_proj, y=Y_proj, mode='markers+text', text=labels, textposition='top',  marker=dict(size=10))
iplot({
    'data': [trace],
    'layout': Layout(
        #xaxis=XAxis(title='Left vs Right'), 
        #yaxis=YAxis(title='Biased vs Factual'),
        autosize=False,
        width=1000,
        height=700)},
    show_link=False
)