In [1]:
%pip install torch tqdm

Note: you may need to restart the kernel to use updated packages.


In [3]:
import stop_words as sw
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, recall_score, f1_score
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import copy
from datetime import datetime
import os
pd.options.mode.chained_assignment = None

In [4]:
# Dataset root folders, one per topic. preprocess_data() reads
# '<url>/train/<topic>_train.csv' and '<url>/test/<topic>_test.csv' under each
# of them, and the driver loop uses the last path component as the topic name.
urls = ['data/elections', 'data/politics', 'data/white_house', 'data/immigration', 'data/healthcare']

In [5]:
def _load_split(url, split):
    """Load and clean one split ('train' or 'test') of a topic dataset.

    Reads ``{url}/{split}/{topic}_{split}.csv`` where ``topic`` is the last
    component of ``url``, keeps only rows whose ``bias`` is 0 or 2, relabels
    2 as 1 so the target is binary, and adds a ``stop_content`` column made of
    the space-joined items returned by ``sw.stop_words`` for each ``content``.
    """
    # Last component, matching how the driver loop derives the topic name.
    topic = url.split("/")[-1]
    frame = pd.read_csv(f'{url}/{split}/{topic}_{split}.csv')
    frame = frame[frame['bias'].isin([0, 2])].replace({'bias': {2: 1}})
    # NOTE(review): presumably sw.stop_words strips stop words and returns the
    # remaining tokens - confirm against the stop_words module.
    frame['stop_content'] = frame['content'].apply(lambda text: ' '.join(sw.stop_words(text)))
    return frame


def preprocess_data(url):
    """Build TF-IDF features and binary labels for one topic dataset.

    Parameters
    ----------
    url : str
        Dataset root folder, e.g. ``'data/elections'``.

    Returns
    -------
    tuple
        ``(x_train, y_train, x_test, y_test, train_data, test_data)`` where the
        ``x_*`` values are the TF-IDF matrices produced by ``TfidfVectorizer``,
        the ``y_*`` values are the ``bias`` label arrays, and ``*_data`` are
        the cleaned DataFrames the features were derived from.
    """
    train_data = _load_split(url, 'train')
    test_data = _load_split(url, 'test')

    # The vocabulary and IDF weights are fitted on the training split only and
    # then re-used for the test split, so both share one feature space.
    tfid = TfidfVectorizer()
    x_train = tfid.fit_transform(train_data['stop_content'])
    x_test = tfid.transform(test_data['stop_content'])

    y_train = train_data['bias'].values
    y_test = test_data['bias'].values

    return x_train, y_train, x_test, y_test, train_data, test_data

In [6]:
class DeepText(nn.Module):
    """Feed-forward binary classifier over fixed-size feature vectors.

    The network is ``tfidf_size -> 200 -> 100 -> 50 -> 1`` with a ReLU after
    each hidden layer and a sigmoid on the single output unit, so ``forward``
    returns one value in (0, 1) per input row.
    """

    def __init__(self, tfidf_size):
        """Create the layers; ``tfidf_size`` is the input feature dimension."""
        super().__init__()
        # Hidden stack. Attribute names and creation order are kept stable so
        # saved state_dict checkpoints stay loadable.
        self.layer1, self.act1 = nn.Linear(tfidf_size, 200), nn.ReLU()
        self.layer2, self.act2 = nn.Linear(200, 100), nn.ReLU()
        self.layer3, self.act3 = nn.Linear(100, 50), nn.ReLU()
        # Single-unit head squashed by a sigmoid.
        self.output, self.sigmoid = nn.Linear(50, 1), nn.Sigmoid()

    def forward(self, x):
        """Run ``x`` through the hidden stack and return the sigmoid output."""
        hidden_stack = (
            (self.layer1, self.act1),
            (self.layer2, self.act2),
            (self.layer3, self.act3),
        )
        hidden = x
        for linear, activation in hidden_stack:
            hidden = activation(linear(hidden))
        return self.sigmoid(self.output(hidden))

In [7]:
def model_train(model, X_train, y_train, X_val, y_val, n_epochs=30, batch_size=10):
    """Train ``model`` and leave it holding the best-validation-accuracy weights.

    Optimises binary cross-entropy with Adam (lr=1e-4) over sequential,
    unshuffled mini-batches. After every epoch the model is scored on
    ``(X_val, y_val)``; the weights of the epoch with the highest accuracy are
    restored into ``model`` before returning.

    Parameters
    ----------
    model : nn.Module
        Network whose output is a probability with the same shape as the labels.
    X_train, y_train : torch.Tensor
        Training features and labels.
    X_val, y_val : torch.Tensor
        Features and labels used for per-epoch model selection.
    n_epochs : int
        Number of passes over the training data.
    batch_size : int
        Number of rows per optimisation step.

    Returns
    -------
    float
        Best validation accuracy seen, or ``-inf`` when no epoch produced a
        comparable accuracy (e.g. ``n_epochs == 0``); in that case the model's
        weights are left as they are.
    """
    # loss function and optimizer
    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    batch_start = torch.arange(0, X_train.shape[0], batch_size)

    # Hold the best model
    best_acc = -np.inf  # init to negative infinity
    best_weights = None

    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start + batch_size]
                y_batch = y_train[start:start + batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # report the metrics of the batch just processed
                acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(
                    loss=float(loss),
                    acc=float(acc)
                )
        # evaluate accuracy at end of each epoch; no gradients are needed, so
        # skip building the autograd graph for the whole validation set
        model.eval()
        with torch.no_grad():
            y_pred = model(X_val)
        acc = float((y_pred.round() == y_val).float().mean())
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())

    # restore the best weights; nothing was recorded when no epoch ran or every
    # accuracy was NaN, and load_state_dict(None) would raise
    if best_weights is not None:
        model.load_state_dict(best_weights)
    return best_acc

In [16]:
def process_dataset(url):
    """Load one topic dataset and convert it to dense float32 torch tensors.

    Calls ``preprocess_data(url)`` and returns
    ``(x_train, y_train, x_test, y_test, test_data)``: the feature matrices are
    densified, the labels become column vectors of shape ``(n, 1)``, and
    ``test_data`` is the cleaned test DataFrame passed through unchanged.
    """
    def as_tensors(features, labels):
        # Densify the TF-IDF matrix; labels are reshaped to one column so they
        # line up with the model's (n, 1) output.
        dense = features.todense()
        feature_tensor = torch.tensor(dense, dtype=torch.float32)
        label_tensor = torch.tensor(labels, dtype=torch.float32).reshape(-1, 1)
        return feature_tensor, label_tensor

    # The training DataFrame (5th item) is not needed downstream.
    tfidf_train, labels_train, tfidf_test, labels_test, _, test_data = preprocess_data(url)
    train_pair = as_tensors(tfidf_train, labels_train)
    test_pair = as_tensors(tfidf_test, labels_test)
    return train_pair[0], train_pair[1], test_pair[0], test_pair[1], test_data

def init_saveplace():
    """Create and return a fresh, timestamped output directory for this run.

    The directory is ``./runs/nn-run-<dd-mm-yy_HH-MM-SS>``. The timestamp
    contains no ':' because that character is not allowed in Windows file
    names, and an underscore separates the date from the time.

    Returns
    -------
    str
        Path of the (now existing) run directory.
    """
    current = datetime.now().strftime("%d-%m-%y_%H-%M-%S")
    route = os.path.join('.', 'runs', f'nn-run-{current}')
    # exist_ok: two runs started within the same second share one folder
    # instead of failing.
    os.makedirs(route, exist_ok=True)
    return route

def train_and_save(x_train, y_train, x_test, y_test, saveplace, topic):
    """Train a new DeepText model and write its weights to ``saveplace``.

    The model's input width is the number of columns of ``x_train``. The
    trained weights are saved as ``model_<topic>.pt`` inside ``saveplace`` and
    the trained model is returned.
    """
    # Build a network sized to the feature dimension.
    model = DeepText(x_train.shape[1])

    # NOTE(review): the test split is passed as the validation set, so the
    # checkpoint kept by model_train is selected on the same data that is later
    # used for scoring. The returned best accuracy is not used here.
    model_train(model, x_train, y_train, x_test, y_test)

    checkpoint_path = os.path.join(saveplace, f'model_{topic}.pt')
    torch.save(model.state_dict(), checkpoint_path)
    return model

def test_and_save(model, x_test, test_data, saveplace, topic):
    test_data['prediction'] = np.array([model(x).detach().numpy() for x in x_test])
    test_data['pred_bias'] = test_data['prediction'].round().astype(int)

    test_data.to_csv(os.path.join(saveplace, f'test_data_{topic}.csv'), index=False)
    return test_data

In [17]:
def conf_mat(data):
    """Build a 2x2 confusion matrix from the ``bias`` and ``pred_bias`` columns.

    Label 0 is reported as "Left" and label 1 as "Right". Rows are the
    predicted class, columns the actual class.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an actual-label column ``bias`` and a predicted-label
        column ``pred_bias``.

    Returns
    -------
    pandas.DataFrame
        Counts indexed by ``['Predicted Left', 'Predicted Right']`` with
        columns ``['Actual Left', 'Actual Right']``.
    """
    side = {0: 'Left', 1: 'Right'}
    confusion_matrix = pd.DataFrame(0, ['Predicted Left', 'Predicted Right'], ['Actual Left', 'Actual Right'])
    for prediction in (0, 1):
        for actual in (0, 1):
            matches = (data['pred_bias'] == prediction) & (data['bias'] == actual)
            # Single .loc assignment: chained `df[col][row] = v` writes to a
            # temporary and is silently lost under pandas Copy-on-Write.
            confusion_matrix.loc[f'Predicted {side[prediction]}', f'Actual {side[actual]}'] = int(matches.sum())
    return confusion_matrix

In [18]:
# One column per dataset, one row per metric; filled in by the loop below.
scores = pd.DataFrame(None, ['f1_macro', 'f1_micro', 'precision', 'recall'], urls)

# All models, per-topic predictions and the score table of this run go here.
saveplace = init_saveplace()

for url in urls:
    topic = url.split('/')[-1]
    print(f'Dataset: {url} (about {topic})')
    print('Processing dataset')
    x_train, y_train, x_test, y_test, test_data = process_dataset(url)

    print('Training model')
    model = train_and_save(x_train, y_train, x_test, y_test, saveplace, topic)
    print('Testing model')
    test_data = test_and_save(model, x_test, test_data, saveplace, topic)

    print('Computing statistics')
    print(f'{url}\n{conf_mat(test_data)}\n')

    true_bias, pred_bias = test_data['bias'], test_data['pred_bias']
    # Write each cell with a single .loc call: chained `scores[url][metric] = v`
    # assigns into a temporary and is lost under pandas Copy-on-Write.
    scores.loc['f1_macro', url] = f1_score(true_bias, pred_bias, average='macro')
    scores.loc['f1_micro', url] = f1_score(true_bias, pred_bias, average='micro')
    # NOTE(review): the 'precision' row holds average_precision_score computed
    # on the rounded labels, not on the probabilities in
    # test_data['prediction'] - confirm this is the intended metric.
    scores.loc['precision', url] = average_precision_score(true_bias, pred_bias)
    scores.loc['recall', url] = recall_score(true_bias, pred_bias)

scores.to_csv(os.path.join(saveplace, 'scores.csv'))

Dataset: data/elections (about elections)
Processing dataset
Training model


Epoch 0: 100%|██████████| 83/83 [00:04<00:00, 18.19batch/s, acc=0, loss=0.715]  
Epoch 1: 100%|██████████| 83/83 [00:04<00:00, 18.34batch/s, acc=1, loss=0.666]  
Epoch 2: 100%|██████████| 83/83 [00:04<00:00, 16.64batch/s, acc=1, loss=0.533]  
Epoch 3: 100%|██████████| 83/83 [00:04<00:00, 18.67batch/s, acc=1, loss=0.27]   
Epoch 4: 100%|██████████| 83/83 [00:04<00:00, 18.75batch/s, acc=1, loss=0.0771] 
Epoch 5: 100%|██████████| 83/83 [00:04<00:00, 16.91batch/s, acc=1, loss=0.0216]
Epoch 6: 100%|██████████| 83/83 [00:04<00:00, 18.38batch/s, acc=1, loss=0.00771]
Epoch 7: 100%|██████████| 83/83 [00:04<00:00, 17.99batch/s, acc=1, loss=0.00408]
Epoch 8: 100%|██████████| 83/83 [00:04<00:00, 18.84batch/s, acc=1, loss=0.0026] 
Epoch 9: 100%|██████████| 83/83 [00:05<00:00, 16.39batch/s, acc=1, loss=0.00171]
Epoch 10: 100%|██████████| 83/83 [00:04<00:00, 16.62batch/s, acc=1, loss=0.00119]
Epoch 11: 100%|██████████| 83/83 [00:04<00:00, 17.61batch/s, acc=1, loss=0.000865]
Epoch 12: 100%|██████████|

Testing model
Computing statistics
data/elections
                 Actual Left  Actual Right
Predicted Left          1651           739
Predicted Right          284           737

Dataset: data/politics (about politics)
Processing dataset
Training model


Epoch 0: 100%|██████████| 41/41 [00:01<00:00, 20.66batch/s, acc=0.667, loss=0.693]
Epoch 1: 100%|██████████| 41/41 [00:02<00:00, 19.20batch/s, acc=0.667, loss=0.689]
Epoch 2: 100%|██████████| 41/41 [00:01<00:00, 21.60batch/s, acc=1, loss=0.676]  
Epoch 3: 100%|██████████| 41/41 [00:03<00:00, 13.12batch/s, acc=1, loss=0.642]  
Epoch 4: 100%|██████████| 41/41 [00:03<00:00, 12.03batch/s, acc=1, loss=0.571]  
Epoch 5: 100%|██████████| 41/41 [00:03<00:00, 11.82batch/s, acc=1, loss=0.445]  
Epoch 6: 100%|██████████| 41/41 [00:05<00:00,  7.90batch/s, acc=1, loss=0.278]  
Epoch 7: 100%|██████████| 41/41 [00:05<00:00,  7.66batch/s, acc=1, loss=0.145]
Epoch 8: 100%|██████████| 41/41 [00:03<00:00, 11.44batch/s, acc=1, loss=0.071] 
Epoch 9: 100%|██████████| 41/41 [00:05<00:00,  7.71batch/s, acc=1, loss=0.0383]
Epoch 10: 100%|██████████| 41/41 [00:05<00:00,  7.52batch/s, acc=1, loss=0.0231]
Epoch 11: 100%|██████████| 41/41 [00:03<00:00, 11.85batch/s, acc=1, loss=0.0154]
Epoch 12: 100%|██████████| 4

Testing model
Computing statistics
data/politics
                 Actual Left  Actual Right
Predicted Left           462           210
Predicted Right          302           644

Dataset: data/white_house (about white_house)
Processing dataset
Training model


Epoch 0: 100%|██████████| 27/27 [00:01<00:00, 22.35batch/s, acc=0.667, loss=0.691]
Epoch 1: 100%|██████████| 27/27 [00:01<00:00, 19.97batch/s, acc=0.667, loss=0.69]
Epoch 2: 100%|██████████| 27/27 [00:01<00:00, 23.39batch/s, acc=0.667, loss=0.687]
Epoch 3: 100%|██████████| 27/27 [00:01<00:00, 19.89batch/s, acc=1, loss=0.681]  
Epoch 4: 100%|██████████| 27/27 [00:01<00:00, 22.24batch/s, acc=1, loss=0.668] 
Epoch 5: 100%|██████████| 27/27 [00:01<00:00, 20.88batch/s, acc=1, loss=0.644] 
Epoch 6: 100%|██████████| 27/27 [00:01<00:00, 21.04batch/s, acc=1, loss=0.603] 
Epoch 7: 100%|██████████| 27/27 [00:01<00:00, 18.65batch/s, acc=1, loss=0.54]  
Epoch 8: 100%|██████████| 27/27 [00:02<00:00, 12.73batch/s, acc=1, loss=0.45] 
Epoch 9: 100%|██████████| 27/27 [00:01<00:00, 15.26batch/s, acc=1, loss=0.344]
Epoch 10: 100%|██████████| 27/27 [00:01<00:00, 18.89batch/s, acc=1, loss=0.243]
Epoch 11: 100%|██████████| 27/27 [00:01<00:00, 19.50batch/s, acc=1, loss=0.167]
Epoch 12: 100%|██████████| 27/27 

Testing model
Computing statistics
data/white_house
                 Actual Left  Actual Right
Predicted Left           349           225
Predicted Right          164           344

Dataset: data/immigration (about immigration)
Processing dataset
Training model


Epoch 0: 100%|██████████| 23/23 [00:01<00:00, 16.17batch/s, acc=0, loss=0.756]  
Epoch 1: 100%|██████████| 23/23 [00:01<00:00, 22.51batch/s, acc=0, loss=0.747]  
Epoch 2: 100%|██████████| 23/23 [00:01<00:00, 22.06batch/s, acc=0, loss=0.734]  
Epoch 3: 100%|██████████| 23/23 [00:01<00:00, 17.38batch/s, acc=0, loss=0.718]  
Epoch 4: 100%|██████████| 23/23 [00:00<00:00, 26.15batch/s, acc=0, loss=0.699]  
Epoch 5: 100%|██████████| 23/23 [00:01<00:00, 18.71batch/s, acc=1, loss=0.676]  
Epoch 6: 100%|██████████| 23/23 [00:00<00:00, 25.98batch/s, acc=1, loss=0.642]  
Epoch 7: 100%|██████████| 23/23 [00:01<00:00, 17.43batch/s, acc=1, loss=0.585]
Epoch 8: 100%|██████████| 23/23 [00:01<00:00, 16.91batch/s, acc=1, loss=0.503]
Epoch 9: 100%|██████████| 23/23 [00:01<00:00, 18.39batch/s, acc=1, loss=0.4]  
Epoch 10: 100%|██████████| 23/23 [00:02<00:00, 11.33batch/s, acc=1, loss=0.292]
Epoch 11: 100%|██████████| 23/23 [00:01<00:00, 13.27batch/s, acc=1, loss=0.195]
Epoch 12: 100%|██████████| 23/23 [00

Testing model
Computing statistics
data/immigration
                 Actual Left  Actual Right
Predicted Left           196            71
Predicted Right          195           453

Dataset: data/healthcare (about healthcare)
Processing dataset
Training model


Epoch 0: 100%|██████████| 22/22 [00:01<00:00, 16.08batch/s, acc=0.667, loss=0.679]
Epoch 1: 100%|██████████| 22/22 [00:01<00:00, 17.83batch/s, acc=0.667, loss=0.679]
Epoch 2: 100%|██████████| 22/22 [00:01<00:00, 18.57batch/s, acc=0.667, loss=0.677]
Epoch 3: 100%|██████████| 22/22 [00:01<00:00, 20.40batch/s, acc=0.667, loss=0.67]
Epoch 4: 100%|██████████| 22/22 [00:01<00:00, 15.47batch/s, acc=0.667, loss=0.656]
Epoch 5: 100%|██████████| 22/22 [00:01<00:00, 11.92batch/s, acc=1, loss=0.634]  
Epoch 6: 100%|██████████| 22/22 [00:00<00:00, 27.35batch/s, acc=1, loss=0.601] 
Epoch 7: 100%|██████████| 22/22 [00:01<00:00, 19.51batch/s, acc=1, loss=0.556]
Epoch 8: 100%|██████████| 22/22 [00:01<00:00, 14.36batch/s, acc=1, loss=0.493]
Epoch 9: 100%|██████████| 22/22 [00:00<00:00, 26.06batch/s, acc=1, loss=0.408]
Epoch 10: 100%|██████████| 22/22 [00:01<00:00, 19.89batch/s, acc=1, loss=0.307]
Epoch 11: 100%|██████████| 22/22 [00:01<00:00, 21.30batch/s, acc=1, loss=0.209]
Epoch 12: 100%|██████████| 2

Testing model
Computing statistics
data/healthcare
                 Actual Left  Actual Right
Predicted Left           285           139
Predicted Right          137           342

