In [1]:
%pip install torch tqdm

Note: you may need to restart the kernel to use updated packages.


In [65]:
import stop_words as sw
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import average_precision_score, recall_score, f1_score
from sklearn.decomposition import PCA, TruncatedSVD
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm
import copy
from datetime import datetime
import os
pd.options.mode.chained_assignment = None

In [66]:
urls = ['data/elections', 'data/politics', 'data/white_house', 'data/immigration', 'data/healthcare']

In [74]:
def preprocess_data(url):
    train_data = pd.read_csv(f'{url}/train/{url.split("/")[1]}_train.csv')
    train_data = train_data[(train_data['bias'] == 0) | (train_data['bias'] == 2)].replace({'bias': {2: 1}})
    train_data['stop_content'] = train_data['content'].apply(lambda x: ' '.join(sw.stop_words(x)))
    
    test_data = pd.read_csv(f'{url}/test/{url.split("/")[1]}_test.csv')
    test_data = test_data[(test_data['bias'] == 0) | (test_data['bias'] == 2)].replace({'bias': {2: 1}})
    test_data['stop_content'] = test_data['content'].apply(lambda x: ' '.join(sw.stop_words(x)))
    
    tfid = TfidfVectorizer()
    
    x_train = tfid.fit_transform(train_data['stop_content'])
    y_train = train_data['bias'].values
    
    x_test = tfid.transform(test_data['stop_content'])
    y_test = test_data['bias'].values

    # We utilized the default version of PCA from sklearn instead of using class-based code.
    # This allows us to use the same formatting as a slot-in replacement for the other
    # dimensionality reduction techniques we evaluated or tested in this project, like
    # Sparse PCA and Truncated SVD. Furthermore, due to the scale and sparsity of our text data,
    # utilizing the sklearn PCA, which is optimized for speed, made our computations more efficient. 
    # pca = PCA(n_components=0.9)
    # transformed_x_train = pca.fit_transform(x_train.toarray())
    # transformed_x_test = pca.transform(x_test.toarray())
    
    num_components = {'elections': 600,  'politics': 311, 'white_house': 211, 'immigration': 176, 'healthcare': 170}

    svd = TruncatedSVD(n_components=num_components[topic])
    transformed_x_train = svd.fit_transform(x_train)
    transformed_x_test = svd.transform(x_test)

    return transformed_x_train, y_train, transformed_x_test, y_test, train_data, test_data

    # return x_train, y_train, x_test, y_test, train_data, test_data

In [75]:
class DeepText(nn.Module):
    def __init__(self, tfidf_size):
        super().__init__()
        self.layer1 = nn.Linear(tfidf_size, 200)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(200, 100)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(100, 50)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(50, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.act1(self.layer1(x))
        x = self.act2(self.layer2(x))
        x = self.act3(self.layer3(x))
        x = self.sigmoid(self.output(x))
        return x

In [76]:
def model_train(model, X_train, y_train, X_val, y_val, n_epochs=30, batch_size=10):
    # loss function and optimizer
    loss_fn = nn.BCELoss()  # binary cross entropy
    optimizer = optim.Adam(model.parameters(), lr=0.0001)

    batch_start = torch.arange(0, X_train.shape[0], batch_size)

    # Hold the best model
    best_acc = - np.inf   # init to negative infinity
    best_weights = None

    for epoch in range(n_epochs):
        model.train()
        with tqdm.tqdm(batch_start, unit="batch", mininterval=0) as bar:
            bar.set_description(f"Epoch {epoch}")
            for start in bar:
                # take a batch
                X_batch = X_train[start:start+batch_size]
                y_batch = y_train[start:start+batch_size]
                # forward pass
                y_pred = model(X_batch)
                loss = loss_fn(y_pred, y_batch)
                # backward pass
                optimizer.zero_grad()
                loss.backward()
                # update weights
                optimizer.step()
                # print progress
                acc = (y_pred.round() == y_batch).float().mean()
                bar.set_postfix(
                    loss=float(loss),
                    acc=float(acc)
                )
        # evaluate accuracy at end of each epoch
        model.eval()
        y_pred = model(X_val)
        acc = (y_pred.round() == y_val).float().mean()
        acc = float(acc)
        if acc > best_acc:
            best_acc = acc
            best_weights = copy.deepcopy(model.state_dict())
    # restore model and return best accuracy
    model.load_state_dict(best_weights)
    return best_acc

In [77]:
def process_dataset(url, need_dense=True):
    def xy_torchly(x, y):
        x_torch = torch.tensor(x.todense() if need_dense else x, dtype=torch.float32)
        y_torch = torch.tensor(y, dtype=torch.float32).reshape(-1, 1)
        return x_torch, y_torch
    x_train, y_train, x_test, y_test, _, test_data = preprocess_data(url)
    x_train, y_train = xy_torchly(x_train, y_train)
    x_test, y_test = xy_torchly(x_test, y_test)
    return x_train, y_train, x_test, y_test, test_data

def init_saveplace():
    current = datetime.now().strftime("%d-%m-%y%H:%M:%S")
    route = os.path.join('.', 'runs', f'nn-run-{current}')
    os.makedirs(route, exist_ok=True)
    return route

def train_and_save(x_train, y_train, x_test, y_test, saveplace, topic):
    # Generate model
    vec_size = x_train.shape[1]
    model = DeepText(vec_size)

    _ = model_train(model, x_train, y_train, x_test, y_test)

    torch.save(model.state_dict(), os.path.join(saveplace, f'model_{topic}.pt'))
    return model

def test_and_save(model, x_test, test_data, saveplace, topic):
    test_data['prediction'] = np.array([model(x).detach().numpy() for x in x_test])
    test_data['pred_bias'] = test_data['prediction'].round().astype(int)

    test_data.to_csv(os.path.join(saveplace, f'test_data_{topic}.csv'), index=False)
    return test_data

In [78]:
def conf_mat(url, data):
    confusion_matrix = pd.DataFrame(0, [url], ['True Left', 'False Left', 'True Right', 'False Right'])
    for prediction, actual, tag in [[0,0,'True Left'], [0,1,'False Left'], [1,0,'False Right'], [1,1,'True Right']]:
        confusion_matrix[tag] = \
            data[(data['pred_bias'] == prediction) & (data['bias'] == actual)].count()['pred_bias']
    return confusion_matrix

In [79]:
scores = pd.DataFrame(None, urls, ['F1 Macro', 'F1 Micro', 'Precision', 'Recall'])
cms = pd.DataFrame(None, [], ['True Left', 'False Left', 'True Right', 'False Right'])

saveplace = init_saveplace()

for url in urls:
    topic = url.split('/')[-1]
    print(f'Dataset: {url} (about {topic})')
    print(f'Processing dataset')
    x_train, y_train, x_test, y_test, test_data = process_dataset(url, need_dense=False)

    print(f'Training model')
    model = train_and_save(x_train, y_train, x_test, y_test, saveplace, topic)
    print(f'Testing model')
    test_data = test_and_save(model, x_test, test_data, saveplace, topic)

    cm = conf_mat(url, test_data)
    print(f'Computing statistics')
    print(f'{url}\n{cm}\n')
    cms = pd.concat([cms, cm], axis=0)

    true_bias, pred_bias = test_data['bias'], test_data['pred_bias']
    scores['F1 Macro'][url] = f1_score(true_bias, pred_bias, average='macro')
    scores['F1 Micro'][url] = f1_score(true_bias, pred_bias, average='micro')
    scores['Precision'][url] = average_precision_score(true_bias, pred_bias)
    scores['Recall'][url] = recall_score(true_bias, pred_bias)

combined_info = pd.concat([scores, cms], axis=1)

combined_info.to_csv(os.path.join(saveplace, 'nn_tsvd_stats.csv'))

Dataset: data/elections (about elections)
Processing dataset
Training model


Epoch 0: 100%|██████████| 83/83 [00:00<00:00, 95.25batch/s, acc=1, loss=0.665]   
Epoch 1: 100%|██████████| 83/83 [00:00<00:00, 108.20batch/s, acc=1, loss=0.652]  
Epoch 2: 100%|██████████| 83/83 [00:01<00:00, 42.76batch/s, acc=1, loss=0.63]    
Epoch 3: 100%|██████████| 83/83 [00:02<00:00, 31.44batch/s, acc=1, loss=0.598]   
Epoch 4: 100%|██████████| 83/83 [00:01<00:00, 58.70batch/s, acc=1, loss=0.556]   
Epoch 5: 100%|██████████| 83/83 [00:01<00:00, 78.74batch/s, acc=1, loss=0.475]   
Epoch 6: 100%|██████████| 83/83 [00:01<00:00, 58.57batch/s, acc=1, loss=0.314]   
Epoch 7: 100%|██████████| 83/83 [00:01<00:00, 63.57batch/s, acc=1, loss=0.131]   
Epoch 8: 100%|██████████| 83/83 [00:01<00:00, 77.28batch/s, acc=1, loss=0.0394]  
Epoch 9: 100%|██████████| 83/83 [00:01<00:00, 72.05batch/s, acc=1, loss=0.0128] 
Epoch 10: 100%|██████████| 83/83 [00:01<00:00, 75.73batch/s, acc=1, loss=0.00521] 
Epoch 11: 100%|██████████| 83/83 [00:01<00:00, 48.81batch/s, acc=1, loss=0.0026]  
Epoch 12: 100%|

Testing model
Computing statistics
data/elections
                True Left  False Left  True Right  False Right
data/elections       1462         582         894          473

Dataset: data/politics (about politics)
Processing dataset
Training model


Epoch 0: 100%|██████████| 41/41 [00:00<00:00, 64.01batch/s, acc=0.333, loss=0.694]
Epoch 1: 100%|██████████| 41/41 [00:01<00:00, 38.60batch/s, acc=0.667, loss=0.692]
Epoch 2: 100%|██████████| 41/41 [00:00<00:00, 60.13batch/s, acc=0.667, loss=0.689]
Epoch 3: 100%|██████████| 41/41 [00:00<00:00, 46.17batch/s, acc=0.667, loss=0.686]
Epoch 4: 100%|██████████| 41/41 [00:01<00:00, 22.55batch/s, acc=0.667, loss=0.682]
Epoch 5: 100%|██████████| 41/41 [00:01<00:00, 34.42batch/s, acc=0.667, loss=0.677]
Epoch 6: 100%|██████████| 41/41 [00:01<00:00, 32.89batch/s, acc=0.667, loss=0.669]
Epoch 7: 100%|██████████| 41/41 [00:01<00:00, 30.73batch/s, acc=0.667, loss=0.658]
Epoch 8: 100%|██████████| 41/41 [00:01<00:00, 40.12batch/s, acc=1, loss=0.64]    
Epoch 9: 100%|██████████| 41/41 [00:01<00:00, 40.67batch/s, acc=1, loss=0.61]   
Epoch 10: 100%|██████████| 41/41 [00:01<00:00, 37.17batch/s, acc=1, loss=0.565]  
Epoch 11: 100%|██████████| 41/41 [00:00<00:00, 44.72batch/s, acc=1, loss=0.504]  
Epoch 12:

Testing model
Computing statistics
data/politics
               True Left  False Left  True Right  False Right
data/politics        459         240         614          305

Dataset: data/white_house (about white_house)
Processing dataset
Training model


Epoch 0: 100%|██████████| 27/27 [00:00<00:00, 52.66batch/s, acc=0.667, loss=0.68]
Epoch 1: 100%|██████████| 27/27 [00:01<00:00, 14.07batch/s, acc=0.667, loss=0.681]
Epoch 2: 100%|██████████| 27/27 [00:01<00:00, 19.32batch/s, acc=0.667, loss=0.681]
Epoch 3: 100%|██████████| 27/27 [00:01<00:00, 23.45batch/s, acc=0.667, loss=0.682]
Epoch 4: 100%|██████████| 27/27 [00:03<00:00,  7.53batch/s, acc=0.667, loss=0.682]
Epoch 5: 100%|██████████| 27/27 [00:03<00:00,  8.39batch/s, acc=0.667, loss=0.683]
Epoch 6: 100%|██████████| 27/27 [00:02<00:00, 11.26batch/s, acc=0.667, loss=0.683]
Epoch 7: 100%|██████████| 27/27 [00:00<00:00, 58.04batch/s, acc=0.667, loss=0.683]
Epoch 8: 100%|██████████| 27/27 [00:00<00:00, 63.98batch/s, acc=0.667, loss=0.682]
Epoch 9: 100%|██████████| 27/27 [00:00<00:00, 31.52batch/s, acc=0.667, loss=0.681]
Epoch 10: 100%|██████████| 27/27 [00:02<00:00,  9.61batch/s, acc=0.889, loss=0.679]
Epoch 11: 100%|██████████| 27/27 [00:01<00:00, 22.52batch/s, acc=1, loss=0.674]  
Epoch

Testing model
Computing statistics
data/white_house
                  True Left  False Left  True Right  False Right
data/white_house        334         205         364          179

Dataset: data/immigration (about immigration)
Processing dataset
Training model


Epoch 0: 100%|██████████| 23/23 [00:00<00:00, 24.98batch/s, acc=1, loss=0.672]  
Epoch 1: 100%|██████████| 23/23 [00:02<00:00, 10.15batch/s, acc=1, loss=0.666]  
Epoch 2: 100%|██████████| 23/23 [00:00<00:00, 57.79batch/s, acc=1, loss=0.661]   
Epoch 3: 100%|██████████| 23/23 [00:01<00:00, 19.87batch/s, acc=1, loss=0.654]  
Epoch 4: 100%|██████████| 23/23 [00:00<00:00, 58.62batch/s, acc=1, loss=0.648]  
Epoch 5: 100%|██████████| 23/23 [00:00<00:00, 27.90batch/s, acc=1, loss=0.641]  
Epoch 6: 100%|██████████| 23/23 [00:01<00:00, 14.06batch/s, acc=1, loss=0.633]  
Epoch 7: 100%|██████████| 23/23 [00:00<00:00, 37.77batch/s, acc=1, loss=0.625]  
Epoch 8: 100%|██████████| 23/23 [00:00<00:00, 51.83batch/s, acc=1, loss=0.616]  
Epoch 9: 100%|██████████| 23/23 [00:00<00:00, 52.05batch/s, acc=1, loss=0.606]  
Epoch 10: 100%|██████████| 23/23 [00:01<00:00, 22.42batch/s, acc=1, loss=0.594]  
Epoch 11: 100%|██████████| 23/23 [00:00<00:00, 51.80batch/s, acc=1, loss=0.581]  
Epoch 12: 100%|██████████

Testing model
Computing statistics
data/immigration
                  True Left  False Left  True Right  False Right
data/immigration        184          67         457          207

Dataset: data/healthcare (about healthcare)
Processing dataset
Training model


Epoch 0: 100%|██████████| 22/22 [00:00<00:00, 75.17batch/s, acc=0.667, loss=0.692] 
Epoch 1: 100%|██████████| 22/22 [00:00<00:00, 68.17batch/s, acc=0.333, loss=0.693]
Epoch 2: 100%|██████████| 22/22 [00:00<00:00, 53.20batch/s, acc=0.333, loss=0.694]
Epoch 3: 100%|██████████| 22/22 [00:00<00:00, 80.83batch/s, acc=0.333, loss=0.694] 
Epoch 4: 100%|██████████| 22/22 [00:00<00:00, 62.90batch/s, acc=0.333, loss=0.694]
Epoch 5: 100%|██████████| 22/22 [00:00<00:00, 81.67batch/s, acc=0.333, loss=0.694]
Epoch 6: 100%|██████████| 22/22 [00:00<00:00, 89.51batch/s, acc=0.333, loss=0.694]
Epoch 7: 100%|██████████| 22/22 [00:00<00:00, 55.49batch/s, acc=0.333, loss=0.692]
Epoch 8: 100%|██████████| 22/22 [00:00<00:00, 63.36batch/s, acc=0.333, loss=0.691]
Epoch 9: 100%|██████████| 22/22 [00:00<00:00, 73.28batch/s, acc=0.333, loss=0.688]
Epoch 10: 100%|██████████| 22/22 [00:00<00:00, 79.72batch/s, acc=0.333, loss=0.683]
Epoch 11: 100%|██████████| 22/22 [00:00<00:00, 74.57batch/s, acc=0.333, loss=0.676]


Testing model
Computing statistics
data/healthcare
                 True Left  False Left  True Right  False Right
data/healthcare        182          62         419          240

