# GBV Article Binary Classification

**Acknowledgement**: The LSTM code in this notebook was adapted from the following two tutorials
1. https://www.kaggle.com/swarnabha/pytorch-text-classification-torchtext-lstm#Sentence-Classification 
2. https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/

In [125]:
!python -m spacy download es_core_news_sm

Collecting es_core_news_sm==2.2.5
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-2.2.5/es_core_news_sm-2.2.5.tar.gz (16.2MB)
[K     |████████████████████████████████| 16.2MB 7.3MB/s 
Building wheels for collected packages: es-core-news-sm
  Building wheel for es-core-news-sm (setup.py) ... [?25l[?25hdone
  Created wheel for es-core-news-sm: filename=es_core_news_sm-2.2.5-cp37-none-any.whl size=16172936 sha256=58f2a42e26cf3c4fce16a17682c919d7ca8131387ca42e04007e2d895f070f02
  Stored in directory: /tmp/pip-ephem-wheel-cache-7vljsf1y/wheels/05/4f/66/9d0c806f86de08e8645d67996798c49e1512f9c3a250d74242
Successfully built es-core-news-sm
Installing collected packages: es-core-news-sm
Successfully installed es-core-news-sm-2.2.5
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('es_core_news_sm')


In [1]:
import pandas as pd
import re
import time
import random
import numpy as np

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torchtext.legacy import data
import torchtext
from torchtext.vocab import Vectors
import spacy 

In [2]:
!pip install altair
import altair as alt



In [3]:
def altair_theme():
    '''
    Helper function to set up the theme for altair (to create plots)
    '''
    font = "Playfair Display, serif"
    labelFont = "Raleway, sans-serif"
    fontColor = '#3c3f42'
    axisColor = "#9E9EA3"
    gridColor = "#D1D4D6"  
    main_palette = ["#06063c","#c5741d","#902727","#265886","#dab312","#523378","#3e8c7e","#5f8540"]
    return {
        "config": {
            "title": {
                "fontSize": 20,
                "font": font,
                "anchor": "start", 
                "color": fontColor,
                "fontWeight": 600,
                "offset": 20,
            },
            "axisX": {
                "domain": True,
                "domainColor": axisColor,
                "domainWidth": 1,
                "grid": False,
                "labelFont": labelFont,
                "labelFontSize": 12,
                "labelPadding": 10,
                "labelColor": fontColor,
                "labelFontWeight": 100,                
                "tickColor": axisColor,
                "tickSize": 5, 
                "titleFont": font,
                "titleFontSize": 16,
                "titleFontWeight": 100,
                "titlePadding": 10, 
                "titleColor": fontColor,
            },
            "axisY": {
                "domain": False,
                "grid": True,
                "gridColor": gridColor,
                "gridWidth": 1,
                "labelFont": labelFont,
                "labelFontSize": 11,
                "labelPadding": 10,
                "labelColor": fontColor,
                "labelFontWeight": 100,                
                "ticks": False, 
                "titleFont": font,
                "titleFontSize": 14,
                "titleFontWeight": 600,
                "titleColor": fontColor,
                "titleY": -15, 
                "titleX": -10, 
                "titlePadding": 10, 
                "titleAngle": 0, 
            },
            "legend": {
                "labelFont": labelFont,
                "labelFontSize": 12,
                "labelColor": fontColor,
                "labelFontWeight": 100,                
                "symbolType": "square", 
                "titleFont": labelFont,
                "titleFontSize": 12,
                "title": "", 
                # "orient": "top-right", 
            },
            "view": {
                "strokeWidth": 0

            }
        }
    }

In [5]:
alt.themes.register("altair_theme", altair_theme)
alt.themes.enable("altair_theme")

ThemeRegistry.enable('altair_theme')

## 1. Set up

In [36]:
USE_CUDA = torch.cuda.is_available()

if USE_CUDA:
    DEVICE = torch.device('cuda')
    print("Using cuda.")
else:
    DEVICE = torch.device('cpu')
    print("Using cpu.")

from google.colab import drive 
drive.mount('/content/gdrive')
PATH = "gdrive/MyDrive/"

random.seed(30255)
np.random.seed(30255)
torch.manual_seed(30255)
if USE_CUDA:
    torch.cuda.manual_seed(30255)

# Change the following to false when training on the full set
#DEVELOPING = True    
DEVELOPING = False

country_paths = {"MEXICO": ["mexico_el_universal_scraped_lab", "mexico_heraldo_scraped_labelled", "mexico_la_jornada_scraped_label"],
                 "PAKISTAN": ["pakistan_dawn_labelled300","pakistan_news_labelled300", "pakistan_nation_labelled300"],
                 "UK": ["uk_sun_scraped_cleaned_labelled", "uk_times_scraped_cleaned_labell", "uk_guardian_scraped_cleaned_lab"]}

# specify the country to build the model for
COUNTRY = "PAKISTAN"

if DEVELOPING:
    print("small test dataset")
    df = pd.read_csv(PATH + 'labelling_data - uk_guardian_scraped_cleaned_labelled300.csv')[['webTitle', 'GBV']]
    df.columns = ['text', 'target']
else:
    data_xlsx = pd.ExcelFile(PATH + 'labelling_data.xlsx')
    list_df = []
    for sheet_name in country_paths[COUNTRY]:
        columns = ['webTitle', 'GBV'] if sheet_name == "uk_guardian_scraped_cleaned_lab" else ["title", "GBV"]
        sdf = pd.read_excel(data_xlsx, sheet_name=sheet_name)[columns]
        sdf.columns = ['text', 'target']
        list_df.append(sdf)
    df = pd.concat(list_df)

Using cpu.
Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [37]:
df.head()

Unnamed: 0,text,target
0,Government College University VC in Lahore ord...,1.0
1,16 days of activism drive launched against gen...,1.0
2,"Another Zainab': 2-year-old girl assaulted, to...",1.0
3,Kasur policeman arrested for allegedly killing...,1.0
4,‚ÄòHonour‚Äô in shame,1.0


## 2. Set up data

### 2.1 Cleaning the data

In [38]:
def normalise_text(text):
    ''' 
    Normalise and clean text (remove html tags, set to lower case etc)
    inputs:
        text: str, the text to be cleaned
    returns: str, the cleaned and normalised text
    '''
    text = text.str.lower() # lowercase
    text = text.str.replace(r"\#","") # replaces hashtags
    text = text.str.replace(r"http\S+","URL")  # remove URL addresses
    text = text.str.replace(r"@","")
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ") # strip out punctuation
    text = text.str.replace("\s{2,}", " ")
    text = text.apply(lambda x: re.sub('<[^<]+?>', '', x)) # remove html tags
    return text

In [39]:
df.dropna(inplace=True)

df['text'] = normalise_text(df['text'])    

### 2.2 Split df into train, test and validation sets

In [40]:
train_df, test_df = train_test_split(df, test_size=0.15)
train_df, vali_df = train_test_split(train_df, test_size=0.2)

In [41]:
print("training dataset shape: ", train_df.shape)
print("validation dataset shape: ", vali_df.shape)
print("test dataset shape: ", test_df.shape)

training dataset shape:  (466, 2)
validation dataset shape:  (117, 2)
test dataset shape:  (103, 2)


### 2.3 Create text and data iterators

In [42]:
SEED = 42

torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [43]:
if COUNTRY == 'MEXICO':
    tokenizer_language = 'es_core_news_sm'
else:
    tokenizer_language = 'en_core_web_sm'

TEXT = data.Field(tokenize = 'spacy', tokenizer_language=tokenizer_language, include_lengths=True, batch_first=True)
LABEL = data.LabelField(dtype = torch.float)

In [44]:
# source : https://gist.github.com/lextoumbourou/8f90313cbc3598ffbabeeaa1741a11c8
# to use DataFrame as a Data source

class DataFrameDataset(data.Dataset):

    def __init__(self, df, fields, is_test=False, **kwargs):
        examples = []
        for i, row in df.iterrows():
            label = row.target if not is_test else None
            text = row.text
            examples.append(data.Example.fromlist([text, label], fields))

        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    @classmethod
    def splits(cls, fields, train_df, val_df=None, test_df=None, **kwargs):
        train_data, val_data, test_data = (None, None, None)
        data_field = fields

        if train_df is not None:
            train_data = cls(train_df.copy(), data_field, **kwargs)
        if val_df is not None:
            val_data = cls(val_df.copy(), data_field, **kwargs)
        if test_df is not None:
            test_data = cls(test_df.copy(), data_field, True, **kwargs)

        return tuple(d for d in (train_data, val_data, test_data) if d is not None)



In [45]:
fields = [('text',TEXT), ('label',LABEL)]

train_ds, val_ds = DataFrameDataset.splits(fields, train_df=train_df, val_df=vali_df)

In [46]:
print("Example of data: ", vars(train_ds[15]))

Example of data:  {'text': ['boy', 'dies', 'six', 'others', 'burnt', 'in', 'gas', 'explosion'], 'label': 0.0}


In [47]:
MAX_VOCAB_SIZE = 25000

if COUNTRY == 'MEXICO':
    # trained_vectors = Vectors('cc.es.300.bin')
    TEXT.build_vocab(train_ds, 
                    max_size = MAX_VOCAB_SIZE)
else:
    TEXT.build_vocab(train_ds, 
                    max_size = MAX_VOCAB_SIZE, 
                    vectors = 'glove.6B.200d',
                    unk_init = torch.Tensor.zero_)


In [48]:
LABEL.build_vocab(train_ds)

In [49]:
BATCH_SIZE = 128

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator = data.BucketIterator.splits(
    (train_ds, val_ds), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

## 3. Set up model

### 3.1 Model parameters

In [50]:
num_epochs = 25
learning_rate = 0.001

INPUT_DIM = len(TEXT.vocab)
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
DROPOUT = 0.2
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]

### 3.2 Define model

In [51]:
class LSTM_net(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, 
                 bidirectional, dropout, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.lstm = nn.LSTM(embedding_dim, 
                           hidden_dim, 
                           num_layers=n_layers, 
                           bidirectional=bidirectional, 
                           dropout=dropout,
                           batch_first=True)
        
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text, text_lengths):              
        embedded = self.embedding(text)
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, text_lengths, batch_first=True)
        packed_output, (hidden, cell) = self.lstm(packed_embedded)
        hidden = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim = 1))
        dense_outputs = self.fc1(hidden)
        outputs = self.dropout(self.fc2(dense_outputs))            
        return outputs

In [52]:
model = LSTM_net(INPUT_DIM, 
            EMBEDDING_DIM, 
            HIDDEN_DIM, 
            OUTPUT_DIM, 
            N_LAYERS, 
            BIDIRECTIONAL, 
            DROPOUT, 
            PAD_IDX)


### 3.3 Initialise embedding layer with GloVe embeddings (for UK and Pakistan models only)

In [53]:
pretrained_embeddings = TEXT.vocab.vectors

In [54]:
model.embedding.weight.data.copy_(pretrained_embeddings)

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [-0.1027,  0.3041, -0.1358,  ...,  0.3695,  0.1904, -0.1227],
        ...,
        [-0.2565,  0.7228, -0.5316,  ..., -0.4012, -0.8587, -0.0942],
        [ 0.0945, -0.1654, -0.8128,  ..., -0.3747, -0.0331,  0.6317],
        [-0.0660, -0.0530, -0.3336,  ..., -0.6483, -0.4552,  0.1592]])

In [55]:
model.embedding.weight.data[PAD_IDX] = torch.zeros(EMBEDDING_DIM)

In [56]:
model.to(device)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

criterion = criterion.to(device)

## 4. Train model

### 4.1 Define model training and evaluation functions

In [57]:
def binary_accuracy(preds, targets):
    """
    Returns accuracy per batch, for example if the model predicts 8/10 right, this returns 0.8
    inputs:
        preds: list of floats, the predictions for the batch from the model
        targets: list of floats, the targets (pre-labels) for the batch
    returns: float, the proportion of correct labels from the model
    """
    # get binary predictions:
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == targets).float() 
    acc = correct.sum() / len(correct)
    return acc

In [58]:
def train_an_epoch(model, iterator):
    '''
    Trains the model for one epoc
    inputs: 
        model: the instance of the model
        iterator: the train data iterator
    returns:
        training loss, float
        training accuracy, float
    '''
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    for batch in iterator:
        text, text_lengths = batch.text
        optimizer.zero_grad()
        predictions = model(text, text_lengths).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)

        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [59]:
def evaluate(model, iterator):
    '''
    Evaluate the model on the validation dataset
    inputs:
        model: the instance of the LSTM model
        iterator: the validation data iterator
    returns: validation accuracy, float
    '''
    epoch_acc = 0
    model.eval()
    with torch.no_grad():
        for batch in iterator:
            text, text_lengths = batch.text
            predictions = model(text, text_lengths).squeeze()
            acc = binary_accuracy(predictions, batch.label)
            epoch_acc += acc.item()
    return epoch_acc / len(iterator)


### 4.2. Train the model

In [60]:
loss=[]
acc=[]
val_accs=[]

for epoch in range(num_epochs):
    
    train_loss, train_acc = train_an_epoch(model, train_iterator)
    valid_acc = evaluate(model, valid_iterator)
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Acc: {valid_acc*100:.2f}%')
    
    loss.append(train_loss)
    acc.append(train_acc)
    val_accs.append(valid_acc)

    if valid_acc >= max(val_accs):
        torch.save(model.state_dict(), PATH + COUNTRY + 'saved_weights.pt')

	Train Loss: 0.675 | Train Acc: 60.34%
	 Val. Acc: 65.81%
	Train Loss: 0.637 | Train Acc: 66.94%
	 Val. Acc: 65.81%
	Train Loss: 0.586 | Train Acc: 68.00%
	 Val. Acc: 66.67%
	Train Loss: 0.505 | Train Acc: 71.93%
	 Val. Acc: 82.05%
	Train Loss: 0.524 | Train Acc: 76.84%
	 Val. Acc: 83.76%
	Train Loss: 0.429 | Train Acc: 82.45%
	 Val. Acc: 86.32%
	Train Loss: 0.391 | Train Acc: 83.71%
	 Val. Acc: 84.62%
	Train Loss: 0.366 | Train Acc: 84.60%
	 Val. Acc: 84.62%
	Train Loss: 0.295 | Train Acc: 87.05%
	 Val. Acc: 85.47%
	Train Loss: 0.286 | Train Acc: 90.03%
	 Val. Acc: 83.76%
	Train Loss: 0.249 | Train Acc: 88.72%
	 Val. Acc: 84.62%
	Train Loss: 0.215 | Train Acc: 88.64%
	 Val. Acc: 84.62%
	Train Loss: 0.163 | Train Acc: 89.83%
	 Val. Acc: 85.47%
	Train Loss: 0.095 | Train Acc: 91.87%
	 Val. Acc: 84.62%
	Train Loss: 0.118 | Train Acc: 92.98%
	 Val. Acc: 84.62%
	Train Loss: -0.051 | Train Acc: 93.29%
	 Val. Acc: 83.76%
	Train Loss: -0.010 | Train Acc: 90.37%
	 Val. Acc: 82.05%
	Train Loss:

In [61]:
plot_df = pd.DataFrame({'Validation Accuracy': val_accs, 'EPOC': range(1, num_epochs+1)})

alt.Chart(plot_df).mark_line().encode(
    x='EPOC:Q',
    y=alt.Y('Validation Accuracy', scale=alt.Scale(domain=(0, 1)))
).properties(width=250, height=350)

## 5. Evaluate the best model on the test dataset

In [62]:
model_saved ='saved_weights.pt'
model.load_state_dict(torch.load(PATH + COUNTRY + 'saved_weights.pt'));
model.eval();

if COUNTRY == 'MEXICO':
    nlp = spacy.load("es_core_news_sm")
else:
    nlp = spacy.load("en_core_web_sm")

def predict(model, sentence):
    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]
    indexed = [TEXT.vocab.stoi[t] for t in tokenized]        
    length = [len(indexed)]                                    
    tensor = torch.LongTensor(indexed).to(device)             
    tensor = tensor.unsqueeze(1).T                            
    length_tensor = torch.LongTensor(length) 
    prediction = model(tensor, length_tensor)
    binary_pred = torch.round(torch.sigmoid(prediction)).item()
    return abs(binary_pred-1)

In [63]:
num_right = 0
true_positives = 0
false_positives = 0
false_negatives = 0

for i, t in test_df.iterrows():
    pred = predict(model, t['text'])
    if pred == t['target']:
        num_right += 1
    if (pred == 1) and (t['target'] == 1):  
        true_positives += 1
    if (pred == 1) and (t['target'] == 0):
        false_positives += 1
    if (pred == 0) and (t['target'] == 1):
        false_negatives += 1

f1_score = true_positives / (true_positives + 0.5*(false_positives + false_negatives))

print("{} best model: \n percent of test set labelled correctly: {} \n F1 score: {}".format(COUNTRY, (num_right / test_df.shape[0])*100, f1_score))


PAKISTAN best model: 
 percent of test set labelled correctly: 82.52427184466019 
 F1 score: 0.875
