In [2]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import torch
from torchtext.data import Field, TabularDataset, BucketIterator
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
import torch.optim as optim
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

import pandas as pd
import random
import os
from urllib import request
import ssl
# NOTE(review): this turns off TLS certificate verification for every HTTPS
# request made through the default context in this process, and the file
# fetched below is then imported (i.e. executed). Prefer fixing the local
# certificate store and removing this line.
ssl._create_default_https_context = ssl._create_unverified_context
module_url = "https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py"
module_name = module_url.split('/')[-1]
print(f'Fetching {module_url}')
# Copy the response bytes verbatim: writing in binary mode avoids re-encoding
# the source with the platform's default text encoding and newline translation.
with request.urlopen(module_url) as f, open(module_name, 'wb') as outf:
    outf.write(f.read())
# This import must stay below the download: the module only exists on disk
# once the block above has run.
from dont_patronize_me import DontPatronizeMe
# Initialize a dpm (Don't Patronize Me) object.
# It takes two arguments as input:
# (1) Path to the directory containing the training set files, which is the root directory of this notebook.
# (2) Path to the test set, which will be released when the evaluation phase begins. In this example,
# we use the dataset for Subtask 1, which the code will load without labels.
dpm = DontPatronizeMe('./dontpatronizeme_v1.4/', 'dontpatronizeme_pcl.tsv')
# This method loads the subtask 1 data
dpm.load_task1()

Fetching https://raw.githubusercontent.com/Perez-AlmendrosC/dontpatronizeme/master/semeval-2022/dont_patronize_me.py


In [3]:
# Take the Subtask 1 training frame exposed by the loader object and show it
# as the cell output.
df = dpm.train_task1_df
df

Unnamed: 0,par_id,art_id,keyword,country,text,label,orig_label
0,1,@@24942188,hopeless,ph,"we 're living in times of absolute insanity , ...",0,0
1,2,@@21968160,migrant,gh,"in libya today , there are countless number of...",0,0
2,3,@@16584954,immigrant,ie,"""white house press secretary sean spicer said ...",0,0
3,4,@@7811231,disabled,nz,council customers only signs would be displaye...,0,0
4,5,@@1494111,refugee,ca,""""""" just like we received migrants fleeing el ...",0,0
...,...,...,...,...,...,...,...
10464,10465,@@14297363,women,lk,"""sri lankan norms and culture inhibit women fr...",0,1
10465,10466,@@70091353,vulnerable,ph,he added that the afp will continue to bank on...,0,0
10466,10467,@@20282330,in-need,ng,""""""" she has one huge platform , and informatio...",1,3
10467,10468,@@16753236,hopeless,in,""""""" anja ringgren loven i ca n't find a word t...",1,4


In [4]:
# Keep only the three columns used downstream, then separate the rows by
# their binary label (0 vs. 1).
df_raw = df.loc[:, ['label', 'keyword', 'text']]
df_nonpcl = df_raw.loc[df_raw['label'].eq(0)]
df_pcl = df_raw.loc[df_raw['label'].eq(1)]

In [5]:
# Rebalance the two classes: keep a random half of the label-0 rows and
# include every label-1 row twice.
# Both frames are derived from df_raw (not from themselves) so re-running this
# cell always yields the same result instead of halving/doubling again.
# random_state pins the sample; pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
df_nonpcl = df_raw[df_raw['label'] == 0].sample(frac=.5, random_state=1)
df_pcl = pd.concat([df_raw[df_raw['label'] == 1]] * 2)
# NOTE(review): the label-1 rows are duplicated before the train/valid/test
# split, so copies of the same row can land in different splits.


In [6]:
# Fractions handed to train_test_split as `train_size`.
# NOTE(review): train_test_ratio is used as the *train* fraction below, so only
# 10% of the rows go to train+valid and the remaining 90% become the test set.
# Confirm this is intended.
train_test_ratio = 0.10
train_valid_ratio = 0.80

# Train-test split (done per label so each split keeps both classes)
df_nonpcl_full_train, df_nonpcl_test = train_test_split(df_nonpcl, train_size = train_test_ratio, random_state = 1)
df_pcl_full_train, df_pcl_test = train_test_split(df_pcl, train_size = train_test_ratio, random_state = 1)

# Train-valid split
df_nonpcl_train, df_nonpcl_valid = train_test_split(df_nonpcl_full_train, train_size = train_valid_ratio, random_state = 1)
df_pcl_train, df_pcl_valid = train_test_split(df_pcl_full_train, train_size = train_valid_ratio, random_state = 1)

# Concatenate splits of different labels
df_train = pd.concat([df_nonpcl_train, df_pcl_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_nonpcl_valid, df_pcl_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_nonpcl_test, df_pcl_test], ignore_index=True, sort=False)

# Write preprocessed data
# to_csv does not create missing directories, so make sure the target exists
# when the notebook runs on a fresh checkout.
os.makedirs('./data', exist_ok=True)
df_train.to_csv('./data/train.csv', index=False)
df_valid.to_csv('./data/valid.csv', index=False)
df_test.to_csv('./data/test.csv', index=False)

In [7]:
# !pip install -U torchtext==0.8.0


In [8]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [9]:
# !python3 -m spacy download en

In [10]:
# Directory holding the train/valid/test CSV files read by TabularDataset.splits.
source_folder = "./data/"

In [11]:
# Fields: the label column is read as a float without a vocabulary; the two
# text columns share one field that lower-cases, tokenizes with spaCy and
# returns sequence lengths alongside the token ids.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
text_field = Field(tokenize='spacy', lower=True, include_lengths=True, batch_first=True)
fields = [('label', label_field), ('keyword', text_field), ('text', text_field)]

# TabularDataset: load the three CSV splits, skipping the header row.
# NOTE(review): the name `train` is later rebound by the `train` function
# defined further down in this notebook.
train, valid, test = TabularDataset.splits(
    path=source_folder,
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)

# Iterators: one BucketIterator per split, all configured identically and
# keyed on the token count of the `text` column.
train_iter, valid_iter, test_iter = (
    BucketIterator(split, batch_size=32, sort_key=lambda x: len(x.text),
                   device=device, sort=True, sort_within_batch=True)
    for split in (train, valid, test)
)

# Vocabulary: built from the training split only; tokens seen fewer than
# three times are left out.
text_field.build_vocab(train, min_freq=3)



In [12]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM that scores a token-id sequence.

    The forward pass embeds the tokens, runs the packed sequences through the
    LSTM, concatenates the forward direction's state at each sequence's last
    real token with the backward direction's state at position 0, applies
    dropout and a linear layer, and squashes the result with a sigmoid.

    Args:
        dimension: hidden size of each LSTM direction.
        vocab_size: number of rows in the embedding table. When None, the size
            of the notebook-level ``text_field.vocab`` is used.
        embedding_dim: width of the token embeddings (and LSTM input size).
    """

    def __init__(self, dimension=128, vocab_size=None, embedding_dim=300):
        super().__init__()

        if vocab_size is None:
            # Default to the vocabulary built elsewhere in the notebook.
            vocab_size = len(text_field.vocab)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.5)

        self.fc = nn.Linear(2*dimension, 1)

    def forward(self, text, text_len):
        """Return one sigmoid score per sequence in the batch.

        Args:
            text: integer tensor of token ids, batch first (batch, seq).
            text_len: integer tensor with the unpadded length of each row.

        Returns:
            1-D float tensor of length ``batch`` with values produced by a
            sigmoid.
        """
        text_emb = self.embedding(text)

        # pack_padded_sequence requires the lengths to live on the CPU, even
        # when the embeddings are on a GPU.
        packed_input = pack_padded_sequence(text_emb, text_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: hidden state at each sequence's last real token.
        batch_idx = torch.arange(output.size(0), device=output.device)
        last_idx = (text_len - 1).to(output.device)
        out_forward = output[batch_idx, last_idx, :self.dimension]
        # Backward direction: its final state sits at time step 0.
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

In [13]:
# Save and Load Functions

def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Write model and optimizer state plus the validation loss to `save_path`.

    Args:
        save_path: destination file; when None nothing is written.
        model: object exposing ``state_dict()``.
        optimizer: object exposing ``state_dict()``.
        valid_loss: value stored under the ``'valid_loss'`` key.

    Returns:
        None.
    """
    if save_path is None:
        return

    # torch.save does not create missing directories, so make sure the parent
    # of a path-like destination exists first.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        load_path: checkpoint file to read; when None nothing is loaded.
        model: object whose ``load_state_dict`` receives ``'model_state_dict'``.
        optimizer: object whose ``load_state_dict`` receives
            ``'optimizer_state_dict'``.

    Returns:
        The ``'valid_loss'`` value stored in the checkpoint, or None when
        `load_path` is None.
    """
    if load_path is None:
        return

    # Tensors are mapped onto the notebook-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Write the recorded loss curves and their step indices to `save_path`.

    Args:
        save_path: destination file; when None nothing is written.
        train_loss_list: stored under ``'train_loss_list'``.
        valid_loss_list: stored under ``'valid_loss_list'``.
        global_steps_list: stored under ``'global_steps_list'``.

    Returns:
        None.
    """
    if save_path is None:
        return

    # torch.save does not create missing directories, so make sure the parent
    # of a path-like destination exists first.
    if isinstance(save_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(save_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # This file holds metrics, not model weights, so the log line says so.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Read back the loss curves written by a metrics file.

    Args:
        load_path: metrics file to read; when None nothing is loaded.

    Returns:
        A ``(train_loss_list, valid_loss_list, global_steps_list)`` tuple, or
        None when `load_path` is None.
    """
    if load_path is None:
        return

    # Any tensors in the file are mapped onto the notebook-level `device`.
    state_dict = torch.load(load_path, map_location=device)
    # This file holds metrics, not model weights, so the log line says so.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

In [14]:
# Directory where the training cell writes model.pt and metrics.pt.
destination_folder = "./data/model_save/"

In [15]:
# Training Function

def _mean_validation_loss(model, criterion, valid_loader):
    """Return the mean per-batch loss of `model` over `valid_loader`.

    The model is put in eval mode for the pass, gradients are not tracked, and
    the model is switched back to train mode before returning.
    """
    model.eval()
    total_loss = 0.0
    with torch.no_grad():
        for (labels, (_keyword, _keyword_len), (text, text_len)), _ in valid_loader:
            labels = labels.to(device)
            text = text.to(device)
            text_len = text_len.to(device)
            output = model(text, text_len)
            total_loss += criterion(output, labels).item()
    model.train()
    return total_loss / len(valid_loader)


def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Train `model`, validating periodically and checkpointing the best state.

    Every `eval_every` optimizer steps the mean training loss since the last
    evaluation and the mean validation loss are recorded and printed. Whenever
    the validation loss drops below `best_valid_loss`, the model/optimizer
    state and the metrics so far are saved under `file_path`. The metrics are
    saved once more after the last epoch.

    Note that the default values are evaluated once, when this function is
    defined, from the notebook-level `train_iter`, `valid_iter` and
    `destination_folder` as they exist at that moment.

    Args:
        model: module called as ``model(text, text_len)``.
        optimizer: optimizer stepping the model's parameters.
        criterion: loss called as ``criterion(output, labels)``.
        train_loader: iterable of training batches.
        valid_loader: sized iterable of validation batches.
        num_epochs: number of passes over `train_loader`.
        eval_every: number of steps between evaluations (values below 1 are
            treated as 1).
        file_path: directory receiving ``model.pt`` and ``metrics.pt``.
        best_valid_loss: validation loss a checkpoint has to beat.

    Returns:
        None.
    """
    # torch.save does not create missing directories.
    if file_path:
        os.makedirs(file_path, exist_ok=True)
    model_path = os.path.join(file_path, 'model.pt')
    metrics_path = os.path.join(file_path, 'metrics.pt')

    # A loader with fewer than two batches makes the default eval_every 0,
    # which would raise ZeroDivisionError in the modulo below.
    eval_every = max(1, eval_every)

    # initialize running values
    running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, (_keyword, _keyword_len), (text, text_len)), _ in train_loader:
            labels = labels.to(device)
            text = text.to(device)
            text_len = text_len.to(device)
            output = model(text, text_len)

            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # only evaluate on every `eval_every`-th step
            if global_step % eval_every != 0:
                continue

            # evaluation
            average_train_loss = running_loss / eval_every
            average_valid_loss = _mean_validation_loss(model, criterion, valid_loader)
            train_loss_list.append(average_train_loss)
            valid_loss_list.append(average_valid_loss)
            global_steps_list.append(global_step)

            # resetting running values
            running_loss = 0.0

            # print progress
            print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                  .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                          average_train_loss, average_valid_loss))

            # checkpoint
            if best_valid_loss > average_valid_loss:
                best_valid_loss = average_valid_loss
                save_checkpoint(model_path, model, optimizer, best_valid_loss)
                save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)

    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')


# Seed PyTorch's RNG before the model is built so that weight initialisation
# and dropout masks are repeatable when this cell is run on a fresh kernel.
torch.manual_seed(1)

model = LSTM().to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

train(model=model, optimizer=optimizer, num_epochs=5)



Epoch [1/5], Step [8/85], Train Loss: 0.5664, Valid Loss: 0.7650
Model saved to ==> ./data/model_save//model.pt
Model saved to ==> ./data/model_save//metrics.pt
Epoch [1/5], Step [16/85], Train Loss: 0.7021, Valid Loss: 0.6963
Model saved to ==> ./data/model_save//model.pt
Model saved to ==> ./data/model_save//metrics.pt
Epoch [2/5], Step [24/85], Train Loss: 0.5171, Valid Loss: 0.6599
Model saved to ==> ./data/model_save//model.pt
Model saved to ==> ./data/model_save//metrics.pt
Epoch [2/5], Step [32/85], Train Loss: 0.6173, Valid Loss: 0.6561
Model saved to ==> ./data/model_save//model.pt
Model saved to ==> ./data/model_save//metrics.pt
Epoch [3/5], Step [40/85], Train Loss: 0.5002, Valid Loss: 0.6505
Model saved to ==> ./data/model_save//model.pt
Model saved to ==> ./data/model_save//metrics.pt
Epoch [3/5], Step [48/85], Train Loss: 0.5545, Valid Loss: 0.6666
Epoch [4/5], Step [56/85], Train Loss: 0.4852, Valid Loss: 0.6432
Model saved to ==> ./data/model_save//model.pt
Model saved 

In [16]:
# Evaluation Function

def evaluate(model, test_loader, version='v11', threshold=0.5):
    """Print the macro-averaged F1 of `model` over `test_loader`.

    Each model output is turned into a 0/1 prediction by comparing it with
    `threshold` (strictly greater than). `version` is accepted but not used
    by this function.
    """
    predictions, targets = [], []

    model.eval()
    with torch.no_grad():
        for (batch_labels, (_keyword, _keyword_len), (batch_text, batch_len)), _ in test_loader:
            batch_labels = batch_labels.to(device)
            batch_text = batch_text.to(device)
            batch_len = batch_len.to(device)
            scores = model(batch_text, batch_len)

            hard_preds = (scores > threshold).int()
            predictions.extend(hard_preds.tolist())
            targets.extend(batch_labels.tolist())

    print('F1:')
    print(f1_score(targets, predictions, average='macro'))
    
    
# Build a fresh model and optimizer to receive the saved state.
# NOTE: this rebinds the notebook-level name `optimizer`.
best_model = LSTM().to(device)
optimizer = optim.Adam(best_model.parameters(), lr=0.001)

# Restore the weights stored in model.pt, then print macro F1 on the test iterator.
load_checkpoint(destination_folder + '/model.pt', best_model, optimizer)
evaluate(best_model, test_iter)

Model loaded from <== ./data/model_save//model.pt
F1:
0.5606925312345181
