<a href="https://colab.research.google.com/github/XinzhouLi/Toxic_Language_Detection_in_Social_Media/blob/main/LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import pandas as pd
import numpy as np
from torch.utils import data
from torch import nn, optim
import torchtext
import re
import sys


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(torch.__version__)
# print(sys.version)
print(torchtext.__version__)
print(torch.version.cuda)

1.8.2
0.9.2
11.1


In [4]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(torch.cuda.is_available())

True


## Data Cleaning

In [5]:
def unify_format(text):
    return re.sub(r"[^a-zA-Z0-9]", " ", text).lower()

In [9]:
# implement dataset extends util.data.Dataset
class ToxicDataset(data.Dataset):
    def __init__(self, *args):
        super().__init__()
        # Read data in
        if len(args) == 1:
            # for initialize training dataset
            filepath = args[0]
            self.dataframe = pd.read_csv(
                filepath, iterator=True, header=0, encoding='utf-8', usecols=['comment_text', 'toxic'])
            # initialize pandas dataframe to store the data
            self.dataframe = pd.DataFrame(self.dataframe.read())
            self.dataframe.convert_dtypes()
            # Clean the data using regular expersion, only reserve letter and number
            self.dataframe['comment_text'] = self.dataframe['comment_text'].apply(unify_format)
            self.dataframe.to_csv('train_cleaned.csv', index=False, header=["text", "label"])
        else:
            # for initialize testing dataset
            filepath1 = args[0]
            filepath2 = args[1]
            test_comments = pd.DataFrame(pd.read_csv(
                filepath1, iterator=True, header=0, usecols=['comment_text', 'id']).read())
            test_label = pd.DataFrame(pd.read_csv(
                filepath2, iterator=True, header=0, usecols=['id', 'toxic']).read())
            # merge two dataframe together by unique key id
            result = pd.merge(test_comments, test_label, how='left', on=['id'])
            # select the text and label col
            self.dataframe = result[result['toxic']>= 0].loc[:, ['comment_text', 'toxic']]
            self.dataframe.convert_dtypes()
            # Clean the data using regular expersion, only reserve letter and number
            self.dataframe['comment_text'] = self.dataframe['comment_text'].apply(unify_format)
            self.dataframe.to_csv('test_cleaned.csv', index=False, header=["text", "label"])
            

    # override the getiem function, return tuple contains comment and label

    def __getitem__(self, index):
        comment, label = self.dataframe.iat[index, 0], self.dataframe.iat[index, 1]
        return (comment, label)

    # Override the len function, return the length of the dataframe
    def __len__(self):
        return len(self.dataframe.loc[:, ['comment_text']])


In [11]:
train_dataset = ToxicDataset("train.csv")
test_dataset = ToxicDataset("test.csv", "test_labels.csv")

# Load Dataset

In [12]:
def get_dataset(csv_data, text_field, label_field, test=False):
    fields = [("id", None), # we won't be needing the id, so we pass in None as the field
                 ("text", text_field), ("label", label_field)]       
    examples = []

    if test:
        for text in csv_data['text']:
            examples.append(torchtext.legacy.data.Example.fromlist([None, text, None], fields))
    else:
        for text, label in zip(csv_data['text'], csv_data['label']):
            examples.append(torchtext.legacy.data.Example.fromlist([None, text, label], fields))
    return examples, fields

In [13]:
train_data = pd.read_csv('train_cleaned.csv')
test_data = pd.read_csv("test_cleaned.csv")


# Initialize Vocab

In [13]:
tokenize = lambda x: x.split()
TEXT = torchtext.legacy.data.Field(sequential=True, tokenize=tokenize, lower=True)
LABEL = torchtext.legacy.data.Field(sequential=False, use_vocab=False)

train_examples, train_fields = get_dataset(train_data, TEXT, LABEL)
test_examples, test_fields = get_dataset(train_data, TEXT, LABEL)

train = torchtext.legacy.data.Dataset(train_examples, train_fields)
test = torchtext.legacy.data.Dataset(test_examples, test_fields)

TEXT.build_vocab(train, max_size=10000, vectors='glove.6B.300d',)
LABEL.build_vocab(train)

# Construct Bi LSTM Model

In [15]:
class BLSTMModel(nn.Module):
    def __init__(self, vocab_size, embeding_dim, hidden_dim):
        super().__init__()
        # implement embedding function to convert text to numerical data
        self.embedding = nn.Embedding(vocab_size, embeding_dim)
        self.rnn = nn.LSTM(embeding_dim, hidden_dim, num_layers =2, bidirectional=True, dropout = 0.5)
        self.fc = nn.Linear(hidden_dim*2,1)
        self.dropout = nn.Dropout(0.5)
    def forward(self, x):

        embediding = self.dropout(self.embedding(x))
        output,(hidden, cell) = self.rnn(embediding)
        hidden = torch.cat([hidden[-2], hidden[-1]],dim=1)
        hidden = self.dropout(hidden)
        out = self.fc(hidden)
        return out


In [16]:
train_dataloader = torchtext.legacy.data.BucketIterator(dataset=train, batch_size=180, shuffle=True)#, sort_within_batch=False, sort_key=lambda x: len(x.text), repeat=False)
test_dataloader = torchtext.legacy.data.BucketIterator(dataset=test, batch_size=180, shuffle=True)

In [17]:
# init Lstm model 
BLSTM = BLSTMModel(len(TEXT.vocab), 300, 256)
# replace the embedding to the pretrained embedding Glove
pretrained = TEXT.vocab.vectors
BLSTM.embedding.weight.data.copy_(pretrained) 

optimizer = optim.Adam(BLSTM.parameters(), lr=1e-3)
criteon = nn.BCEWithLogitsLoss().to(DEVICE)
BLSTM = BLSTM.to(DEVICE)

# Define Training Testing Process

In [83]:


def train(rnn, dataloader, optimizer, criteon):
    
    avg_acc = []
    rnn.train()
    i=0
    for data in dataloader:
        text = data.text
        text = text.to(DEVICE)
        label = data.label
        label = label.to(DEVICE)
        # [seq, b] => [b, 1] => [b]

        pred = rnn(text).squeeze(1)
        # 
        loss = criteon(pred.float(), label.float())
        acc = binary_acc(pred, label).item()
        avg_acc.append(acc)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        if i%500 == 0:
            print(i, acc)
        i +=1
        
    avg_acc = np.array(avg_acc).mean()
    print('avg acc:', avg_acc)		
    
def eval(rnn, iterator, criteon):
    
    avg_acc = []
    TP = 0
    TN = 0
    FP = 0
    FN = 0
    rnn.eval()
    
    with torch.no_grad():
        for data in iterator:
            text = data.text
            text = text.to(DEVICE)
            label = data.label
            label = label.to(DEVICE)
            # [b, 1] => [b]
            pred = rnn(text).squeeze(1)

            #
            loss = criteon(pred.float(), label.float())
            acc_gpu , tp_gpu ,tn_gpu, fp_gpu, fn_gpu= binary_acc(pred, label)

            acc = acc_gpu.item()
            tp = tp_gpu.item()
            tn = tn_gpu.item()
            fp = fp_gpu.item()
            fn = fn_gpu.item()

            avg_acc.append(acc)
            TP = TP +tp
            TN = TN +tn
            FP = FP + fp
            FN = FN + fn
    

    avg_acc = np.array(avg_acc).mean()
    print("TP", TP)
    print("TP", TN)
    print("FP", FP)
    print("FN", FN)
    print('>>test:', avg_acc)

def binary_acc(preds, y):
    preds = torch.round(torch.sigmoid(preds))
    correct = torch.eq(preds, y).float()
    TP = (preds * y).sum()
    TN = (torch.eq(preds,torch.zeros(y.size()).to(DEVICE)).float() *torch.eq(y,torch.zeros(y.size()).to(DEVICE)).float()).sum()
    FP = (torch.eq(preds,torch.zeros(y.size()).to(DEVICE)).float() *torch.eq(y,torch.ones(y.size()).to(DEVICE)).float()).sum()
    FN = (torch.eq(preds,torch.ones(y.size()).to(DEVICE)).float() *torch.eq(y,torch.zeros(y.size()).to(DEVICE)).float()).sum()
    acc = correct.sum() / len(correct)
    return acc, TP,TN, FP,FN


In [17]:
for epoch in range(5):
    train(BLSTM, train_dataloader, optimizer, criteon)
torch.save(BLSTM.state_dict(), "Model.pth")

0 0.2888889014720917
500 0.9611111283302307
avg acc: 0.9493802320527116
0 0.9722222685813904
500 0.9555555582046509
avg acc: 0.963284778890642
0 0.9777777791023254
500 0.9722222685813904
avg acc: 0.9667297369194676
0 0.9611111283302307
500 0.9611111283302307
avg acc: 0.9685966161488143
0 0.9833333492279053
500 0.9611111283302307
avg acc: 0.9704122875293185


In [84]:
BLSTM.load_state_dict(torch.load("Model.pth"))
eval(BLSTM, test_dataloader, criteon)

TP 12133.0
TP 143351.0
FP 3161.0
FN 926.0
>>test: 0.9743896265175055
