In [1]:
import numpy as np
import pandas as pd
import os
import json
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import AdamW, get_linear_schedule_with_warmup, get_constant_schedule_with_warmup
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import DataLoader, Dataset
from torch import nn
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [9]:
root = os.getcwd()
raw_data = []
raw_data = pd.read_json(root + '/data/News_Category_Dataset_v2.json', lines=True)

# data pre-processing
def category_merge(x):
    if x == 'THE WORLDPOST' or x == 'WORLD NEWS' or x == 'WORLDPOST':
        return 'WORLD NEWS'
    elif x == 'TASTE':
        return 'FOOD & DRINK'
    elif x == 'STYLE':
        return 'STYLE & BEAUTY'
    elif x == 'PARENTING':
        return 'PARENTS'
    elif x == 'COLLEGE':
        return 'EDUCATION'
    elif x == 'ARTS' or x == 'CULTURE & ARTS':
        return 'ARTS & CULTURE'
    elif x == 'SCIENCE' or x == 'TECH':
        return 'SCIENCE & TECH'
    elif x == 'BLACK VOICES' or x == 'DIVORCE' or x == 'LATINO VOICES' or x == 'QUEER VOICES' or x == 'GOOD NEWS':
        return 'DROP'
    elif x == 'WEDDINGS' or x == 'WOMEN' or x == 'IMPACT' or x == 'CRIME' or x == 'MEDIA' or x == 'WEIRD NEWS':
        return 'DROP'
    elif x == 'GREEN' or x == 'RELIGION' or x == 'EDUCATION' or x == 'MONEY' or x == 'FIFTY' or x == 'ENVIRONMENT':
        return 'DROP'
    else:
        return x

def drop_small_data(data):
    data['category'] = data['category'].apply(category_merge)
    data["text"] = data["headline"] + '. ' + data["short_description"]
    data.drop(["authors", "link", "date", "headline", "short_description"], axis=1, inplace=True)

    drop_data = []
    cnt = 0
    for i in data['category']:
        if i == 'EDUCATION' or i == 'DROP':
            drop_data.append(cnt)
        cnt = cnt + 1
    data.drop(index=drop_data, inplace=True)
    data.reset_index(drop=True, inplace=True)
    
    return data
    
def drop_large_data(data):
    drop_large_data = []
    cnt = [0, 0, 0, 0, 0]
    index = 0

    for i in data['category']:
        if i == 'POLITICS':
            cnt[0] = cnt[0] + 1
            if cnt[0] > 10000:
                drop_large_data.append(index)
        elif i == 'WELLNESS':
            cnt[1] = cnt[1] + 1
            if cnt[1] > 10000:
                drop_large_data.append(index)
        elif i == 'PARENTS':
            cnt[2] = cnt[2] + 1
            if cnt[2] > 10000:
                drop_large_data.append(index)
        elif i == 'ENTERTAINMENT':
            cnt[3] = cnt[3] + 1
            if cnt[3] > 10000:
                drop_large_data.append(index)
        elif i == 'STYLE & BEAUTY':
            cnt[4] = cnt[4] + 1
            if cnt[4] > 10000:
                drop_large_data.append(index)
        index = index + 1

    data.drop(index=drop_large_data, inplace=True)
    data.reset_index(drop=True, inplace=True)
    
    return data

def pre_processing(data):
    data = drop_small_data(data)
    data = drop_large_data(data)
    return data

data = pre_processing(raw_data)

In [3]:
# bert, tokenizer, device setting
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert = AutoModel.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
int_category = list(data["category"].unique())

def sparse_categories(category):
    return int_category.index(category)

In [16]:
data = shuffle(data)

#splitting data to train, val, test(7:2:1)
train_text, val_text, train_category, val_category = train_test_split(data.drop("category", axis=1), data["category"], test_size=0.2)
train_text, test_text, train_category, test_category = train_test_split(train_text, train_category, test_size=0.15)

train_text.head()

Unnamed: 0,text
80844,Can Science Show Us Secrets Of Making Better D...
52810,Global Travel Needs to Be Part of the Solution...
554,'The Late Show' Unveils Spoof Line Of Trump-Th...
37220,This Champagne Machine Gun Offers The Most Rid...
104541,Menopause as a Consequence of Within-Family Co...


In [6]:
class TextDataset(Dataset):
    def __init__(self, ids, texts, targets, tokenizer, max_len):
        self.ids = ids
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        id = self.ids[idx]
        text = self.texts[idx]
        label = self.targets[idx]

        encoding = self.tokenizer.encode_plus(text, add_special_tokens=True,
                                              max_length=self.max_len,
                                              return_token_type_ids=False,
                                              padding='max_length',
                                              return_attention_mask=True,
                                              truncation=True,
                                              return_tensors='pt')

        return {
          'id': torch.tensor(id, dtype=torch.long),
          'text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'],
          'label': torch.tensor(label, dtype=torch.int)
        }

    def __len__(self):
        return len(self.texts)

In [7]:
# data load

MAX_LEN = 64
BATCH_SIZE = 64

train_data = TextDataset(texts=train_text.text.to_numpy(), targets=train_category.to_numpy(),
                         ids=train_text.index.to_numpy(), tokenizer=tokenizer, max_len=MAX_LEN)
val_data = TextDataset(texts=val_text.text.to_numpy(), targets=val_category.to_numpy(),
                       ids=val_text.index.to_numpy(), tokenizer=tokenizer, max_len=MAX_LEN)
test_data = TextDataset(texts=test_text.text.to_numpy(), targets=test_category.to_numpy(),
                        ids=test_text.index.to_numpy(), tokenizer=tokenizer, max_len=MAX_LEN)

train_dataloader = DataLoader(train_data, batch_size=BATCH_SIZE, num_workers=1, shuffle=True)
val_dataloader = DataLoader(val_data, batch_size=BATCH_SIZE, num_workers=1, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=BATCH_SIZE, num_workers=1, shuffle=True)

In [8]:
# classfier model
class TextClassifier(nn.Module):
    def __init__(self, num_labels, BERT, _dropout=0.3):
        super(TextClassifier, self).__init__()
        self.num_labels = num_labels
        self.bert = BERT
        self.drop = nn.Dropout(_dropout)
        self.classifier = nn.Linear(BERT.config.hidden_size, num_labels)
        
    def forward(self, input_ids, attention_mask, labels=None):
        pooled_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(pooled_output[0][:, 0, :])
        return self.classifier(output)

In [20]:
# evaluation function
def evaluation(model, data_loader, loss_function, device):
    model = model.eval()
    all_predictions , true_labels= [], []
    
    with torch.no_grad():
        for data in data_loader:
            labels = data["label"].to(device)
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(output, dim=1)
            loss = loss_function(output, labels.long())
            all_predictions.append(preds.cpu().data)
            true_labels.append(labels.cpu().data)
    
    all_predictions = np.concatenate(all_predictions, axis = 0)
    true_labels = np.concatenate(true_labels, axis = 0)
            
    accuracy = accuracy_score(true_labels, all_predictions)
    
    return accuracy

In [10]:
# train function
def train(model, epochs, train_dataloader, val_dataloader,
          loss_function, optimizer, device, scheduler):
    
    model = model.train()
    accumulation_step = 4
    
    for epoch in range(epochs):
        all_predictions, true_labels = [], []
        cnt = 0
        
        for data in tqdm(train_dataloader):
            labels = data["label"].to(device)
            input_ids = data["input_ids"].to(device)
            attention_mask = data["attention_mask"].to(device)
            
            output = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(output, dim=1)
            loss = loss_function(output, labels.long())
            loss = loss / accumulation_step
            all_predictions.append(preds.cpu().data)
            true_labels.append(labels.cpu().data)
            
            loss.backward()
            nn.utils.clip_grad_norm(model.parameters(), max_norm=1.0)
            
            if (cnt + 1) % accumulation_step == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
            cnt += 1
            
        all_predictions = np.concatenate(all_predictions, axis=0)
        true_labels = np.concatenate(true_labels, axis=0)
        accuracy = accuracy_score(true_labels, all_predictions)
        val_accuracy = evaluation(model, val_dataloader, loss_function, device)
        
        print(f"Epoch: {epoch + 1} Accuracy: {accuracy}")
        print(f"Validation Accuracy: {val_accuracy}")
        
    return accuracy

In [11]:
model = TextClassifier(len(int_category), bert, 0.4)
model = model.to(device)
EPOCHS = 1

optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * EPOCHS
warmup_step = int(len(train_dataloader) / 2) 

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=warmup_step,
  num_training_steps=total_steps
)

loss_function = nn.CrossEntropyLoss().to(device)

In [12]:
train(model=model, epochs=EPOCHS, train_dataloader=train_dataloader, val_dataloader=val_dataloader,
     loss_function=loss_function, optimizer=optimizer, device=device, scheduler=scheduler)

  0%|          | 0/1187 [00:00<?, ?it/s]

  nn.utils.clip_grad_norm(model.parameters(), max_norm = 1.0)


Epoch: 1 Accuracy: 0.40110903295487477
Validation Accuracy: 0.7036854596748914


0.40110903295487477

In [21]:
print(evaluation(model, test_dataloader, loss_function, device))

0.7071423240540339


In [15]:
PATH = "./weights/classification.pth"
torch.save(model.state_dict(), PATH)