<a href="https://colab.research.google.com/github/ParsaHejabi/ComputationalIntelligence-ComputerAssignments/blob/main/FinalProject/CI_FinalProject_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import dataset from Google Drive to Colab

In [None]:
# !rm train.csv
# !rm test.csv
# !unzip drive/MyDrive/CI_FinalProject/train.csv.zip
# !unzip drive/MyDrive/CI_FinalProject/test.csv.zip

!rm cleaned_train.csv
!cp drive/MyDrive/CI_FinalProject/cleaned_train.csv ./

!rm cleaned_test.csv
!cp drive/MyDrive/CI_FinalProject/cleaned_test.csv ./

rm: cannot remove 'cleaned_train.csv': No such file or directory
rm: cannot remove 'cleaned_test.csv': No such file or directory


# Install hazm, clean-text, and transformers

In [None]:
!pip install -q hazm
!pip install -q clean-text[gpl]
!pip install transformers

[K     |████████████████████████████████| 317kB 13.8MB/s 
[K     |████████████████████████████████| 1.4MB 41.6MB/s 
[K     |████████████████████████████████| 235kB 54.0MB/s 
[?25h  Building wheel for nltk (setup.py) ... [?25l[?25hdone
  Building wheel for libwapiti (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 51kB 7.0MB/s 
[K     |████████████████████████████████| 71kB 8.9MB/s 
[K     |████████████████████████████████| 245kB 26.7MB/s 
[?25h  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/84/ea/634945faff8ad6984b98f7f3d98f6d83083a18af44e349744d90bde81f80/transformers-4.2.0-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 17.9MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091

# Import the required libraries

In [None]:
import numpy as np
import pandas as pd

import hazm
from cleantext import clean

import json
import re
import os

from sklearn.metrics import f1_score

from transformers import BertConfig, BertTokenizer
from transformers import BertModel

from transformers import AdamW
from transformers import get_linear_schedule_with_warmup

import torch
import torch.nn as nn
import torch.nn.functional as F

from tqdm.notebook import tqdm

import collections

import torch, gc

# Configuration to use GPU with cuda

In [None]:
# Probe CUDA once and derive both the flag and the torch device from that answer.
train_on_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if train_on_gpu else "cpu")
print(f'device: {device}')

if train_on_gpu:
    print('CUDA is available!  Training on GPU ...')
else:
    print('CUDA is not available.  Training on CPU ...')

device: cuda:0
CUDA is available!  Training on GPU ...


# Preprocessing

## Load cleaned train data

In [None]:
# Read only the two columns used below; any other column in the CSV is ignored.
train_data = pd.read_csv('cleaned_train.csv', usecols=['clean_text', 'Category'])
# train_data.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], inplace=True)
# factorize()[0] holds one integer code per row, one distinct code per category value.
train_data['category_id'] = train_data['Category'].factorize()[0]
train_data.head()

Unnamed: 0,Category,clean_text,category_id
0,Science and Culture,خبرنامه دانشگاه علم و صنعت ایران شماره یازدهم ...,0
1,Sport,تا پایان سال ۱۳۷۸ دهها زمین فوتبال و سالن ورزش...,1
2,Economy,انجمن تولیدکنندگان تجهیزات صنعت نفت تشکیل شد ن...,2
3,Miscellaneous.World News,کرتین برای سومین بار نخست وزیر کانادا شد ژان ک...,3
4,Sport,خداحافظ رفقا نمایندگان اروپای شرقی در جام ۲۰۰۲...,1


## Load cleaned test data

In [None]:
# Read the row identifier and the already-cleaned text; the test file carries no category column here.
test_data = pd.read_csv('cleaned_test.csv', usecols=['Id', 'clean_text'])
# test_data.drop(columns=['Unnamed: 0'], inplace=True)
test_data.head()

Unnamed: 0,Id,clean_text
0,0,هفت اقلیم آلودگی هوا پکن را تهدید میکند باافزا...
1,1,گل و گیاه زعفران زینتی نام علمی: crocus banati...
2,2,یادداشت قانون بودجه و صنایع کوچک در شماره گذشت...
3,3,در سالروز میلاد حضرت مهدی همایش ادبی دانش آموز...
4,4,از ira تا فارک بوگوتا، پایتخت پرهرج ومرج کلمبی...


## Training data statistics

In [None]:
# Quick overview of the training frame: column names, distinct categories, null counts per column.
print(train_data.columns)
print(train_data['Category'].unique())
print('missing values stats')
print(train_data.isnull().sum())

Index(['Category', 'clean_text', 'category_id'], dtype='object')
['Science and Culture' 'Sport' 'Economy' 'Miscellaneous.World News'
 'Miscellaneous.Urban' 'Social.Women' 'Social' 'Literature and Art'
 'Politics' 'Miscellaneous' 'Economy.Bank and Bourse'
 'Politics.Iran Politics' 'Tourism' 'Social.Religion'
 'Miscellaneous.Picture' 'Miscellaneous.Happenings'
 'Science and Culture.Science.Book' 'Literature and Art.Art'
 'Miscellaneous.Islamic Councils' 'Literature and Art.Art.Cinema'
 'Science and Culture.Science.Information and Communication Technology'
 'Economy.Oil' 'Economy.Commerce' 'Natural Environment'
 'Science and Culture.Science' 'Economy.Industry' 'Economy.Agriculture'
 'Sport.World Cup' 'Miscellaneous.Picture.Caricature'
 'Literature and Art.Art.Music' 'Literature and Art.Art.Theater'
 'Economy.Dwelling and Construction'
 'Science and Culture.Science.Medicine and Remedy'
 'Literature and Art.Literature']
missing values stats
Category       0
clean_text     0
category_id    0

## Clean train and test data

In [None]:
# hazm normalizer shared by all `cleaning` calls; created on first use so that merely
# defining the function stays cheap.
_HAZM_NORMALIZER = None

# Characters removed after normalization: emoji/pictograph ranges plus a set of
# invisible directional and joiner code points. Compiled once instead of on every call.
# NOTE(review): U+200C (zero-width non-joiner) is part of this set, so it is deleted
# outright rather than replaced by a space - confirm this is intended for Persian text.
_WEIRD_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    u"\U0001f926-\U0001f937"
    u'\U00010000-\U0010ffff'
    u"\u200d"
    u"\u2640-\u2642"
    u"\u2600-\u2B55"
    u"\u23cf"
    u"\u23e9"
    u"\u231a"
    u"\u3030"
    u"\ufe0f"
    u"\u2069"
    u"\u2066"
    u"\u200c"
    u"\u2068"
    u"\u2067"
    "]+", flags=re.UNICODE)


def cleaning(text):
    """Clean one raw news text.

    Steps, in order: strip surrounding whitespace, run `cleantext.clean`
    (lower-casing, removal of URLs / e-mails / phone numbers / currency symbols,
    line breaks flattened), apply the hazm normalizer, delete the characters
    matched by `_WEIRD_PATTERN`, drop `#` characters and collapse every run of
    whitespace into a single space.

    Args:
        text: the raw text as a `str`.

    Returns:
        The cleaned text as a `str`.
    """
    global _HAZM_NORMALIZER

    text = text.strip()

    # regular cleaning
    text = clean(text,
        fix_unicode=True,
        to_ascii=False,
        lower=True,
        no_line_breaks=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
        replace_with_url="",
        replace_with_email="",
        replace_with_phone_number="",
        replace_with_number="",
        replace_with_digit="0",
        replace_with_currency_symbol="",
    )

    # normalizing (the normalizer is built once and then reused)
    if _HAZM_NORMALIZER is None:
        _HAZM_NORMALIZER = hazm.Normalizer()
    text = _HAZM_NORMALIZER.normalize(text)

    # removing weird patterns
    text = _WEIRD_PATTERN.sub(r'', text)

    # removing extra spaces, hashtags
    text = re.sub("#", "", text)
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)

    return text

In [None]:
# train_data['clean_text'] = train_data['Text'].apply(cleaning)
# train_data.head()

In [None]:
# test_data['clean_text'] = test_data['Text'].apply(cleaning)
# test_data.head()

## One-hot encode the category field

In [None]:
# One indicator column per distinct category value.
# NOTE(review): no later cell visible in this notebook reads `train_data_labels`.
train_data_labels = pd.get_dummies(train_data['Category'])
train_data_labels

Unnamed: 0,Economy,Economy.Agriculture,Economy.Bank and Bourse,Economy.Commerce,Economy.Dwelling and Construction,Economy.Industry,Economy.Oil,Literature and Art,Literature and Art.Art,Literature and Art.Art.Cinema,Literature and Art.Art.Music,Literature and Art.Art.Theater,Literature and Art.Literature,Miscellaneous,Miscellaneous.Happenings,Miscellaneous.Islamic Councils,Miscellaneous.Picture,Miscellaneous.Picture.Caricature,Miscellaneous.Urban,Miscellaneous.World News,Natural Environment,Politics,Politics.Iran Politics,Science and Culture,Science and Culture.Science,Science and Culture.Science.Book,Science and Culture.Science.Information and Communication Technology,Science and Culture.Science.Medicine and Remedy,Social,Social.Religion,Social.Women,Sport,Sport.World Cup,Tourism
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150091,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150092,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150093,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
150094,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [None]:
# general config
MAX_LEN = 128
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 16
TEST_BATCH_SIZE = 16

EPOCHS = 1
EEVERY_EPOCH = 1000
LEARNING_RATE = 2e-5
CLIP = 0.0

MODEL_NAME_OR_PATH = 'HooshvareLab/bert-fa-base-uncased'
OUTPUT_PATH = 'news_classification.bin'

# os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

In [None]:
# Number the categories in the order `unique()` returns them, then build the inverse lookup.
label2id = {label: i for i, label in enumerate(train_data['Category'].unique())}
id2label = {v: k for k, v in label2id.items()}

print(f'label2id: {label2id}')
print(f'id2label: {id2label}')

label2id: {'Science and Culture': 0, 'Sport': 1, 'Economy': 2, 'Miscellaneous.World News': 3, 'Miscellaneous.Urban': 4, 'Social.Women': 5, 'Social': 6, 'Literature and Art': 7, 'Politics': 8, 'Miscellaneous': 9, 'Economy.Bank and Bourse': 10, 'Politics.Iran Politics': 11, 'Tourism': 12, 'Social.Religion': 13, 'Miscellaneous.Picture': 14, 'Miscellaneous.Happenings': 15, 'Science and Culture.Science.Book': 16, 'Literature and Art.Art': 17, 'Miscellaneous.Islamic Councils': 18, 'Literature and Art.Art.Cinema': 19, 'Science and Culture.Science.Information and Communication Technology': 20, 'Economy.Oil': 21, 'Economy.Commerce': 22, 'Natural Environment': 23, 'Science and Culture.Science': 24, 'Economy.Industry': 25, 'Economy.Agriculture': 26, 'Sport.World Cup': 27, 'Miscellaneous.Picture.Caricature': 28, 'Literature and Art.Art.Music': 29, 'Literature and Art.Art.Theater': 30, 'Economy.Dwelling and Construction': 31, 'Science and Culture.Science.Medicine and Remedy': 32, 'Literature and Ar

In [None]:
# Load the tokenizer and the model configuration for the same checkpoint.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME_OR_PATH)
# The label maps are attached to the config; the classifier head later reads
# `config.num_labels` and `config.hidden_size` from this object.
config = BertConfig.from_pretrained(
    MODEL_NAME_OR_PATH, **{
        'label2id': label2id,
        'id2label': id2label,
    })

print(config.to_json_string())

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1198122.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440.0, style=ProgressStyle(description_…


{
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "Science and Culture",
    "1": "Sport",
    "2": "Economy",
    "3": "Miscellaneous.World News",
    "4": "Miscellaneous.Urban",
    "5": "Social.Women",
    "6": "Social",
    "7": "Literature and Art",
    "8": "Politics",
    "9": "Miscellaneous",
    "10": "Economy.Bank and Bourse",
    "11": "Politics.Iran Politics",
    "12": "Tourism",
    "13": "Social.Religion",
    "14": "Miscellaneous.Picture",
    "15": "Miscellaneous.Happenings",
    "16": "Science and Culture.Science.Book",
    "17": "Literature and Art.Art",
    "18": "Miscellaneous.Islamic Councils",
    "19": "Literature and Art.Art.Cinema",
    "20": "Science and Culture.Science.Information and Communication Technology",
    "21": "Economy.Oil",
    "22": "Economy.Commerce",
    "23": "Natural E

In [None]:
# Pick one random training row to inspect; no seed is set, so the row differs between runs.
idx = np.random.randint(0, len(train_data))
sample_text = train_data.iloc[idx]['clean_text']
sample_label = train_data.iloc[idx]['category_id']

print(f'Sample: \n{sample_text}\n{sample_label}')

Sample: 
تازههای نشر در ماه گذشته هم، تعدادی از ناشران، موءسسات پژوهشی و ارگانهای دولتی، با ارسال آثار خود به مرکز مطالعات و کتابخانه روزنامه همشهری، همکاری فرهنگی و ارتباطات خود را با ما، کماکان مستمر نگاه داشتند. در ستون تازههای نشر، امروز برخی از این آثار را که در مرکز اطلاعات روزنامه همشهری فهرست شده است، معرفی کردهایم.
0


In [None]:
# Show how the tokenizer splits the sample and which vocabulary ids the tokens map to.
tokens = tokenizer.tokenize(sample_text)
token_ids = tokenizer.convert_tokens_to_ids(tokens)

print(f'     News: {sample_text}')
print(f'   Tokens: {tokenizer.convert_tokens_to_string(tokens)}')
print(f'Token IDs: {token_ids}')

     News: تازههای نشر در ماه گذشته هم، تعدادی از ناشران، موءسسات پژوهشی و ارگانهای دولتی، با ارسال آثار خود به مرکز مطالعات و کتابخانه روزنامه همشهری، همکاری فرهنگی و ارتباطات خود را با ما، کماکان مستمر نگاه داشتند. در ستون تازههای نشر، امروز برخی از این آثار را که در مرکز اطلاعات روزنامه همشهری فهرست شده است، معرفی کردهایم.
   Tokens: تازههای نشر در ماه گذشته هم ، تعدادی از ناشران ، موءسسات پژوهشی و ارگانهای دولتی ، با ارسال اثار خود به مرکز مطالعات و کتابخانه روزنامه همشهری ، همکاری فرهنگی و ارتباطات خود را با ما ، کماکان مستمر نگاه داشتند . در ستون تازههای نشر ، امروز برخی از این اثار را که در مرکز اطلاعات روزنامه همشهری فهرست شده است ، معرفی کردهایم .
Token IDs: [19225, 8005, 2786, 3125, 3421, 2820, 1348, 5198, 2791, 15504, 1348, 2845, 2072, 22976, 2811, 7622, 1379, 19113, 4433, 1348, 2799, 4855, 3917, 2847, 2789, 3726, 5629, 1379, 6492, 4531, 10543, 1348, 3909, 4009, 1379, 4528, 2847, 2803, 2799, 2964, 1348, 13055, 9891, 4407, 4159, 1012, 2786, 6710, 19225, 8005, 1348, 3767, 3237

In [None]:
# Encode the sample the same way NewsDataset does, but with a short max_length of 32
# so the printed tensors stay readable.
encoding = tokenizer.encode_plus(
    sample_text,
    max_length=32,
    truncation=True,
    add_special_tokens=True,
    return_token_type_ids=True,
    return_attention_mask=True,
    padding='max_length',
    return_tensors='pt',
)

print(f'Keys: {encoding.keys()}\n')
for k in encoding.keys():
    print(f'{k}:\n{encoding[k]}')

Keys: dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])

input_ids:
tensor([[    2, 19225,  8005,  2786,  3125,  3421,  2820,  1348,  5198,  2791,
         15504,  1348,  2845,  2072, 22976,  2811,  7622,  1379, 19113,  4433,
          1348,  2799,  4855,  3917,  2847,  2789,  3726,  5629,  1379,  6492,
          4531,     4]])
token_type_ids:
tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])
attention_mask:
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])


In [None]:
class NewsDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item.

    Args:
        tokenizer: object exposing `encode_plus(text, ...)` that returns
            `input_ids`, `attention_mask` and `token_type_ids` tensors.
        texts: indexable collection of texts.
        labels: list or numpy array of labels aligned with `texts`; any other
            value (e.g. `None`) produces items without a `labels` entry.
        label_list: iterable of label names; the position of a name is its
            class id. May be `None` when labels are absent or already integers.
        max_len: length every encoded sequence is padded / truncated to.
    """

    def __init__(self, tokenizer, texts, labels=None, label_list=None, max_len=128):
        self.texts = texts
        self.labels = labels
        self.has_label = isinstance(labels, list) or isinstance(labels, np.ndarray)

        self.tokenizer = tokenizer
        self.max_len = max_len

        # Accept any iterable of names (list, tuple, numpy array, ...), not only `list`;
        # previously a non-list `label_list` was silently turned into an empty mapping.
        self.label_map = {label: i for i, label in enumerate(label_list)} if label_list is not None else {}

    def __len__(self):
        return len(self.texts)

    def _encode_label(self, raw_label):
        """Translate one raw label into the integer class id used as the target."""
        key = str(raw_label)
        if key in self.label_map:
            return self.label_map[key]
        # Not a known name: the label may already be an integer class id.
        try:
            return int(raw_label)
        except (TypeError, ValueError):
            raise ValueError(
                f'Label {raw_label!r} is neither listed in label_list nor an integer class id'
            ) from None

    def __getitem__(self, item):
        text = str(self.texts[item])

        if self.has_label:
            label = self._encode_label(self.labels[item])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            truncation=True,
            max_length=self.max_len,
            return_token_type_ids=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt')

        # The tokenizer returns tensors with a leading batch axis of size 1;
        # flatten() drops it so the DataLoader can stack items itself.
        inputs = {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
        }

        if self.has_label:
            inputs['labels'] = torch.tensor(label, dtype=torch.long)

        return inputs

def create_data_loader(x, y, tokenizer, max_len, batch_size, label_list, shuffle=False):
    """Wrap texts (and optional labels) in a NewsDataset and return a DataLoader over it.

    Args:
        x: texts to encode.
        y: labels aligned with `x`, or `None` for unlabeled data.
        tokenizer: tokenizer forwarded to NewsDataset.
        max_len: padded / truncated sequence length.
        batch_size: number of items per batch.
        label_list: label names forwarded to NewsDataset.
        shuffle: forwarded to the DataLoader. Defaults to `False`, which keeps
            the input order and matches the previous behaviour; pass `True`
            for a training loader that should be reshuffled every epoch.

    Returns:
        A `torch.utils.data.DataLoader` yielding dict batches.
    """
    dataset = NewsDataset(
        texts=x,
        labels=y,
        tokenizer=tokenizer,
        max_len=max_len,
        label_list=label_list)

    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

In [None]:
# Class ids follow the position of each name in `label_list` (same ordering as label2id).
label_list = train_data['Category'].unique().tolist()
# Both loaders iterate their rows in dataframe order.
train_data_loader = create_data_loader(train_data['clean_text'].to_numpy(), train_data['Category'].to_numpy(), tokenizer, MAX_LEN, TRAIN_BATCH_SIZE, label_list)
# No labels for the test split, so its batches carry no 'labels' key.
test_data_loader = create_data_loader(test_data['clean_text'].to_numpy(), None, tokenizer, MAX_LEN, TEST_BATCH_SIZE, label_list)

In [None]:
# Pull the first training batch and print the shape and first row of every field.
sample_data = next(iter(train_data_loader))

print(sample_data.keys())

print(sample_data['text'])
print(sample_data['input_ids'].shape)
print(sample_data['input_ids'][0, :])
print(sample_data['attention_mask'].shape)
print(sample_data['attention_mask'][0, :])
print(sample_data['token_type_ids'].shape)
print(sample_data['token_type_ids'][0, :])
print(sample_data['labels'].shape)
print(sample_data['labels'][0])

dict_keys(['text', 'input_ids', 'attention_mask', 'token_type_ids', 'labels'])
['خبرنامه دانشگاه علم و صنعت ایران شماره یازدهم از خبرنامه روابط عمومی دانشگاه علم و صنعت ایران در ۴۸ صفحه با اخبار و مطالب علمی متنوعی از استادان این دانشگاه منتشر شد. در این شماره از خبرنامه، علاوه بر اختصاص صفحاتی چند به دیدار رئیس جمهوری از دانشگاه علم و صنعت و سخنرانی در جمع دانشجویان این دانشگاه، دانشکده مهندسی شیمی، پژوهشکده مکانیک خودرو و چند بخش دیگر این دانشگاه معرفی شده است. تلفن روابط عمومی دانشگاه علم و صنعت ایران و نشانی اینترنتی آن به قرار زیر است: ir. ac. iust. www', 'تا پایان سال ۱۳۷۸ دهها زمین فوتبال و سالن ورزش برای کارگران ساخته میشود تا پایان سال ۱۳۷۸ برای گسترش ورزش کارگران ۲۷ زمین چمن فوتبال ۲۱ پیست دوومیدانی ۵۱ زمین بازی چند منظوره و ۶ استخر شنای سرپوشیده به همراه سایر تاسیسات و امکانات جانبی آماده بهره برداری خواهد شد. به گزارش روابط عمومی سازمان برنامه و بودجه ایجاد و توسعه ۴۶۷ هزار و ۸۲۹ متر مربع فضای ورزشی کارگران مورد موافقت این سازمان و وزارت کار و امور اجتماعی قرار گرفت. این گز

In [None]:
# First test batch: its keys show which fields an unlabeled batch contains.
sample_test = next(iter(test_data_loader))
print(sample_test.keys())

dict_keys(['text', 'input_ids', 'attention_mask', 'token_type_ids'])


In [None]:
class NewsClassificationModel(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        config: BERT configuration; `hidden_dropout_prob`, `hidden_size` and
            `num_labels` size the head, and the same object configures the encoder.
    """

    def __init__(self, config):
        super(NewsClassificationModel, self).__init__()

        # Pass the caller's config to the encoder as well, so encoder and head
        # are always built from the same settings instead of the encoder
        # silently reloading the checkpoint's default configuration.
        self.bert = BertModel.from_pretrained(MODEL_NAME_OR_PATH, config=config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return the unnormalized class scores (logits) for a batch."""
        output = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # Classify from the encoder's pooled output, with dropout applied first.
        pooler_output = self.dropout(output.pooler_output)
        logits = self.classifier(pooler_output)
        return logits

In [None]:
gc.collect()
torch.cuda.empty_cache()
pt_model = None

!nvidia-smi

Wed Jan 13 18:19:46 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.27.04    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P8    10W /  70W |     10MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Build the classifier from the shared config and move its parameters to the selected device.
nc_model = NewsClassificationModel(config=config)
nc_model = nc_model.to(device)

print('nc_model', type(nc_model))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=654226731.0, style=ProgressStyle(descri…


nc_model <class '__main__.NewsClassificationModel'>


In [None]:
# sample data output
sample_data_text = sample_data['text']
sample_data_input_ids = sample_data['input_ids']
sample_data_attention_mask = sample_data['attention_mask']
sample_data_token_type_ids = sample_data['token_type_ids']
sample_data_labels = sample_data['labels']

# available for using in GPU
sample_data_input_ids = sample_data_input_ids.to(device)
sample_data_attention_mask = sample_data_attention_mask.to(device)
sample_data_token_type_ids = sample_data_token_type_ids.to(device)
sample_data_labels = sample_data_labels.to(device)

outputs = nc_model(sample_data_input_ids, sample_data_attention_mask, sample_data_token_type_ids)
_, preds = torch.max(outputs, dim=1)

print(outputs[:5, :])
print(preds[:5])

tensor([[ 0.4597, -0.0221, -0.2418,  0.0188,  0.0965,  0.1726, -0.2904,  0.5441,
          0.3218,  0.2498,  0.3632, -0.2466, -0.1315, -0.3908,  0.6214, -0.0727,
          0.2721, -0.2409, -0.4014,  0.1241,  0.1270, -0.0117,  0.2336, -0.4981,
         -0.4572, -0.3386, -0.6869, -0.2599, -0.4486,  0.7855, -0.2743, -0.2409,
         -0.6847, -0.5159],
        [ 0.2599, -0.2317, -0.3044, -0.2361, -0.2446, -0.3033, -0.3584,  0.4665,
          0.3240, -0.1012, -0.1377,  0.0841, -0.0658, -0.5144,  0.4746, -0.0944,
          0.4215, -0.0482, -0.8741, -0.1557, -0.2532, -0.4572, -0.0319, -0.6333,
         -0.3263, -0.2241, -0.6212, -0.4144, -0.3333,  0.9036, -0.2552, -0.1567,
         -0.5833, -0.0923],
        [ 0.3822,  0.0150,  0.2622, -0.5011, -0.2892,  0.1159, -0.5942,  0.2892,
          0.1709, -0.2460,  0.1438,  0.0267, -0.1187, -0.0524,  0.3141, -0.1760,
          0.2128, -0.2020, -0.6090,  0.1565,  0.2539,  0.2651,  0.0994, -0.2235,
          0.1531, -0.3715, -0.5731,  0.0438, -0.5487,

In [None]:
def simple_accuracy(y_true, y_pred):
    """Return the fraction of positions at which `y_true` equals `y_pred`.

    Args:
        y_true: array-like of reference labels.
        y_pred: array-like of predicted labels with the same shape.

    Returns:
        The mean of the element-wise equality, a float in [0, 1].

    Raises:
        ValueError: if the two inputs do not have the same shape.
    """
    # Plain Python lists compare to a single bool, which has no .mean();
    # converting first makes lists, tuples and arrays behave alike.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Guard against silent broadcasting, e.g. (n,) against (n, 1).
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f'y_true and y_pred must have the same shape, got {y_true.shape} and {y_pred.shape}')

    return (y_true == y_pred).mean()

def acc_and_f1(y_true, y_pred, average='weighted'):
    """Score predictions with accuracy and F1.

    Returns a dict with key "acc" (from `simple_accuracy`) and key "f1"
    (from `f1_score`, using the given `average` mode).
    """
    return {
        "acc": simple_accuracy(y_true, y_pred),
        "f1": f1_score(y_true=y_true, y_pred=y_pred, average=average),
    }

def y_loss(y_true, y_pred, losses):
    """Convert collected per-sample tensors to numpy and average the batch losses.

    Args:
        y_true: list of target tensors, one per sample.
        y_pred: list of prediction tensors, one per sample.
        losses: list of per-batch loss values.

    Returns:
        A tuple `([y_true_array, y_pred_array], mean_loss)`.
    """
    def _to_numpy(tensors):
        # Stack the per-sample tensors, then bring the result to host memory.
        return torch.stack(tensors).cpu().detach().numpy()

    stacked_true = _to_numpy(y_true)
    stacked_pred = _to_numpy(y_pred)

    return [stacked_true, stacked_pred], np.mean(losses)


def eval_op(model, data_loader, loss_fn):
    """Run the model over `data_loader` without gradients and collect results.

    The model is switched to eval mode. Every batch must provide 'input_ids',
    'attention_mask', 'token_type_ids' and 'labels'.

    Returns:
        `([y_true, y_pred], mean_loss)` as produced by `y_loss`.
    """
    model.eval()

    batch_losses = []
    predicted = []
    expected = []

    with torch.no_grad():
        for batch in tqdm(data_loader, total=len(data_loader), desc="Evaluation... "):
            # Move the tensor fields of the batch to the active device.
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            targets = batch['labels'].to(device)

            # Forward pass.
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # The position of the largest score per row is the predicted class.
            batch_preds = torch.max(logits, dim=1)[1]

            # Record this batch's loss as a plain Python number.
            batch_losses.append(loss_fn(logits, targets).item())

            predicted.extend(batch_preds)
            expected.extend(targets)

    return y_loss(expected, predicted, batch_losses)


def train_op(model, 
             data_loader, 
             loss_fn, 
             optimizer, 
             scheduler, 
             step=0, 
             print_every_step=100, 
             eval=False,
             eval_cb=None,
             eval_loss_min=np.inf,
             eval_data_loader=None, 
             clip=0.0):
    """Train `model` for one pass over `data_loader`.

    Args:
        model: module called with `input_ids`, `attention_mask`, `token_type_ids`.
        data_loader: yields dict batches that include a 'labels' tensor.
        loss_fn: callable `(outputs, targets) -> loss tensor`.
        optimizer: optimizer stepped once per batch.
        scheduler: learning-rate scheduler stepped once per batch.
        step: global step counter to continue from.
        print_every_step, eval, eval_cb, eval_loss_min, eval_data_loader:
            accepted for the periodic-evaluation hook, which is currently
            commented out below; the active code does not read them.
        clip: maximum gradient norm; clipping is applied only when > 0.0.

    Returns:
        `(train_y, train_loss, step)` where `train_y` is `[y_true, y_pred]`
        as numpy arrays, `train_loss` the mean batch loss and `step` the
        updated global step counter.
    """
    # `np.inf` replaces the `np.Inf` alias in the signature: the alias was
    # removed in NumPy 2.0, the value is the same float infinity.

    model.train()

    losses = []
    y_pred = []
    y_true = []

    for dl in tqdm(data_loader, total=len(data_loader), desc="Training... "):
        step += 1

        # move tensors to GPU if CUDA is available
        input_ids = dl['input_ids'].to(device)
        attention_mask = dl['attention_mask'].to(device)
        token_type_ids = dl['token_type_ids'].to(device)
        targets = dl['labels'].to(device)

        # clear the gradients of all optimized variables
        optimizer.zero_grad()

        # compute predicted outputs by passing inputs to the model
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids)

        # the outputs are logits; the index of the largest one is the predicted class
        _, preds = torch.max(outputs, dim=1)

        # calculate the batch loss
        loss = loss_fn(outputs, targets)

        # accumulate all the losses
        losses.append(loss.item())

        # compute gradient of the loss with respect to model parameters
        loss.backward()

        # optional gradient-norm clipping, disabled when clip == 0.0
        if clip > 0.0:
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)

        # perform optimization step
        optimizer.step()

        # perform scheduler step
        scheduler.step()

        y_pred.extend(preds)
        y_true.extend(targets)

        # Periodic evaluation hook (disabled):
        # if eval:
        #     train_y, train_loss = y_loss(y_true, y_pred, losses)
        #     train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')

        #     if step % print_every_step == 0:
        #         eval_y, eval_loss = eval_op(model, eval_data_loader, loss_fn)
        #         eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')

        #         if hasattr(eval_cb, '__call__'):
        #             eval_loss_min = eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min)

    train_y, train_loss = y_loss(y_true, y_pred, losses)

    return train_y, train_loss, step
    # , eval_loss_min

In [None]:
optimizer = AdamW(nc_model.parameters(), lr=LEARNING_RATE, correct_bias=False)
# One scheduler step is taken per batch, so the schedule spans every batch of every epoch.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss()

step = 0
# `np.inf` instead of the `np.Inf` alias, which NumPy 2.0 removed; same value.
eval_loss_min = np.inf
history = collections.defaultdict(list)


def eval_callback(epoch, epochs, output_path):
    """Build a reporting / checkpointing callback bound to one epoch.

    The returned `eval_cb(model, step, train_score, train_loss, eval_score,
    eval_loss, eval_loss_min)` prints a one-line training summary, saves
    `model.state_dict()` to `output_path` whenever `eval_loss` is not above
    `eval_loss_min`, and returns the (possibly updated) minimum.
    """
    def eval_cb(model, step, train_score, train_loss, eval_score, eval_loss, eval_loss_min):
        # Validation loss / accuracy are intentionally not part of the summary line.
        summary_parts = [
            'Epoch: {}/{}...'.format(epoch, epochs),
            'Step: {}...'.format(step),
            'Train Loss: {:.6f}...'.format(train_loss),
            'Train Acc: {:.3f}...'.format(train_score['acc']),
        ]
        print(''.join(summary_parts))

        # No improvement (this also covers a NaN loss): keep the previous minimum.
        if not (eval_loss <= eval_loss_min):
            return eval_loss_min

        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            eval_loss_min,
            eval_loss))
        torch.save(model.state_dict(), output_path)
        return eval_loss

    return eval_cb


# Epoch loop: one full pass over the training loader per iteration. `step` is threaded
# through so the global step counter keeps increasing across epochs.
for epoch in tqdm(range(1, EPOCHS + 1), desc="Epochs... "):
    train_y, train_loss, step = train_op(
        model=nc_model, 
        data_loader=train_data_loader, 
        loss_fn=loss_fn, 
        optimizer=optimizer, 
        scheduler=scheduler, 
        step=step, 
        print_every_step=EEVERY_EPOCH, 
        clip=CLIP)
    
    # train_y is [y_true, y_pred] for the whole epoch.
    train_score = acc_and_f1(train_y[0], train_y[1], average='weighted')
    
    # Validation is disabled: no `valid_data_loader` is created in the visible cells.
    # eval_y, eval_loss = eval_op(
    #     model=nc_model, 
    #     data_loader=valid_data_loader, 
    #     loss_fn=loss_fn)
    
    # eval_score = acc_and_f1(eval_y[0], eval_y[1], average='weighted')
    
    history['train_acc'].append(train_score['acc'])
    history['train_loss'].append(train_loss)
    # history['val_acc'].append(eval_score['acc'])
    # history['val_loss'].append(eval_loss)

HBox(children=(FloatProgress(value=0.0, description='Epochs... ', max=1.0, style=ProgressStyle(description_wid…

HBox(children=(FloatProgress(value=0.0, description='Training... ', max=9381.0, style=ProgressStyle(descriptio…





In [None]:
def predict(model, comments, tokenizer, max_len=128, batch_size=32):
    """Predict a class id and class probabilities for every text in `comments`.

    Args:
        model: module called with `input_ids`, `attention_mask`, `token_type_ids`
            and returning one row of logits per input.
        comments: texts to classify.
        tokenizer: tokenizer forwarded to the data loader.
        max_len: padded / truncated sequence length.
        batch_size: number of texts per forward pass.

    Returns:
        `(predictions, prediction_probs)` as numpy arrays: the predicted class
        id per text and the softmax of the logits per text. Both are empty
        when `comments` is empty.
    """
    data_loader = create_data_loader(comments, None, tokenizer, max_len, batch_size, None)

    # One tensor per batch (already on the CPU) instead of one tensor per row:
    # far fewer Python objects, and a single concatenation at the end.
    batch_predictions = []
    batch_probs = []

    model.eval()
    with torch.no_grad():
        for dl in tqdm(data_loader, position=0):
            # move tensors to GPU if CUDA is available
            input_ids = dl['input_ids'].to(device)
            attention_mask = dl['attention_mask'].to(device)
            token_type_ids = dl['token_type_ids'].to(device)

            # compute the logits by passing inputs to the model
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids)

            # index of the largest logit per row is the predicted class
            _, preds = torch.max(outputs, dim=1)

            batch_predictions.append(preds.cpu())
            batch_probs.append(F.softmax(outputs, dim=1).cpu())

    # Nothing to concatenate for empty input; return empty results instead of failing.
    if not batch_predictions:
        return np.empty((0,), dtype=np.int64), np.empty((0, 0), dtype=np.float32)

    predictions = torch.cat(batch_predictions).numpy()
    prediction_probs = torch.cat(batch_probs).numpy()

    return predictions, prediction_probs

In [None]:
test_comments = test_data['clean_text'].to_numpy()
# Use the shared MAX_LEN constant so inference is tokenized with the same
# sequence length as training, rather than a repeated literal.
preds, probs = predict(nc_model, test_comments, tokenizer, max_len=MAX_LEN)

print(preds.shape, probs.shape)

HBox(children=(FloatProgress(value=0.0, max=522.0), HTML(value='')))


(16678,) (16678, 34)


In [None]:
test_data.shape[0]

16678

In [None]:
preds

array([4, 9, 2, ..., 3, 9, 2])

In [None]:
final_preds = []

In [None]:
# Rebuild the list from scratch so re-running this cell cannot append a second
# copy of the predictions to an already filled list.
final_preds = [id2label[pred] for pred in preds]

In [None]:
# Take the ids from the test file itself: predictions were produced in the row
# order of `test_data`, so this pairs each one with the Id of its own row even
# if the ids are not exactly 0..n-1.
final_ids = test_data['Id'].tolist()
# Show only a preview instead of printing every id.
final_ids[:10]

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 31,
 32,
 33,
 34,
 35,
 36,
 37,
 38,
 39,
 40,
 41,
 42,
 43,
 44,
 45,
 46,
 47,
 48,
 49,
 50,
 51,
 52,
 53,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 61,
 62,
 63,
 64,
 65,
 66,
 67,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 79,
 80,
 81,
 82,
 83,
 84,
 85,
 86,
 87,
 88,
 89,
 90,
 91,
 92,
 93,
 94,
 95,
 96,
 97,
 98,
 99,
 100,
 101,
 102,
 103,
 104,
 105,
 106,
 107,
 108,
 109,
 110,
 111,
 112,
 113,
 114,
 115,
 116,
 117,
 118,
 119,
 120,
 121,
 122,
 123,
 124,
 125,
 126,
 127,
 128,
 129,
 130,
 131,
 132,
 133,
 134,
 135,
 136,
 137,
 138,
 139,
 140,
 141,
 142,
 143,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 155,
 156,
 157,
 158,
 159,
 160,
 161,
 162,
 163,
 164,
 165,
 166,
 167,
 168,
 169,
 170,
 171,
 172,
 173,
 174,
 175,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,


In [None]:
# Submission table: one row per test item with its id and predicted category name.
final_dataframe = pd.DataFrame({'Id': final_ids, 'Category': final_preds})

In [None]:
final_dataframe.head()

Unnamed: 0,Id,Category
0,0,Miscellaneous.Urban
1,1,Miscellaneous
2,2,Economy
3,3,Science and Culture
4,4,Politics


In [None]:
# index=False: write only the Id and Category columns; the default would also
# write the dataframe index as an extra unnamed first column.
final_dataframe.to_csv('submission.csv', index=False)