In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
import string
import warnings
warnings.filterwarnings('ignore')

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def remove_stopwords(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return ' '.join(filtered_words)

In [4]:
train_data = pd.read_csv("/kaggle/input/news-category-2/Data_Train_News.csv", encoding='mac_roman')
test_data = pd.read_csv("/kaggle/input/news-category-2/Data_Test_News.csv", encoding='mac_roman')

In [5]:
train_data

Unnamed: 0,STORY,SECTION
0,But the most painful was the huge reversal in ...,3
1,How formidable is the opposition alliance amon...,0
2,Most Asian currencies were trading lower today...,3
3,"If you want to answer any question, click on ë...",1
4,"In global markets, gold prices edged up today ...",3
...,...,...
7623,"Karnataka has been a Congress bastion, but it ...",0
7624,"The film, which also features Janhvi Kapoor, w...",2
7625,The database has been created after bringing t...,1
7626,"The state, which has had an uneasy relationshi...",0


In [6]:
test_data

Unnamed: 0,STORY
0,2019 will see gadgets like gaming smartphones ...
1,It has also unleashed a wave of changes in the...
2,It can be confusing to pick the right smartpho...
3,The mobile application is integrated with a da...
4,We have rounded up some of the gadgets that sh...
...,...
2743,"According to researchers, fraud in the mobile ..."
2744,The iPhone XS and XS Max share the Apple A12 c...
2745,"On the photography front, the Note 5 Pro featu..."
2746,UDAY mandated that discoms bring the gap betwe...


In [7]:
train_data['STORY'] = train_data['STORY'].apply(remove_stopwords)
test_data['STORY'] = test_data['STORY'].apply(remove_stopwords)

In [8]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_data['STORY'].values, 
    train_data['SECTION'].values, 
    test_size=0.2, 
    random_state=42
)

In [9]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [10]:
def encode_texts(texts):
    return tokenizer.batch_encode_plus(
        texts,
        add_special_tokens=True,
        max_length=128,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )

train_encodings = encode_texts(train_texts)
val_encodings = encode_texts(val_texts)
test_encodings = encode_texts(test_data['STORY'].values)

In [11]:
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    torch.tensor(train_labels)
)
val_dataset = TensorDataset(
    val_encodings['input_ids'],
    val_encodings['attention_mask'],
    torch.tensor(val_labels)
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask']
)

In [12]:
batch_size = 32
train_dataloader = DataLoader(train_dataset, sampler=RandomSampler(train_dataset), batch_size=batch_size)
val_dataloader = DataLoader(val_dataset, sampler=SequentialSampler(val_dataset), batch_size=batch_size)
test_dataloader = DataLoader(test_dataset, sampler=SequentialSampler(test_dataset), batch_size=batch_size)

In [13]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
    output_attentions=False,
    output_hidden_states=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [15]:
epochs = 4

for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{epochs}'):
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        optimizer.zero_grad()
    
    model.eval()
    val_accuracy = 0
    for batch in val_dataloader:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        
        with torch.no_grad():
            outputs = model(**inputs)
        
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        val_accuracy += (predictions == inputs['labels']).float().mean().item()
    
    val_accuracy /= len(val_dataloader)
    print(f'Validation Accuracy: {val_accuracy:.4f}')

Epoch 1/4: 100%|██████████| 191/191 [01:52<00:00,  1.70it/s]


Validation Accuracy: 0.9701


Epoch 2/4: 100%|██████████| 191/191 [01:52<00:00,  1.70it/s]


Validation Accuracy: 0.9789


Epoch 3/4: 100%|██████████| 191/191 [01:52<00:00,  1.70it/s]


Validation Accuracy: 0.9662


Epoch 4/4: 100%|██████████| 191/191 [01:52<00:00,  1.70it/s]


Validation Accuracy: 0.9730


In [16]:
model.eval()
test_predictions = []

for batch in tqdm(test_dataloader, desc='Testing'):
    batch = tuple(t.to(device) for t in batch)
    inputs = {'input_ids': batch[0], 'attention_mask': batch[1]}
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    test_predictions.extend(predictions.cpu().numpy())

Testing: 100%|██████████| 86/86 [00:16<00:00,  5.23it/s]


In [17]:
submission_df = pd.DataFrame({'SECTION': test_predictions})

In [18]:
submission_df["SECTION"].value_counts()

SECTION
1    1201
2     817
0     400
3     330
Name: count, dtype: int64

In [19]:
submission_df

Unnamed: 0,SECTION
0,1
1,2
2,1
3,1
4,1
...,...
2743,1
2744,1
2745,1
2746,3
