<a href="https://www.kaggle.com/code/nur988/sentiment-classification-using-transformers?scriptVersionId=93193464" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
import logging
logging.basicConfig(level='ERROR')
import transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import torch
import torch.nn as nn
from sklearn import metrics
from sklearn.model_selection import train_test_split
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import warnings
warnings.filterwarnings("ignore")


In [2]:
# Central settings read by the rest of the notebook: tokenised sequence length,
# batch sizes, epoch count, pretrained checkpoint name and the CSV locations.
config = dict(
    max_len=512,
    train_batch=8,
    valid_batch=4,
    # The training loop iterates over range(config['epochs']); 0 runs no epochs.
    epochs=0,
    model_path="bert-base-uncased",
    train_path="../input/nlp-getting-started/train.csv",
    test_path="../input/nlp-getting-started/test.csv",
)

In [3]:
# Load the BERT tokenizer for the checkpoint named in config['model_path'],
# with do_lower_case enabled.
tokenizer = transformers.BertTokenizer.from_pretrained(
    config['model_path'],
    do_lower_case=True,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

# Dataset Class

In [4]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        text: Indexable collection of raw texts; ``text[index]`` is passed through ``str``.
        target: Indexable collection of labels, indexed the same way as ``text``.
        tokenizer: Optional object exposing ``encode_plus``. When omitted, a
            ``BertTokenizer`` is loaded from ``config['model_path']``.
        max_len: Optional length every sequence is padded/truncated to. When
            omitted, ``config['max_len']`` is looked up at item-access time.
    """

    def __init__(self, text, target, tokenizer=None, max_len=None):
        self.text = text
        self.target = target
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                config['model_path'], do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded text and label at ``index`` as a dict of tensors.

        Keys: ``ids``, ``masks``, ``token_type_ids`` (all ``torch.long``) and
        ``target`` (``torch.float``).
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.text[index]).split())
        target = self.target[index]
        max_len = config['max_len'] if self.max_len is None else self.max_len

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            max_length=max_len,
            truncation=True,
            padding="max_length",
        )

        return {
            "ids": torch.tensor(encoded['input_ids'], dtype=torch.long),
            "masks": torch.tensor(encoded['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded['token_type_ids'], dtype=torch.long),
            "target": torch.tensor(target, dtype=torch.float),
        }
        
        
        

In [5]:
class Bertmodel(nn.Module):
    """Pretrained BERT encoder followed by a single-output linear layer.

    The encoder is loaded from ``config['model_path']``. ``forward`` returns the
    raw output of the linear layer (no sigmoid is applied here).
    """

    def __init__(self):
        super().__init__()
        self.bert = transformers.BertModel.from_pretrained(config['model_path'])
        # Kept as an attribute of the module; forward() does not apply it.
        self.dropout = nn.Dropout(0.3)
        # Size the head from the loaded checkpoint instead of hard-coding 768,
        # so a checkpoint with a different hidden width also works.
        self.fc = nn.Linear(self.bert.config.hidden_size, 1)

    def forward(self, ids, mask, token_type):
        """Encode the batch and map the encoder's second output to one value per row.

        Args:
            ids: Token id tensor passed to the encoder as its first argument.
            mask: Passed to the encoder as ``attention_mask``.
            token_type: Passed to the encoder as ``token_type_ids``.
        """
        # With return_dict=False the encoder returns a tuple; only its second
        # element is used (the pooled output, per the BertModel docs).
        _, pooled = self.bert(
            ids,
            attention_mask=mask,
            token_type_ids=token_type,
            return_dict=False,
        )
        return self.fc(pooled)

In [6]:
def loss_fn(output,target):
    """Binary cross-entropy on raw logits, with ``target`` viewed as a single column."""
    criterion = nn.BCEWithLogitsLoss()
    column_target = target.view(-1, 1)
    return criterion(output, column_target)

In [7]:
def train_fn(data_loader,model,optimizer,device,scheduler,criterion=None):
    """Run one full pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``ids``, ``masks``, ``token_type_ids`` and ``target``.
        model: Module called as ``model(ids, masks, token_type)``.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        criterion: Optional callable ``(preds, target) -> loss``. Defaults to
            the notebook's ``loss_fn``.

    Returns:
        Mean of the per-batch loss values as a float, or ``None`` when the
        loader yields no batches.
    """
    if criterion is None:
        criterion = loss_fn

    model.train()

    running_loss = 0.0
    num_batches = 0
    for data in data_loader:
        ids = data['ids'].to(device, dtype=torch.long)
        masks = data['masks'].to(device, dtype=torch.long)
        token_type = data['token_type_ids'].to(device, dtype=torch.long)
        target = data['target'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        preds = model(ids, masks, token_type)
        loss = criterion(preds, target)
        loss.backward()
        optimizer.step()
        scheduler.step()

        running_loss += loss.item()
        num_batches += 1

    # The return must sit after the loop: returning from inside it would stop
    # the epoch after the very first batch.
    if num_batches == 0:
        return None
    return running_loss / num_batches
        
        
        
        
    

# Evaluation function

In [8]:
def eval_fn(data_loader,model,device):
    """Score every batch of ``data_loader`` with ``model`` in eval mode.

    Args:
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``ids``, ``masks``, ``token_type_ids`` and ``target``.
        model: Module called as ``model(ids, masks, token_type)``.
        device: Device the input tensors are moved to.

    Returns:
        ``(fin_outputs, fin_targets)``: the sigmoid of the model outputs and the
        targets, each accumulated over all batches and converted to Python
        lists. Both lists are empty when the loader yields no batches.
    """
    fin_targets = []
    fin_outputs = []

    model.eval()

    with torch.no_grad():
        for data in data_loader:
            ids = data['ids'].to(device, dtype=torch.long)
            masks = data['masks'].to(device, dtype=torch.long)
            token_type = data['token_type_ids'].to(device, dtype=torch.long)
            target = data['target'].to(device, dtype=torch.float)

            preds = model(ids, masks, token_type)

            fin_targets.extend(target.detach().cpu().tolist())
            fin_outputs.extend(torch.sigmoid(preds).detach().cpu().tolist())

    # Return only once the whole loader has been consumed; returning inside the
    # loop would report results for the first batch alone.
    return fin_outputs, fin_targets
        
    
    

In [9]:
# Use the GPU when one is visible to PyTorch; otherwise fall back to the CPU so
# the notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = Bertmodel()
model.to(device)
def train():
    """Fine-tune the global ``model`` and print validation accuracy per epoch.

    Reads ``config['train_path']``, fills missing values with ``"none"``, holds
    out 20% of the rows (stratified on ``target``) for validation, and runs
    ``config['epochs']`` rounds of ``train_fn`` followed by ``eval_fn``.
    Uses the module-level ``model`` and ``device``. Returns ``None``.
    """
    df = pd.read_csv(config["train_path"]).fillna("none")
    df_train, df_valid = train_test_split(
        df,
        test_size=0.2,
        random_state=42,
        stratify=df.target.values,
    )
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = SentimentDataset(df_train.text.values, df_train.target.values)
    valid_dataset = SentimentDataset(df_valid.text.values, df_valid.target.values)

    # Reshuffle the training rows every epoch; validation order stays fixed.
    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=config['train_batch'],
        shuffle=True,
        num_workers=1,
    )
    valid_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=config['valid_batch'],
        num_workers=1,
    )

    # train_fn steps the scheduler inside its batch loop, so the schedule length
    # is the real number of batches (including a partial last one) times the
    # number of epochs.
    num_train_steps = len(train_loader) * config['epochs']

    # Apply weight decay to everything except biases and LayerNorm parameters.
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps,
    )

    for epoch in range(config['epochs']):
        loss = train_fn(train_loader, model, optimizer, device, scheduler)
        print(f"Train_Loss-->>{loss}")

        outputs, targets = eval_fn(valid_loader, model, device)
        # Turn sigmoid outputs into hard predictions at the 0.5 threshold.
        outputs = np.array(outputs) >= 0.5

        accuracy = metrics.accuracy_score(targets, outputs)
        print(f"Epoch_{epoch}-->Accuracy---->{accuracy}")
    
    

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
# Run the training/validation loop; the number of epochs comes from config['epochs'].
train()

In [11]:
class TestDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one unlabeled text per item.

    Args:
        text: Indexable collection of raw texts; ``text[index]`` is passed through ``str``.
        tokenizer: Optional object exposing ``encode_plus``. When omitted, a
            ``BertTokenizer`` is loaded from ``config['model_path']``.
        max_len: Optional length every sequence is padded/truncated to. When
            omitted, ``config['max_len']`` is looked up at item-access time.
    """

    def __init__(self, text, tokenizer=None, max_len=None):
        self.text = text
        if tokenizer is None:
            tokenizer = transformers.BertTokenizer.from_pretrained(
                config['model_path'], do_lower_case=True
            )
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of texts held by the dataset."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the encoded text at ``index`` as a dict of ``torch.long`` tensors.

        Keys: ``ids``, ``masks`` and ``token_type_ids``.
        """
        # Collapse every run of whitespace into a single space before tokenizing.
        text = " ".join(str(self.text[index]).split())
        max_len = config['max_len'] if self.max_len is None else self.max_len

        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`.
        encoded = self.tokenizer.encode_plus(
            text,
            None,
            max_length=max_len,
            truncation=True,
            padding="max_length",
        )

        return {
            "ids": torch.tensor(encoded['input_ids'], dtype=torch.long),
            "masks": torch.tensor(encoded['attention_mask'], dtype=torch.long),
            "token_type_ids": torch.tensor(encoded['token_type_ids'], dtype=torch.long),
        }

In [12]:
# Read the test split from the path declared in config rather than repeating
# the literal, so the location only has to be changed in one place.
df_test = pd.read_csv(config["test_path"])


In [13]:
# Wrap the test texts in the inference dataset and serve them in batches of 8.
test_dataset = TestDataset(text=df_test.text.values)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=8,
)

In [14]:
# Put the model in inference mode explicitly. A freshly built module starts in
# training mode, and nothing earlier switches it when no epoch is run, so
# dropout would otherwise stay active while predicting.
model.eval()

total_preds = []
with torch.no_grad():
    for data in test_loader:
        ids = data["ids"].to(device, dtype=torch.long)
        mask = data["masks"].to(device, dtype=torch.long)
        token_type = data['token_type_ids'].to(device, dtype=torch.long)

        output = model(ids, mask, token_type)
        # Sigmoid of the model output, moved to the CPU and stored as Python lists.
        preds = torch.sigmoid(output).cpu().detach()
        total_preds.extend(preds.numpy().tolist())

In [15]:
# Convert the collected sigmoid outputs into booleans at the 0.5 threshold.
total_preds = np.greater_equal(np.array(total_preds), 0.5)


In [16]:
# Fill the sample submission with the integer predictions, write it out, and
# preview the first rows.
submission = pd.read_csv(
    "../input/nlp-getting-started/sample_submission.csv"
)
submission['target'] = total_preds.astype(int)
submission.to_csv(
    "submission.csv",
    index=False,
)
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
