In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import os 
import pandas as pd
import numpy as np 
import transformers
import pytorch_lightning as pl
from sklearn.model_selection import train_test_split
from datasets import load_dataset, Dataset
import torchmetrics
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import MobileBertModel, MobileBertConfig, MobileBertTokenizer, MobileBertForSequenceClassification


In [None]:
full_data = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')

In [None]:
full_data.head(n=20)

In [None]:
train_df, val_df = train_test_split(full_data, test_size = 0.05)

In [None]:
train_df.shape, val_df.shape

In [None]:
Label_Columns = ['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']

In [None]:
class TestAESDataset(torch.utils.data.Dataset):
    """Map-style dataset that tokenizes one DataFrame row per item.

    The class implements ``__len__`` / ``__getitem__``, i.e. the map-style
    dataset protocol, so it derives from ``torch.utils.data.Dataset`` (a
    LightningDataModule is not indexable and is not what a DataLoader expects).

    Args:
        data: DataFrame with a ``full_text`` column; the label columns are optional.
        tokenizer: Callable invoked as
            ``tokenizer(text, add_special_tokens=..., max_length=..., truncation=...,
            padding=..., return_attention_mask=..., return_tensors='pt')``.
            It must return a mapping with ``input_ids`` and ``attention_mask``
            tensors.
        max_token_len: Length passed to the tokenizer as ``max_length``.
        label_columns: Columns packed into the ``labels`` tensor. Defaults to
            ``DEFAULT_LABEL_COLUMNS``.
    """

    DEFAULT_LABEL_COLUMNS = (
        'cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions',
    )

    def __init__(
        self,
        data,
        tokenizer,
        max_token_len = 512,
        label_columns = None
        ):
        super().__init__()
        # The original line read `self.tokenizer - tokenizer`, which is a
        # subtraction expression and raised AttributeError instead of assigning.
        self.tokenizer = tokenizer
        self.data = data
        self.max_token_len = max_token_len
        if label_columns is None:
            label_columns = self.DEFAULT_LABEL_COLUMNS
        self.label_columns = list(label_columns)

    def __len__(self):
        """Return the number of rows in the wrapped DataFrame."""
        return len(self.data)

    def __getitem__(self, index: int):
        """Tokenize row ``index`` and return it as a dict of 1-D tensors.

        Returns:
            ``{'input_ids', 'attention_mask'}`` and, only when every label
            column is present in the row, a float32 ``'labels'`` tensor.
        """
        data_row = self.data.iloc[index]
        comment_text = data_row['full_text']

        encoding = self.tokenizer(
            comment_text,
            add_special_tokens=True,
            max_length=self.max_token_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        # flatten() drops the leading batch axis of size 1 that
        # return_tensors='pt' adds for a single text.
        item = {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

        # Unlabelled frames (e.g. a test split) simply yield no 'labels' entry.
        if all(column in data_row.index for column in self.label_columns):
            scores = data_row[self.label_columns].to_numpy(dtype='float32')
            item['labels'] = torch.tensor(scores)
        return item

In [None]:
# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU, and print which one was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("GPU is available" if device.type == "cuda" else "GPU not available, CPU used")

In [None]:
train_csv = pd.read_csv("../input/feedback-prize-english-language-learning/train.csv")

In [None]:
train_csv['labels']=''

In [None]:
for index, row in train_csv.iterrows():
    train_csv.at[index, 'labels'] = np.zeros((6))
    train_csv.at[index, 'labels'][0] = row['cohesion']
    train_csv.at[index, 'labels'][1] = row['syntax']
    train_csv.at[index, 'labels'][2] = row['vocabulary']
    train_csv.at[index, 'labels'][3] = row['phraseology']
    train_csv.at[index, 'labels'][4] = row['grammar']
    train_csv.at[index, 'labels'][5] = row['conventions']
   

In [None]:
train_csv 

In [None]:
tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')

In [None]:

def tokenizer_function(text):
    return tokenizer(text['full_text'], add_special_tokens=True, max_length=512,truncation=True, padding='max_length')

In [None]:
train_full = Dataset.from_dict(train_csv)

In [None]:
train_full

In [None]:
train_tokenized = train_full.map(tokenizer_function, batched= True)

In [None]:
train_tokenized.column_names

In [None]:
train_tokenized = train_tokenized.remove_columns(['full_text', 'text_id', 'cohesion',
 'syntax',
 'vocabulary',
 'phraseology',
 'grammar',
 'conventions'])

In [None]:
train_tokenized.column_names

In [None]:
train_tokenized

In [None]:
train_tokenized = train_tokenized.train_test_split(train_size=0.95)

In [None]:
train_tokenized

In [None]:
type(train_tokenized['train']['labels'][0])

In [None]:
model_config =  MobileBertConfig.from_pretrained('google/mobilebert-uncased')


In [None]:
class AESDataModule(pl.LightningDataModule):
    """LightningDataModule that tokenizes train.csv / test.csv for MobileBERT.

    ``setup`` reads both CSV files from ``data_dir``, packs the six score
    columns into a ``labels`` vector, tokenizes ``full_text`` and splits the
    training rows 95% / 5% into train and validation sets.

    Args:
        data_dir: Directory that contains ``train.csv`` and ``test.csv``.
        batch_size: Batch size used by all three dataloaders.
        max_length: Length every text is truncated / padded to.
        seed: Seed of the train/validation split, so every process and every
            run sees the same partition.
    """

    # Score columns, in the order they are packed into `labels`.
    LABEL_COLUMNS = ('cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions')

    def __init__(self, data_dir: str = '../input/feedback-prize-english-language-learning/', batch_size: int = 64, max_length: int = 512, seed: int = 42):
        super().__init__()
        self.tokenizer = MobileBertTokenizer.from_pretrained('google/mobilebert-uncased')
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.max_length = max_length
        self.seed = seed

    def tokenizer_function(self, text):
        """Tokenize ``text['full_text']``, truncating / padding to ``self.max_length``."""
        # Previously hard-coded to 512, which silently ignored the constructor argument.
        return self.tokenizer(text['full_text'], max_length=self.max_length, truncation=True, padding='max_length')

    def setup(self, stage=None):
        """Build ``train_ds``, ``val_ds`` and ``test_ds``.

        Args:
            stage: Accepted for the Lightning hook signature; not used.
        """
        label_columns = list(self.LABEL_COLUMNS)
        text_columns = ['full_text', 'text_id']

        # Paths are derived from data_dir instead of being hard-coded a second time.
        train_csv = pd.read_csv(os.path.join(self.data_dir, 'train.csv'))
        # One float64 vector of six scores per row (vectorised, no iterrows loop).
        train_csv['labels'] = list(train_csv[label_columns].to_numpy(dtype=float))

        train_full = Dataset.from_dict(train_csv)
        test_full = load_dataset("csv", data_files=os.path.join(self.data_dir, 'test.csv'))

        train_tokenized = train_full.map(self.tokenizer_function, batched=True)
        test_tokenized = test_full.map(self.tokenizer_function, batched=True)

        train_tokenized = train_tokenized.remove_columns(text_columns + label_columns)
        test_tokenized = test_tokenized.remove_columns(text_columns)

        # Seeded so the validation rows are the same on every call and every run.
        train_tokenized = train_tokenized.train_test_split(train_size=0.95, seed=self.seed)

        # NOTE: no tensor format is set on the datasets; the consuming module
        # stacks the collated columns itself with torch.stack(..., dim=1).
        self.train_ds = train_tokenized['train']
        self.val_ds = train_tokenized['test']
        # load_dataset("csv", ...) files the single CSV under the 'train' key.
        self.test_ds = test_tokenized['train']

    def train_dataloader(self):
        """Training loader; shuffled so each epoch sees the rows in a new order."""
        return DataLoader(self.train_ds, batch_size=self.batch_size, shuffle=True, num_workers=2)

    def val_dataloader(self):
        """Validation loader (fixed order)."""
        return DataLoader(self.val_ds, batch_size=self.batch_size, num_workers=2)

    def test_dataloader(self):
        """Test loader (fixed order)."""
        return DataLoader(self.test_ds, batch_size=self.batch_size, num_workers=2)


In [None]:
# Run-level constants.
NUM_CLASSES = 6
BATCH_SIZE = 12

# Data module for this run; all other constructor arguments keep their defaults.
data_module = AESDataModule(batch_size=BATCH_SIZE)

In [None]:
class AESModule(pl.LightningModule):
    """MobileBERT with a multi-output head trained as a regressor.

    The logits of ``MobileBertForSequenceClassification`` are fitted to the
    targets rescaled as ``(target - 3.0) / 2``. The training loss is
    ``MSE + alpha * mean(1 - cosine_similarity) + beta * mean(L2 norm of logits)``;
    validation reports the plain MSE.

    Args:
        alpha: Weight of the cosine-similarity term of the training loss.
        beta: Weight of the logit-norm penalty of the training loss.
        lr: Learning rate handed to Adam.
        num_labels: Number of outputs of the head.
    """

    def __init__(self, alpha: float = 1, beta: float = 0.001, lr: float = 1e-3, num_labels: int = 6):
        super().__init__()
        self.loss = nn.MSELoss()
        self.gain = nn.CosineSimilarity()
        # Scalars that mix the individual loss terms.
        self.alpha = alpha
        self.beta = beta
        self.lr = lr
        self.backbone = MobileBertForSequenceClassification.from_pretrained('google/mobilebert-uncased', num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        """Return the backbone logits for a batch of token ids and masks."""
        # Keyword arguments so the mask cannot be bound to a different positional slot.
        return self.backbone(input_ids=input_ids, attention_mask=attention_mask).logits

    @staticmethod
    def _as_batch_tensor(column):
        """Return ``column`` as one tensor, stacking a sequence of tensors along dim 1."""
        if torch.is_tensor(column):
            # Already collated into a single tensor; nothing to stack.
            return column
        return torch.stack(column, dim=1)

    def _run_batch(self, batch):
        """Decode ``batch`` and run the model.

        Returns:
            ``(outputs, target, scaled_target)`` where ``scaled_target`` is
            ``(target.float() - 3.0) / 2``.
        """
        input_ids = self._as_batch_tensor(batch['input_ids'])
        attention_mask = self._as_batch_tensor(batch['attention_mask'])
        target = self._as_batch_tensor(batch['labels'])

        outputs = self(input_ids, attention_mask)
        # presumably centres a 1-5 score scale on 0 with range [-1, 1] — TODO confirm against the data
        scaled_target = (target.float() - 3.0) / 2
        return outputs, target, scaled_target

    def training_step(self, batch, batch_idx):
        """Compute and log the combined training loss for one batch."""
        outputs, _, scaled_target = self._run_batch(batch)
        prediction = outputs.float()

        mse = self.loss(prediction, scaled_target)
        cosine_penalty = torch.mean(1 - self.gain(prediction, scaled_target))
        norm_penalty = torch.mean(torch.norm(prediction, p='fro', dim=1))
        loss = mse + self.alpha * cosine_penalty + self.beta * norm_penalty

        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Compute and log the MSE for one validation batch."""
        outputs, target, scaled_target = self._run_batch(batch)
        loss = self.loss(outputs.float(), scaled_target)

        self.log("val_loss", loss, prog_bar=True)
        return {"loss": loss, "logits": outputs, "labels": target}

    def configure_optimizers(self):
        """Adam with a StepLR schedule (x0.1 every 7 scheduler steps)."""
        optimizer = torch.optim.Adam(self.parameters(), self.lr)
        sch = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": sch,
                "monitor": "val_loss",
            },
        }

In [None]:
model= AESModule()

In [None]:
logger = TensorBoardLogger("light_logs/", name="AESlog")
checkpoint_callback = ModelCheckpoint(dirpath="checkpoints/",
    filename="best-checkpoint", save_top_k=1, verbose=True, 
    monitor="val_loss", mode="min")

In [None]:
trainer = pl.Trainer(max_epochs=50, gpus=1, callbacks=[checkpoint_callback])

In [None]:
trainer.fit(model, data_module)