# Train

In [None]:
import numpy as np
import pandas as pd
import argparse
import torch 
import os
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import T5Tokenizer, T5ForConditionalGeneration


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


class CustomDataset(Dataset):
    """Tokenized informal/formal sentence pairs backed by a dataframe.

    The ``informal`` column is encoded as the model input and the ``formal``
    column as the target. Both are padded/truncated to fixed lengths.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        """Keep references to the data and the encoding settings.

        Args:
            dataframe: pandas DataFrame with ``formal`` and ``informal`` columns.
            tokenizer: Object providing ``batch_encode_plus``.
            source_len: ``max_length`` used when encoding the informal text.
            summ_len: ``max_length`` used when encoding the formal text.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.formal = self.data.formal
        self.informal = self.data.informal

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.formal)

    def _encode(self, text, max_length):
        """Collapse runs of whitespace in *text* and tokenize it to *max_length*."""
        normalized = ' '.join(str(text).split())
        # NOTE(review): ``pad_to_max_length`` is kept as-is; newer transformers
        # releases prefer ``padding='max_length'`` - confirm the installed
        # version before switching.
        return self.tokenizer.batch_encode_plus(
            [normalized],
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the encoded pair at positional *index*.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (same values as ``target_ids``), all ``torch.long``.
        """
        # Positional lookup: a plain ``series[index]`` is label-based and fails
        # when the dataframe index is not 0..n-1 (e.g. after filtering rows).
        source = self._encode(self.informal.iloc[index], self.source_len)
        target = self._encode(self.formal.iloc[index], self.summ_len)

        # squeeze(0) removes only the batch axis, so a max_length of 1 still
        # yields a 1-D tensor instead of a 0-d scalar.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long),
        }



def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one pass over *loader*, updating *model* with *optimizer*.

    For every batch the target ids are split into decoder inputs (all but the
    last position) and labels (all but the first position). Label positions
    equal to the tokenizer's pad id are set to -100. The loss is taken from
    the first element of the model output.

    Args:
        epoch: Epoch number, used only in the progress message.
        tokenizer: Object exposing ``pad_token_id``.
        model: Callable model; must also provide ``train()``.
        device: Device the batch tensors are moved to.
        loader: Iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.
        optimizer: Optimizer stepping the model parameters.
    """
    model.train()
    for step, batch in enumerate(loader):
        targets = batch['target_ids'].to(device, dtype=torch.long)

        # Decoder sees positions [0, n-1); labels are positions [1, n).
        decoder_inputs = targets[:, :-1].contiguous()
        shifted = targets[:, 1:]
        labels = shifted.clone().detach()
        labels[shifted == tokenizer.pad_token_id] = -100

        input_ids = batch['source_ids'].to(device, dtype=torch.long)
        attention_mask = batch['source_mask'].to(device, dtype=torch.long)

        loss = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_inputs,
            lm_labels=labels,
        )[0]

        # Frequent short log plus a sparser per-epoch log.
        if step % 10 == 0:
            print({"Training Loss": loss.item()})
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()



def validate(epoch, tokenizer, model, device, loader):
    """Generate text for every batch in *loader* and decode it next to the targets.

    Args:
        epoch: Unused by the body; kept for the caller-facing signature.
        tokenizer: Object providing ``decode``.
        model: Object providing ``eval()`` and ``generate(...)``.
        device: Device the batch tensors are moved to.
        loader: Iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings, in loader order.
    """

    def _decode_all(sequences):
        # Decode each sequence, dropping special tokens.
        return [
            tokenizer.decode(seq, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for seq in sequences
        ]

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            batch_predictions = _decode_all(generated_ids)
            batch_references = _decode_all(reference_ids)

            if batch_idx % 100 == 0:
                print(f'Completed {batch_idx}')

            predictions += batch_predictions
            actuals += batch_references
    return predictions, actuals


def get_argments(argv=None):
    """Parse the options of the fine-tuning script.

    Args:
        argv: Optional list of argument strings. With the default ``None``
            argparse reads ``sys.argv[1:]``, which is the previous behaviour.
            Passing an explicit list (e.g. ``[]``) lets the function be used
            from a notebook or a test, where ``sys.argv`` holds foreign flags.

    Returns:
        argparse.Namespace holding the parsed values.
    """
    parser = argparse.ArgumentParser()

    # Batch sizes, schedule and optimisation settings.
    parser.add_argument("--train_batch_size", type=int, default=32)
    parser.add_argument("--valid_batch_size", type=int, default=32)
    parser.add_argument("--train_epochs", type=int, default=10)
    parser.add_argument("--val_epochs", type=int, default=4)
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    parser.add_argument('--seed', type=int, default=42)

    # Maximum token lengths for the source and target sequences.
    parser.add_argument("--max_len", type=int, default=128)
    parser.add_argument("--generate_len", type=int, default=128)

    # Model, tokenizer, data and checkpoint locations.
    parser.add_argument("--pretrained_model", type=str, default='../model/T5')
    parser.add_argument("--tokenizer_dir", type=str, default='../model/T5')
    parser.add_argument("--train_data", type=str, default='../data/informal_to_formal/em/train.csv')
    parser.add_argument("--save_model", type=str, default='../model/informal_to_formal_t5/epoch-')

    return parser.parse_args(argv)
def main():
    """Fine-tune a T5 model on informal -> formal pairs, saving a checkpoint per epoch.

    Reads the CSV named by ``--train_data``, prefixes every informal sentence
    with ``'paraphrase: '`` and appends ``' </s>'`` to both columns, then runs
    ``--train_epochs`` training passes. After each pass the model is saved
    to ``<--save_model><epoch>``.
    """
    args = get_argments()

    # Seed PyTorch and NumPy and request deterministic cuDNN behaviour.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    # Tokenizer used to encode the text.
    tokenizer = T5Tokenizer.from_pretrained(args.tokenizer_dir)

    # Take an explicit copy of the two columns: assigning into a column subset
    # of another frame can trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(args.train_data)[['formal', 'informal']].copy()
    df['informal'] = 'paraphrase: ' + df['informal'] + ' </s>'
    df['formal'] = df['formal'] + ' </s>'
    print(df.head())

    # The whole file is used for training; no validation split is built here.
    train_dataset = df.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(train_dataset.shape))

    training_set = CustomDataset(train_dataset, tokenizer, args.max_len, args.generate_len)

    # Dataloader settings for training.
    train_params = {
        'batch_size': args.train_batch_size,
        'shuffle': True,
        'num_workers': 0,
    }
    training_loader = DataLoader(training_set, **train_params)

    model = T5ForConditionalGeneration.from_pretrained(args.pretrained_model)
    model = model.to(device)

    # Optimizer that updates the network weights during fine-tuning.
    optimizer = torch.optim.Adam(params=model.parameters(), lr=args.learning_rate)

    print('Initiating Fine-Tuning for the model on our dataset')

    for epoch in range(args.train_epochs):
        train(epoch, tokenizer, model, device, training_loader, optimizer)

        # One checkpoint directory per epoch: "<save_model><epoch>".
        output_dir = args.save_model + str(epoch)
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)

    print("Training is over !")
    # NOTE: validate() is not called from this entry point; building a
    # validation loader and writing predictions is left as a TODO.

# Run the training entry point only when executed as a script.
if __name__ == '__main__':
    main()







# Generate

In [None]:
import numpy as np
import pandas as pd
import argparse
import torch 
import os
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers3 import T5Tokenizer, T5ForConditionalGeneration


# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"


class CustomDataset(Dataset):
    """Tokenized informal/formal sentence pairs backed by a dataframe.

    The ``informal`` column is encoded as the model input and the ``formal``
    column as the reference text. Both are padded/truncated to fixed lengths.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        """Keep references to the data and the encoding settings.

        Args:
            dataframe: pandas DataFrame with ``formal`` and ``informal`` columns.
            tokenizer: Object providing ``batch_encode_plus``.
            source_len: ``max_length`` used when encoding the informal text.
            summ_len: ``max_length`` used when encoding the formal text.
        """
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.formal = self.data.formal
        self.informal = self.data.informal

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.formal)

    def _encode(self, text, max_length):
        """Collapse runs of whitespace in *text* and tokenize it to *max_length*."""
        normalized = ' '.join(str(text).split())
        # NOTE(review): ``pad_to_max_length`` is kept as-is; newer transformers
        # releases prefer ``padding='max_length'`` - confirm the installed
        # version before switching.
        return self.tokenizer.batch_encode_plus(
            [normalized],
            max_length=max_length,
            pad_to_max_length=True,
            return_tensors='pt',
            truncation=True,
        )

    def __getitem__(self, index):
        """Return the encoded pair at positional *index*.

        Returns:
            dict with ``source_ids``, ``source_mask``, ``target_ids`` and
            ``target_ids_y`` (same values as ``target_ids``), all ``torch.long``.
        """
        # Positional lookup: a plain ``series[index]`` is label-based and fails
        # when the dataframe index is not 0..n-1 (e.g. after filtering rows).
        source = self._encode(self.informal.iloc[index], self.source_len)
        target = self._encode(self.formal.iloc[index], self.summ_len)

        # squeeze(0) removes only the batch axis, so a max_length of 1 still
        # yields a 1-D tensor instead of a 0-d scalar.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long),
        }





def genrate(tokenizer, model, device, loader):
    """Generate text for every batch in *loader* and decode it next to the targets.

    Args:
        tokenizer: Object providing ``decode``.
        model: Object providing ``eval()`` and ``generate(...)``.
        device: Device the batch tensors are moved to.
        loader: Iterable of dicts with ``source_ids``, ``source_mask`` and
            ``target_ids`` tensors.

    Returns:
        ``(predictions, actuals)``: two lists of decoded strings, in loader order.
    """
    decode_options = {
        'skip_special_tokens': True,
        'clean_up_tokenization_spaces': True,
    }

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch in loader:
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)
            input_ids = batch['source_ids'].to(device, dtype=torch.long)
            attention_mask = batch['source_mask'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            # Decode the generated sequences first, then the references.
            batch_predictions = [tokenizer.decode(seq, **decode_options) for seq in generated_ids]
            batch_references = [tokenizer.decode(seq, **decode_options) for seq in reference_ids]

            predictions += batch_predictions
            actuals += batch_references
    return predictions, actuals


def get_argments(argv=None):
    """Parse the options of the generation script.

    Args:
        argv: Optional list of argument strings. With the default ``None``
            argparse reads ``sys.argv[1:]``, which is the previous behaviour.
            Passing an explicit list (e.g. ``[]``) lets the function be used
            from a notebook or a test, where ``sys.argv`` holds foreign flags.

    Returns:
        argparse.Namespace holding the parsed values.
    """
    parser = argparse.ArgumentParser()

    # Batch size, schedule and optimisation settings.
    parser.add_argument("--gen_batch_size", type=int, default=32)
    parser.add_argument("--train_epochs", type=int, default=10)
    parser.add_argument("--val_epochs", type=int, default=1)
    parser.add_argument("--learning_rate", type=float, default=1e-4)
    parser.add_argument('--seed', type=int, default=42)

    # Maximum token lengths for the source and target sequences.
    parser.add_argument("--max_len", type=int, default=128)
    parser.add_argument("--generate_len", type=int, default=128)

    # Model, tokenizer, data and output locations.
    parser.add_argument("--pretrained_model", type=str, default='../model/T5')
    parser.add_argument("--tokenizer_dir", type=str, default='../model/T5')
    parser.add_argument("--gen_data", type=str, default='../data/informal_to_formal/em/train.csv')
    parser.add_argument("--save_model", type=str, default='../model/informal_to_formal_t5/epoch-')
    # Empty by default, so a real path has to be supplied on the command line.
    parser.add_argument("--output_file", type=str, default='')

    return parser.parse_args(argv)
def main():
    """Generate formal rewrites for a CSV of informal sentences and write them to a file.

    Reads the CSV named by ``--gen_data``, prefixes every informal sentence
    with ``'paraphrase: '``, runs generation with the model loaded from
    ``--pretrained_model`` and writes one prediction per line to
    ``--output_file``.

    Raises:
        ValueError: If ``--output_file`` is empty.
    """
    args = get_argments()

    # Fail before the expensive model load: the default output path is empty
    # and could never be opened for writing.
    if not args.output_file:
        raise ValueError("--output_file must be set to a writable file path")

    # Seed PyTorch and NumPy and request deterministic cuDNN behaviour.
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    tokenizer = T5Tokenizer.from_pretrained(args.tokenizer_dir)

    # Take an explicit copy of the two columns: assigning into a column subset
    # of another frame can trigger pandas' SettingWithCopyWarning.
    df = pd.read_csv(args.gen_data)[['formal', 'informal']].copy()
    df['informal'] = 'paraphrase: ' + df['informal']
    print(df.head())

    gen_dataset = df.reset_index(drop=True)

    print("FULL Dataset: {}".format(df.shape))
    print("TRAIN Dataset: {}".format(gen_dataset.shape))

    gen_set = CustomDataset(gen_dataset, tokenizer, args.max_len, args.generate_len)

    # Keep the input order so line i of the output matches row i of the CSV.
    gen_params = {
        'batch_size': args.gen_batch_size,
        'shuffle': False,
        'num_workers': 0,
    }
    gen_loader = DataLoader(gen_set, **gen_params)

    model = T5ForConditionalGeneration.from_pretrained(args.pretrained_model)
    model = model.to(device)

    # Open the output before generating so an unwritable path fails early;
    # the context manager guarantees the file is flushed and closed.
    with open(args.output_file, "w", encoding="utf-8") as fw:
        print('Now generating summaries on our fine tuned model for the validation dataset and saving it in a dataframe')
        predictions, _ = genrate(tokenizer, model, device, gen_loader)
        fw.writelines(prediction + "\n" for prediction in predictions)

    print("predictions:", predictions)
    print('Output Files generated for review')

# Run the generation entry point only when executed as a script.
if __name__ == '__main__':
    main()





