<a href="https://colab.research.google.com/github/NirantK/Hinglish/blob/RoBERTa/RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# !pip install sentencepiece
# !pip install cleantext
# import nltk
# nltk.download('stopwords')
# !pip install tqdm --upgrade --force

In [0]:
# Install fairseq; later cells call its fairseq-preprocess and fairseq-train commands.
# NOTE(review): the version is not pinned, and later cells run
# `python -m examples.roberta.multiprocessing_bpe_encoder` and `python train.py`,
# which look like they expect a fairseq source checkout as the working directory —
# confirm how that checkout is obtained on a fresh runtime.
!pip install fairseq

In [0]:
# !git clone https://github.com/pytorch/fairseq.git
# %cd ..
# !ls
# !pip install --editable .

In [0]:
# !cp drive/My\ Drive/Hinglish/big/lm_data.txt hinglish/

In [0]:
from datetime import datetime
import sentencepiece as spm
from pathlib import Path
import cleantext
from tqdm import tqdm
import pandas as pd
import gdown

# Register tqdm's pandas integration; clean() below relies on Series.progress_apply.
tqdm.pandas()
# Folder that the (commented-out) copy of lm_data.txt in the cell above reads from.
# NOTE(review): data_folder is not referenced by any later cell shown here — confirm it is still needed.
data_folder = Path("drive/My Drive/Hinglish/big")

In [0]:
from sklearn.model_selection import train_test_split
# Read the language-model corpus, one example per line. readlines() keeps each
# line's trailing newline, exactly as before.
# The context manager closes the file handle, and the explicit encoding keeps the
# read independent of the runtime's locale.
with open('hinglish/lm_data.txt', encoding='utf-8') as corpus_file:
    data = corpus_file.readlines()

# Two chained splits with a fixed seed: 20% of the corpus becomes the test set, then
# 20% of the remainder becomes the validation set (64% train / 16% valid / 20% test).
train, test = train_test_split(data, test_size=0.2, random_state=1)
train, valid = train_test_split(train, test_size=0.2, random_state=1)
len(train), len(test), len(valid)

In [0]:
import re
# Patterns are applied in this order; every match is deleted from the tweet.
_TWEET_NOISE_PATTERNS = tuple(
    re.compile(pattern)
    for pattern in (
        r"RT\s@\w+:",  # retweet header, e.g. "RT @user:"
        r"@\w+ ?",  # @mentions (plus one trailing space)
        r"\bRT\b",  # stand-alone "RT" marker; \b keeps words such as "SMART" intact
        r"#\w+ ?",  # hashtags (plus one trailing space)
        r"http\S+",  # URLs
    )
)


def _strip_tweet_noise(text):
    """Delete retweet markers, @mentions, hashtags and URLs from one tweet."""
    for pattern in _TWEET_NOISE_PATTERNS:
        text = pattern.sub("", text)
    return text


def clean(df, col):
    """Clean Twitter text and store the result in a "clean_text" column.

    Each value of ``df[col]`` first has retweet markers, @mentions, hashtags and
    URLs deleted, and is then passed through ``cleantext.clean(..., all=True)``.

    The frame is modified in place (the "clean_text" column is added or
    overwritten) and the same object is returned. ``tqdm.pandas()`` must have
    been called beforehand, because the work is done with ``progress_apply``.

    Arguments:
        df {[pandas dataframe]} -- Dataset that needs to be cleaned
        col {[string]} -- column in which the raw text is present

    Returns:
        [pandas dataframe] -- The same dataframe, with a "clean_text" column
    """
    # A single pass over the column: strip the Twitter noise, then normalise.
    df["clean_text"] = df[col].progress_apply(
        lambda text: cleantext.clean(_strip_tweet_noise(text), all=True)
    )
    return df


# Smoke test: run clean() on one made-up retweet and display the resulting frame.
sample_tweet = "RT @meghana https://something hello"
toy = pd.DataFrame({"text": [sample_tweet]})
clean(toy, "text")

In [0]:
def clean_text(data):
    """Run clean() over a sequence of raw strings and return the cleaned strings as a list."""
    frame = pd.DataFrame(data, columns=["text"])
    cleaned = clean(frame, "text")
    return list(cleaned["clean_text"])
    
# Clean every split with the same pipeline, rebinding each name to its list of cleaned strings.
train, test, valid = (clean_text(split) for split in (train, test, valid))

In [0]:
# Peek at the first five cleaned examples of each split.
train[:5], test[:5], valid[:5]

In [0]:
# Persist every cleaned split, one example per line, as hinglish/lm_data.<split>.txt.
# Later cells copy and BPE-encode the train, valid and test files, so all three must
# be written here — not only the validation split.
for split_name, split_lines in (("train", train), ("valid", valid), ("test", test)):
    with open(f'hinglish/lm_data.{split_name}.txt', 'w', encoding='utf-8') as f:
        for item in split_lines:
            f.write("%s\n" % item)

In [0]:
# Copy the three split files into the Hinglish folder on Drive.
# NOTE(review): this uses ../drive/... while earlier cells use drive/... — presumably the
# working directory changed in a step not shown here; confirm before a fresh run.
!cp hinglish/lm_data.train.txt  ../drive/My\ Drive/Hinglish/ 
!cp hinglish/lm_data.test.txt  ../drive/My\ Drive/Hinglish/ 
!cp hinglish/lm_data.valid.txt  ../drive/My\ Drive/Hinglish/ 

In [0]:
# Fetch the GPT-2 BPE files into gpt2_bpe/ so this cell also works on a fresh runtime.
# wget -N skips the download when the local copy is already up to date.
!mkdir -p gpt2_bpe
!wget -N -P gpt2_bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json
!wget -N -P gpt2_bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe
# BPE-encode each split: hinglish/lm_data.<split>.txt -> hinglish/lm_data.<split>.bpe.
!for SPLIT in train valid test; do python -m examples.roberta.multiprocessing_bpe_encoder --encoder-json gpt2_bpe/encoder.json --vocab-bpe gpt2_bpe/vocab.bpe --inputs hinglish/lm_data.${SPLIT}.txt --outputs hinglish/lm_data.${SPLIT}.bpe --keep-empty --workers 60; done

In [0]:
# Fetch the dictionary passed as --srcdict below, so this cell also works on a fresh runtime.
# wget -N skips the download when the local copy is already up to date.
!wget -N -P gpt2_bpe https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt
# Binarize the BPE-encoded train/valid/test splits into data-bin/hinglish.
!fairseq-preprocess --only-source --srcdict gpt2_bpe/dict.txt --trainpref hinglish/lm_data.train.bpe --validpref hinglish/lm_data.valid.bpe --testpref hinglish/lm_data.test.bpe --destdir data-bin/hinglish --workers 60

In [0]:
# Pre-train a roberta_base masked language model on the binarized corpus in data-bin/hinglish.
# Batching: --max-sentences 16 with --update-freq 16, sequences of up to 512 tokens.
# Schedule: polynomial decay with 10,000 warm-up updates, peak lr 0.0005, 125,000 updates in total.
# No --save-dir is given; the next cell loads checkpoint_best.pt from `checkpoints`.
!fairseq-train  data-bin/hinglish --task masked_lm --criterion masked_lm --arch roberta_base --sample-break-mode complete --tokens-per-sample 512 --optimizer adam --adam-betas '(0.9,0.98)' --adam-eps 1e-6 --clip-norm 0.0 --lr-scheduler polynomial_decay --lr 0.0005 --warmup-updates 10000 --total-num-update 125000 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.01 --max-sentences 16 --update-freq 16 --max-update 125000 --log-format simple --log-interval 1

In [0]:
from fairseq.models.roberta import RobertaModel
import torch
# Load the best pre-training checkpoint together with the binarized-data directory.
checkpoint_dir = 'checkpoints'
checkpoint_name = 'checkpoint_best.pt'
data_bin_dir = '/content/fairseq/data-bin/hinglish'
roberta = RobertaModel.from_pretrained(checkpoint_dir, checkpoint_name, data_bin_dir)
# Sanity check: the loaded wrapper must expose a torch module as `.model`.
assert isinstance(roberta.model, torch.nn.Module)

In [0]:
roberta.eval()  # disable dropout for evaluation

# Encode a pair of sentences with the loaded model's encoder.
# No prediction is made in this cell; the encoded result is inspected in the next one.
tokens = roberta.encode('Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.')

In [0]:
# Display the encoded sentence pair.
tokens

In [0]:
# List the checkpoint files written so far.
!ls /content/fairseq/checkpoints

In [0]:
# Put the binarized data's dict.txt next to the checkpoints.
# NOTE(review): presumably so the checkpoint folder can be loaded on its own — confirm.
!cp /content/fairseq/data-bin/hinglish/dict.txt /content/fairseq/checkpoints

In [0]:
# Back up the checkpoints, the binarized data and the text/BPE files to the Hinglish folder on Drive.
!cp -r checkpoints ../drive/My\ Drive/Hinglish/
!cp -r data-bin ../drive/My\ Drive/Hinglish/
!cp -r hinglish ../drive/My\ Drive/Hinglish/

In [0]:
# Bring train.json and test.json from Drive into the local hinglish/ folder;
# the following cells read them with pd.read_json.
!cp ../drive/My\ Drive/Hinglish/interim/train.json hinglish 
!cp ../drive/My\ Drive/Hinglish/interim/test.json hinglish 

In [0]:
# Load the labelled training tweets and add the "clean_text" column in one step.
train_df = clean(pd.read_json("hinglish/train.json"), 'text')
train_df.head()

In [0]:
from sklearn import preprocessing
# Learn the sentiment-label -> integer-id mapping from the training labels
# (fit() returns the encoder itself), then display the fitted encoder.
le = preprocessing.LabelEncoder().fit(train_df['sentiment'])
le

In [0]:
# Show the fitted classes and the labels that ids 0, 1 and 2 decode to.
# NOTE(review): the ids are hard-coded — assumes exactly three sentiment classes
# (consistent with --num-classes 3 in the fine-tuning command); confirm.
le.classes_, le.inverse_transform([0, 1, 2])

In [0]:
# Write the integer-encoded training labels, one per line.
encoded_train_labels = le.transform(train_df['sentiment'])
with open('hinglish/train.labels', 'w') as labels_file:
    labels_file.writelines("%s\n" % label for label in encoded_train_labels)

In [0]:
# Write the cleaned training tweets, one per line, in the same row order as the labels file.
with open('hinglish/train.input0', 'w') as inputs_file:
    inputs_file.writelines("%s\n" % tweet for tweet in train_df['clean_text'])

In [0]:
# Load the labelled test tweets and add the "clean_text" column.
test_df = pd.read_json("hinglish/test.json")
test_df = clean(test_df, 'text')

# Write the integer-encoded test labels, one per line, using the encoder fitted on
# the training labels.
with open('hinglish/test.labels', 'w') as f:
    for item in le.transform(test_df['sentiment']):
        f.write("%s\n" % item)

# The preview goes last: only a cell's final expression is rendered, so the
# previous mid-cell head() call displayed nothing.
test_df.head()

In [0]:
# Write the cleaned test tweets, one per line, in the same row order as the labels file.
with open('hinglish/test.input0', 'w') as inputs_file:
    inputs_file.writelines("%s\n" % tweet for tweet in test_df['clean_text'])

In [0]:
# Download encoder.json and vocab.bpe into the working directory, where the encoder
# command below looks for them. wget -N only re-downloads when the remote file is
# newer, so re-running the cell is cheap and a fresh runtime still works.
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'

# BPE-encode the training inputs: hinglish/train.input0 -> hinglish/train.input0.bpe.
!python -m examples.roberta.multiprocessing_bpe_encoder --encoder-json encoder.json --vocab-bpe vocab.bpe --inputs "hinglish/train.input0" --outputs "hinglish/train.input0.bpe" --workers 60 --keep-empty

In [0]:
# BPE-encode the test inputs: hinglish/test.input0 -> hinglish/test.input0.bpe.
# Reads encoder.json and vocab.bpe from the working directory, like the training-input cell.
!python -m examples.roberta.multiprocessing_bpe_encoder --encoder-json encoder.json --vocab-bpe vocab.bpe --inputs "hinglish/test.input0" --outputs "hinglish/test.input0.bpe" --workers 60 --keep-empty

In [0]:
# Download fairseq dictionary.
!wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'  

# Binarize the BPE-encoded classifier inputs into hinglish-class/input0.
# NOTE(review): the test split is passed as --validpref, and the fine-tuning command
# selects its best checkpoint by validation accuracy — so model selection sees the
# test data. Confirm this is intended.
!fairseq-preprocess --only-source --trainpref "hinglish/train.input0.bpe" --validpref "hinglish/test.input0.bpe" --destdir "hinglish-class/input0" --workers 60 --srcdict dict.txt

In [0]:
# Binarize the label files into hinglish-class/label. No --srcdict is passed here,
# unlike the input0 step. The test labels again go in as --validpref.
!fairseq-preprocess --only-source --trainpref "hinglish/train.labels" --validpref "hinglish/test.labels" --destdir "hinglish-class/label" --workers 60

In [0]:
# Fine-tune the pre-trained checkpoint as a 3-class sentence classifier (head "hinglish_head")
# on hinglish-class/, for up to 10 epochs, keeping the checkpoint with the best validation accuracy.
# Optimizer, dataloader and meters are reset, so only the weights come from checkpoint_best.pt.
# NOTE(review): no --save-dir is passed; if the output directory defaults to `checkpoints`,
# this run would overwrite the pre-training checkpoint_best.pt it restores from — confirm.
# NOTE(review): `python train.py` presumably requires the fairseq source checkout as the
# working directory — confirm.
!python train.py hinglish-class/ --restore-file checkpoints/checkpoint_best.pt --max-positions 512 --max-sentences 8 --max-tokens 4400 --task sentence_prediction --reset-optimizer --reset-dataloader --reset-meters --required-batch-size-multiple 1 --init-token 0 --separator-token 2 --criterion sentence_prediction --classification-head-name hinglish_head --num-classes 3 --dropout 0.1 --attention-dropout 0.1 --weight-decay 0.1 --optimizer adam --adam-betas "(0.9, 0.98)" --adam-eps 1e-06 --clip-norm 0.0 --lr-scheduler polynomial_decay --lr 1e-05 --total-num-update 7812 --warmup-updates 469 --fp16 --fp16-init-scale 4 --threshold-loss-scale 1 --fp16-scale-window 128 --max-epoch 10 --best-checkpoint-metric accuracy --maximize-best-checkpoint-metric --truncate-sequence --find-unused-parameters --update-freq 4 --arch roberta_base