<a href="https://colab.research.google.com/github/NirantK/Hinglish/blob/utils/utils.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Installs
Restart the runtime after all the installs are done.

In [None]:
# !pip install gdown

In [None]:
# !pip install jsonlines

In [None]:
# !pip install ninja
# !pip install --upgrade --force-reinstall fastai

In [None]:
# !pip install sentencepiece
# !pip install cleantext
# import nltk
# nltk.download('stopwords')

In [None]:
# !pip install tqdm --upgrade --force

# Data Cleaning, Loading, and Training Utils

In [None]:
import sys
sys.path.insert(0, "../")
from hinglishutils import get_files_from_gdrive
from pathlib import Path
# Root of the data tree, relative to the current working directory.
datapath = Path("..") / "data"
# Sub-directories of the data root.
data_raw, data_interim, data_processed, cleanlab_datapath = (
    datapath.joinpath(sub) for sub in ("raw", "interim", "processed", "cleanlab")
)

In [None]:
from fastai.text import *
from datetime import datetime
import sentencepiece as spm
from pathlib import Path
import cleantext
from tqdm import tqdm
import pandas as pd
import gdown

# Register tqdm's progress-bar variants of the pandas apply methods
# (`progress_apply` is used by the cleaning and prediction cells).
tqdm.pandas()
# Folder holding the SentencePiece model, the train/valid text files and the
# saved DataBunch pickles used throughout this notebook.
data_folder = Path("drive/My Drive/Hinglish/big")

In [None]:
# Module-level SentencePiece processor loaded from the Hinglish model file.
# NOTE(review): no other visible cell reads `sp` (SpTokenizer loads its own
# processor) — confirm whether this is still needed.
sp = spm.SentencePieceProcessor()
sp.Load(str(data_folder / "hinglish_sp.model"))


class SpTokenizer(BaseTokenizer):
    """SentencePiece-backed tokenizer, passed to fastai as a `tok_func`.

    The processor is loaded from `hinglish_sp.model` under `data_folder`, and
    `vocab` is built from the first `vocab_size` pieces of that model.
    """

    def __init__(self, lang: str, vocab_size: int = 8000):
        self.lang, self.vocab_size = lang, vocab_size
        model_file = data_folder / "hinglish_sp.model"
        self.sp = spm.SentencePieceProcessor()
        self.sp.Load(str(model_file))
        pieces = [
            self.sp.IdToPiece(int(piece_id)) for piece_id in range(self.vocab_size)
        ]
        self.vocab = Vocab(pieces)

    def tokenizer(self, t: str) -> List[str]:
        """Split the string `t` into SentencePiece pieces."""
        pieces = self.sp.EncodeAsPieces(t)
        return pieces

    def detokenizer(self, t: List[str]) -> str:
        """Join the SentencePiece pieces `t` back into a single string."""
        text = self.sp.DecodePieces(t)
        return text

In [None]:
def clean(df, col):
    """Clean Twitter text into a new "clean_text" column.

    Retweet markers, @mentions, #hashtags and URLs are stripped first; the
    result is then passed through `cleantext.clean(..., all=True)`.

    Arguments:
        df {[pandas dataframe]} -- Dataset that needs to be cleaned; it is
            modified in place ("clean_text" is added or overwritten)
        col {[string]} -- column in which text is present

    Returns:
        [pandas dataframe] -- The same dataframe with a "clean_text" column
    """
    # Order matters: the full "RT @user:" prefix is matched before the
    # mentions it contains are removed.
    noise_patterns = [
        re.compile(r"RT\s@\w+:"),  # retweet prefix, e.g. "RT @user:"
        re.compile(r"@\w+ ?"),  # @mentions
        # Stand-alone "RT" only: a bare r"RT" also cut the letters out of
        # upper-case words such as "SMART".
        re.compile(r"\bRT\b"),
        re.compile(r"#\w+ ?"),  # hashtags
        re.compile(r"http\S+"),  # URLs
    ]

    def strip_noise(text):
        for pattern in noise_patterns:
            text = pattern.sub("", text)
        return text

    # One pass over the column instead of one pass per pattern.
    df["clean_text"] = df[col].progress_apply(strip_noise)
    df["clean_text"] = df["clean_text"].progress_apply(
        lambda x: cleantext.clean(x, all=True)
    )
    return df


# Smoke test: run the cleaner on one made-up retweet and display the result.
toy = pd.DataFrame(["RT @meghana https://something hello"], columns=["text"])
clean(toy, "text")

In [None]:
def load_and_clean_files(interim_dir="drive/My Drive/Hinglish/interim"):
    """Load the interim JSON datasets and add a "clean_text" column to each.

    Arguments:
        interim_dir {[string or Path]} -- folder containing the interim JSON
            files (default: the Google Drive folder used by this notebook)

    Returns:
        [tuple of pandas dataframes] -- (train, test, valid, final_test,
            hinglish_unsup_high_confidence, hinglish_unsup_less_confidence)
    """
    interim_dir = Path(interim_dir)
    # (file stem, column holding the raw text), listed in return order.
    # The two unsupervised files keep their text in column 0.
    sources = [
        ("train", "text"),
        ("test", "text"),
        ("valid", "text"),
        ("final_test", "text"),
        ("hinglish_unsup_high_confidence", 0),
        ("hinglish_unsup_less_confidence", 0),
    ]
    return tuple(
        clean(pd.read_json(str(interim_dir / f"{stem}.json")), text_col)
        for stem, text_col in sources
    )

In [None]:
def load_unstructured_hinglish_from_txt():
    """Load the unlabelled Hinglish text files and clean them.

    Reads `big_unstruct.txt` from the working directory plus `train.txt` and
    `valid.txt` from `data_folder`; every line becomes one row.

    Returns:
        [pandas dataframe] -- "text" column with the raw lines and a
            "clean_text" column produced by `clean`
    """
    sources = ["big_unstruct.txt", data_folder / "train.txt", data_folder / "valid.txt"]
    data = []
    for source in sources:
        # `with` closes each handle; previously all three files stayed open.
        with open(source, "r") as f:
            data += f.readlines()
    df = clean(pd.DataFrame(data, columns=["text"]), "text")
    return df


def load_unstructured_hinglish_from_df(
    train,
    test,
    valid,
    final_test,
    hinglish_unsup_high_confidence,
    hinglish_unsup_less_confidence,
):
    """Build the language-model corpus as one flat list of texts.

    Combines the cleaned lines from `load_unstructured_hinglish_from_txt`
    with the text of every dataframe passed in.

    Arguments:
        train, test, valid, final_test {[pandas dataframe]} -- frames with a
            "clean_text" column
        hinglish_unsup_high_confidence, hinglish_unsup_less_confidence
            {[pandas dataframe]} -- unsupervised frames; "clean_text" is used
            when present, otherwise the raw text in column 0

    Returns:
        [list] -- all texts, in the order the sources are listed above
    """

    def cleaned_or_raw(frame):
        # Earlier revisions always appended raw column 0 here, even though the
        # frames had been cleaned; keep column 0 only as a fallback.
        column = "clean_text" if "clean_text" in frame.columns else 0
        return list(frame[column])

    data = list(load_unstructured_hinglish_from_txt()["clean_text"])
    for frame in (train, test, valid, final_test):
        data += list(frame["clean_text"])
    for frame in (hinglish_unsup_high_confidence, hinglish_unsup_less_confidence):
        data += cleaned_or_raw(frame)
    return data

In [None]:
def train(
    learn,
    model_name: str,
    lr=2e-03,
    loops=5,
    num_epocs_per_loop=2,
    encoder=False,
    test=False,
):
    """Warm up a learner, then train it in checkpointed rounds.

    Arguments:
        learn -- fastai learner to train
        model_name {[string]} -- prefix for every checkpoint name
        lr -- learning rate, passed to `fit_one_cycle` as `slice(lr)`
        loops {[int]} -- number of training rounds
        num_epocs_per_loop {[int]} -- epochs per round
        encoder {[bool]} -- also save the encoder after each round as
            "{model_name}_enc_{i}"
        test {[bool]} -- dry run: after the warm-up epoch save
            "{model_name}_test" and return without running any round

    Returns:
        None
    """
    # One warm-up epoch in the learner's current freeze state.
    learn.fit_one_cycle(1, slice(lr))
    learn.unfreeze()
    if test:
        learn.save(f"{model_name}_test")
        return
    for i in range(loops):
        learn.unfreeze()
        learn.fit_one_cycle(num_epocs_per_loop, slice(lr))
        learn.save(f"{model_name}_{i}")
        if encoder:
            learn.save_encoder(f"{model_name}_enc_{i}")
        print(f"saved {model_name}_{i}")
        learn.recorder.plot_losses()
        # plot_metrics() asserts when nothing was recorded, which is the case
        # for learners created without `metrics`; skip the plot then.
        if getattr(learn.recorder, "metrics", None):
            learn.recorder.plot_metrics()

# Cleaning and Creating databunch

In [None]:
# Fetch big_unstruct.txt from Google Drive (helper behaviour assumed from its
# name — the file is later opened from the working directory).
big_unstruct_id = "1TcUKflmq4nAV2-YopYz1VqyEG9xrVIc1"
get_files_from_gdrive(fname="big_unstruct.txt", file_id=big_unstruct_id)
# fastai tokenizer wrapper that instantiates SpTokenizer as its tok_func.
tokenizer = Tokenizer(tok_func=SpTokenizer)
# NOTE(review): this unpacking rebinds the name `train` from the training
# function defined earlier to a DataFrame. Re-run the cell that defines
# `train()` before any `train(learn=...)` call, or those calls will fail.
(
    train,
    test,
    valid,
    final_test,
    hinglish_unsup_high_confidence,
    hinglish_unsup_less_confidence,
) = load_and_clean_files()
# Flat list of every text available for language-model training.
data = load_unstructured_hinglish_from_df(
    train,
    test,
    valid,
    final_test,
    hinglish_unsup_high_confidence,
    hinglish_unsup_less_confidence,
)
len(data)

In [None]:
from sklearn.model_selection import train_test_split

df = pd.DataFrame(data, columns=["clean_text"])
# Fixed seed so the 90/10 language-model split is identical on every run.
train_lm, valid_lm = train_test_split(df, test_size=0.1, random_state=42)

In [None]:
# Language-model DataBunch built from the train/validation split of the corpus.
data_lm = TextLMDataBunch.from_df(
    data_folder,
    train_df=train_lm,
    valid_df=valid_lm,
    tokenizer=tokenizer,
    text_cols="clean_text",
)
print("done")
# Saved under the file name that the "Loading Data" cell reloads.
data_lm.save("clean_lm.pkl")
data_lm.show_batch()

In [None]:
# Language-model DataBunch over the classification texts, sharing data_lm's vocab.
# NOTE(review): `label_cols` is passed to an LM DataBunch and final_test is part
# of the training frame — confirm both are intended.
data_cls_lm = TextLMDataBunch.from_df(
    data_folder,
    train_df=pd.concat([train, final_test, test]),
    valid_df=valid,
    tokenizer=tokenizer,
    vocab=data_lm.vocab,
    text_cols="clean_text",
    label_cols="sentiment",
)
print("done")
# Saved under the file name that the "Loading Data" cell reloads.
data_cls_lm.save("clean_cls_lm.pkl")
data_cls_lm.show_batch()

In [None]:
# Classifier DataBunch: train + test frames for training, valid for validation,
# labelled by the "sentiment" column and sharing data_lm's vocab.
data_cls = TextClasDataBunch.from_df(
    data_folder,
    train_df=pd.concat([train, test]),
    valid_df=valid,
    tokenizer=tokenizer,
    vocab=data_lm.vocab,
    text_cols="clean_text",
    label_cols="sentiment",
)
print("done")
# Saved under the file name that the "Loading Data" cell reloads.
data_cls.save("clean_cls.pkl")
data_cls.show_batch()

# Loading Data

In [None]:
# Reload the three DataBunches from the pickle files written by the
# "Cleaning and Creating databunch" cells.
data_lm = load_data(path=data_folder, file="clean_lm.pkl")
data_cls_lm = load_data(path=data_folder, file="clean_cls_lm.pkl")
data_cls = load_data(path=data_folder, file="clean_cls.pkl")

# Pre-Training LM

In [None]:
# Hyper-parameters handed to the learner as `config` (QRNN variant enabled).
awd_lstm_lm_config = dict(
    emb_sz=400,
    n_hid=1150,
    n_layers=3,
    pad_token=1,
    qrnn=True,
    output_p=0.25,
    hidden_p=0.1,
    input_p=0.2,
    embed_p=0.02,
    weight_p=0.15,
    tie_weights=True,
    out_bias=True,
)
# Language-model learner on the full corpus; pretrained=False, so it is not
# initialised from pretrained weights.
learn = language_model_learner(
    data_lm,
    arch=AWD_LSTM,
    config=awd_lstm_lm_config,
    drop_mult=0.5,
    metrics=[accuracy, Perplexity()],
    pretrained=False,
)

In [None]:
# Run the learning-rate finder and plot its curve with a suggested rate.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# Pre-train the language model: 5 rounds of 2 epochs, checkpointed as
# language_model_0 .. language_model_4.
train(
    learn=learn,
    model_name="language_model",
    lr=2e-03,
    loops=5,
    num_epocs_per_loop=2,
    encoder=False,
    test=False,
)

# Training LM Encoder with Classification Data

In [None]:
# Same hyper-parameters as the pre-training configuration, redefined here.
awd_lstm_lm_config = dict(
    emb_sz=400,
    n_hid=1150,
    n_layers=3,
    pad_token=1,
    qrnn=True,
    output_p=0.25,
    hidden_p=0.1,
    input_p=0.2,
    embed_p=0.02,
    weight_p=0.15,
    tie_weights=True,
    out_bias=True,
)
# Language-model learner on the classification texts; note that no `metrics`
# and no `drop_mult` are passed here.
learn = language_model_learner(
    data_cls_lm, arch=AWD_LSTM, config=awd_lstm_lm_config, pretrained=False
)

In [None]:
# Start from the checkpoint of the last pre-training round (language_model_4).
i = 4
learn.load(f"language_model_{i}")

In [None]:
# Run the learning-rate finder and plot its curve with a suggested rate.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# Fine-tune the language model on the classification texts: 10 rounds of 10
# epochs; encoder=True also saves language_model_class_enc_{i} each round.
train(
    learn=learn,
    model_name="language_model_class",
    lr=2e-03,
    loops=10,
    num_epocs_per_loop=10,
    encoder=True,
    test=False,
)

# Training Classifier

In [None]:
# Classifier hyper-parameters: same layer sizes as the language-model config,
# with higher dropout values.
awd_lstm_clas_config = dict(
    emb_sz=400,
    n_hid=1150,
    n_layers=3,
    pad_token=1,
    qrnn=True,
    output_p=0.4,
    hidden_p=0.2,
    input_p=0.6,
    embed_p=0.1,
    weight_p=0.5,
)
# Text classifier learner, converted to half precision via to_fp16().
learn = text_classifier_learner(
    data_cls,
    AWD_LSTM,
    config=awd_lstm_clas_config,
    drop_mult=0.5,
    metrics=[accuracy],
    pretrained=False,
).to_fp16()

In [None]:
# Load the encoder saved by the last fine-tuning round
# (language_model_class_enc_9), then look for a learning rate.
i = 9
learn.load_encoder(f"language_model_class_enc_{i}")
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# Train the classifier: 10 rounds of 10 epochs, checkpointed as
# class_model_0 .. class_model_9.
train(
    learn=learn,
    model_name="class_model",
    lr=2e-03,
    loops=10,
    num_epocs_per_loop=10,
    encoder=False,
    test=False,
)

# Predict

In [None]:
# Rebuild everything prediction needs: the saved classifier DataBunch, the
# cleaned final test set, and a classifier learner with the training config.
data_cls = load_data(path=data_folder, file="clean_cls.pkl")
final_test = pd.read_json("drive/My Drive/Hinglish/interim/final_test.json")
final_test = clean(final_test, "text")
awd_lstm_clas_config = dict(
    emb_sz=400,
    n_hid=1150,
    n_layers=3,
    pad_token=1,
    qrnn=True,
    output_p=0.4,
    hidden_p=0.2,
    input_p=0.6,
    embed_p=0.1,
    weight_p=0.5,
)
learn = text_classifier_learner(
    data_cls,
    AWD_LSTM,
    config=awd_lstm_clas_config,
    drop_mult=0.5,
    metrics=[accuracy],
    pretrained=False,
).to_fp16()

In [None]:
# Load a trained classifier checkpoint and predict one label per cleaned text.
# NOTE(review): the training cell in this notebook saves checkpoints named
# `class_model_{i}`; `test_class_model_{i}` is not written by any visible
# cell — confirm this checkpoint exists.
i = 9
learn.load(f"test_class_model_{i}")
# learn.predict(x)[0] is the first element of the prediction result, stored as a string.
final_test["predicted"] = final_test["clean_text"].progress_apply(
    lambda x: str(learn.predict(x)[0])
)
final_test.head()

In [None]:
# Dump the full prediction frame (all columns) next to the models.
final_test.to_csv(data_folder / "answer.csv")

In [None]:
# Write the submission file: a header line, then one "uid,label" row per sample.
with open(data_folder / "answer.txt", "w") as f:
    f.write("Uid, Sentiment\n")
    # Pair the two columns positionally; `.loc[i]` lookups over range(len(...))
    # only work when the index happens to be exactly 0..n-1.
    for uid, predicted in zip(final_test["uid"], final_test["predicted"]):
        f.write(f"{uid},{predicted}\n")

In [None]:
from google.colab import files

# Colab-only: send the submission file to the browser as a download.
files.download(data_folder / "answer.txt")