<a href="https://colab.research.google.com/github/PSingla-ds/NLP-Projects/blob/main/parapharasing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install simpletransformers at a pinned version (0.60.9) so every run of the
# notebook uses the same library release.
!pip install simpletransformers==0.60.9



In [None]:
import warnings
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [None]:
# Root logger at INFO, so messages such as the simpletransformers progress
# lines are shown in the cell output.
logging.basicConfig(level=logging.INFO)
# The "transformers" logger is restricted to ERROR and above.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)

### Data Loading Function

In [None]:
def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated paraphrase corpus and keep the rows with one label.

    Args:
        file_path: Path of the TSV file to read.
        input_text_column: Name of the column that becomes ``input_text``.
        target_text_column: Name of the column that becomes ``target_text``.
        label_column: Name of the column holding the label to filter on.
        keep_label: Rows are kept only where ``label_column == keep_label``.

    Returns:
        A new DataFrame with the columns ``input_text``, ``target_text`` and
        ``prefix`` (always the string ``"paraphrase"``), in that order.
        Malformed lines (too many fields) are skipped with a warning.
    """
    try:
        # pandas >= 1.3: "warn" skips malformed lines and reports them, which
        # is what error_bad_lines=False did. The old keyword was removed in
        # pandas 2.0 and raises TypeError there.
        df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    except TypeError:
        # pandas < 1.3 does not know on_bad_lines; use the legacy keyword.
        df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)

    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # assign() returns a fresh frame, so the prefix column is never written
    # into a slice of the filtered data (no SettingWithCopyWarning).
    return df[["input_text", "target_text"]].assign(prefix="paraphrase")

### Data Cleaning Operations

In [None]:
def clean_unnecessary_spaces(out_string):
    """Remove the spaces a tokenizer leaves before punctuation and clitics.

    Non-string input triggers a warning and is converted with ``str()``
    before cleaning. Returns the cleaned string.
    """
    if isinstance(out_string, str):
        cleaned = out_string
    else:
        warnings.warn(f">>> {out_string} <<< is not a string.")
        cleaned = str(out_string)

    # Applied one after another, in this order.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    for spaced, tight in substitutions:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned

In [None]:
# Colab-only: mount Google Drive at /content/drive. The dataset paths used by
# the later cells all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Show the current working directory.
pwd

'/content'

In [None]:
#!unzip /content/drive/MyDrive/nlp_para/RephraseitDev.zip

In [None]:
# Switch to the project folder on Drive; the relative "predictions" directory
# created by the training cell is resolved from here.
%cd /content/drive/MyDrive/paraphrase_project/

/content/drive/MyDrive/paraphrase_project


### Google PAWS Dataset

In [None]:
# Google PAWS data
def _load_paws_split(tsv_path):
    """Read one PAWS TSV split and return its paraphrase pairs.

    Every column is cast to ``str`` (so the label is compared with ``"1"``),
    only rows labelled ``"1"`` are kept, and ``sentence1`` / ``sentence2`` are
    renamed to ``input_text`` / ``target_text``.

    Returns:
        A new DataFrame with columns ``input_text``, ``target_text`` and a
        constant ``prefix`` column set to ``"paraphrase"``.
    """
    pairs = pd.read_csv(tsv_path, sep="\t").astype(str)
    pairs = pairs.loc[pairs["label"] == "1"]
    pairs = pairs.rename(
        columns={"sentence1": "input_text", "sentence2": "target_text"}
    )
    # assign() builds a new frame; setting the column with pairs["prefix"] = ...
    # on the filtered slice is what raised SettingWithCopyWarning.
    return pairs[["input_text", "target_text"]].assign(prefix="paraphrase")


train_df = _load_paws_split("/content/drive/MyDrive/paraphrase_project/RephraseitDev/train.tsv")
eval_df = _load_paws_split("/content/drive/MyDrive/paraphrase_project/RephraseitDev/dev.tsv")

print(train_df)
print("-------------------------------------------------------------")
print(eval_df)

                                              input_text  ...      prefix
1      The NBA season of 1975 -- 76 was the 30th seas...  ...  paraphrase
3      When comparable rates of flow can be maintaine...  ...  paraphrase
4      It is the seat of Zerendi District in Akmola R...  ...  paraphrase
5      William Henry Henry Harman was born on 17 Febr...  ...  paraphrase
7      With a discrete amount of probabilities Formul...  ...  paraphrase
...                                                  ...  ...         ...
49384  The Romanesque language , Galician ( Galego ) ...  ...  paraphrase
49390  Note that k is a vector consisting of three in...  ...  paraphrase
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...  ...  paraphrase
49395  He was considered an active member of the coun...  ...  paraphrase
49397  She was in Cork on June 24 and arrived on 8 Ju...  ...  paraphrase

[21829 rows x 3 columns]
-------------------------------------------------------------
                        

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### MSRP Data Loading

In [None]:
# MSRP Data
# Append the MSRP pairs kept by load_data (rows where "Quality" equals its
# default keep_label of 1) to the train and eval frames built so far.
# Re-running this cell appends the same rows again.
msrp_columns = ("#1 String", "#2 String", "Quality")

msrp_train = load_data(
    "/content/drive/MyDrive/paraphrase_project/RephraseitDev/msr_paraphrase_train.txt",
    *msrp_columns,
)
train_df = pd.concat([train_df, msrp_train])

msrp_eval = load_data(
    "/content/drive/MyDrive/paraphrase_project/RephraseitDev/msr_paraphrase_test.txt",
    *msrp_columns,
)
eval_df = pd.concat([eval_df, msrp_eval])


b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

In [None]:
# Inspect the train and eval frames after the MSRP rows were appended.
print(train_df)
print("-------------------------------------------------------------")
print(eval_df)

                                             input_text  ...      prefix
1     The NBA season of 1975 -- 76 was the 30th seas...  ...  paraphrase
3     When comparable rates of flow can be maintaine...  ...  paraphrase
4     It is the seat of Zerendi District in Akmola R...  ...  paraphrase
5     William Henry Henry Harman was born on 17 Febr...  ...  paraphrase
7     With a discrete amount of probabilities Formul...  ...  paraphrase
...                                                 ...  ...         ...
3931  Knox County Health Department is following nat...  ...  paraphrase
3932  The new rules will allow a single company to o...  ...  paraphrase
3933  At this point, Mr. Brando announced: 'Somebody...  ...  paraphrase
3935  We have concluded that the outlook for price s...  ...  paraphrase
3936  The notification was first reported Friday by ...  ...  paraphrase

[24490 rows x 3 columns]
-------------------------------------------------------------
                                    

### Quora Dataset

In [None]:
# Quora Data

# The Quora dataset is not separated into train/test, so we split it here.
df = load_data(
    "/content/drive/MyDrive/paraphrase_project/RephraseitDev/quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
)
# A fixed random_state makes the split reproducible. Without it every run
# shuffles differently, so a pair used for evaluation in one run can be a
# training pair in the next and results are not comparable across runs.
q_train, q_test = train_test_split(df, random_state=42)

### ParaBank Dataset

In [None]:
#Parabank Data
# paradata = pd.read_csv("/content/drive/MyDrive/paraphrase_project/RephraseitDev/parabank_5m.tsv", sep='\t', header=None, error_bad_lines=False)

In [None]:
# ParaBank is optional: the cell that reads parabank_5m.tsv is commented out,
# so `paradata` normally does not exist and this cell used to abort the run
# with a NameError. Build the split only when the frame has been loaded.
if "paradata" in globals():
    # The file is read with header=None, so the columns are named 0 and 1;
    # presumably they hold the sentence pair (confirm against the file).
    paradata = paradata.rename(columns={0: "input_text", 1: "target_text"})
    paradata["prefix"] = "paraphrase"
    para_train, para_test = train_test_split(paradata)
else:
    print("ParaBank data not loaded; skipping the ParaBank train/test split.")

NameError: ignored

### Data Transformation & Cleaning

In [None]:
# Add the Quora split to the frames accumulated from the earlier datasets.
train_df = pd.concat([train_df, q_train])
eval_df = pd.concat([eval_df, q_test])

# Fix the column order, then drop every row that has a missing value.
column_order = ["prefix", "input_text", "target_text"]
train_df = train_df[column_order].dropna()
eval_df = eval_df[column_order].dropna()

# Tidy the spacing in both text columns, training frame first.
for text_column in ("input_text", "target_text"):
    train_df[text_column] = train_df[text_column].apply(clean_unnecessary_spaces)

for text_column in ("input_text", "target_text"):
    eval_df[text_column] = eval_df[text_column].apply(clean_unnecessary_spaces)

# My dataset is ready... let's go for training

### Simple Transformers

In [None]:
# Configuration object for the simpletransformers seq2seq model; each
# attribute below overrides the library default.
# NOTE(review): the meanings given in these comments are inferred from the
# attribute names — confirm against the simpletransformers documentation.
model_args = Seq2SeqArgs()
# Generation: sampling is switched on here; top_k / top_p further down
# presumably bound the sampling pool.
model_args.do_sample = True
model_args.eval_batch_size = 16
# Evaluation during training: the log of this cell shows an eval_loss being
# reported and a model being saved into outputs/best_model.
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = None
# The prediction-writing code later in this cell iterates over several
# predictions per input, matching the 3 sequences requested here.
model_args.num_return_sequences = 3
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 4
model_args.use_multiprocessing = False
# Weights & Biases project name; the run asks for a wandb API key (see the
# wandb lines in the cell output).
model_args.wandb_project = "Paraphrasing with BART"


# BART encoder-decoder initialised from the "facebook/bart-large" name.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

# Train on train_df; eval_df is supplied as the evaluation data.
model.train_model(train_df, eval_data=eval_df)

# Build one "<prefix>: <input text>" string per evaluation row.
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_df["input_text"].tolist())
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

# str(datetime.now()) contains a space and colons, which are awkward in shell
# commands and invalid in file names on some file systems; use a compact,
# file-system-safe timestamp instead.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Explicit UTF-8 so non-ASCII sentences are written the same way regardless
# of the platform's default encoding.
with open(f"predictions/predictions_{timestamp}.txt", "w", encoding="utf-8") as f:
    # Walk inputs, references and predictions in lockstep; each entry of
    # `preds` is iterated as the group of predictions for one input.
    for text, target, candidates in zip(eval_df["input_text"].tolist(), truth, preds):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        # str() keeps this consistent with the other writes and avoids a
        # TypeError should a target ever be a non-string value.
        f.write(str(target) + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/136422 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model: Training started


Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 10:   0%|          | 0/34106 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_utils: Creating features from dataset file at cache_dir/


  0%|          | 0/41937 [00:00<?, ?it/s]

INFO:simpletransformers.seq2seq.seq2seq_model:{'eval_loss': 1.4378500619205603}
INFO:simpletransformers.seq2seq.seq2seq_model:Saving model into outputs/best_model


In [None]:
#!nvidia-smi

In [None]:
#!conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.1 -c pytorch -y

In [None]:
# NOTE(review): this line only references the attribute `model.save`; nothing
# is called, so this cell by itself does not write a model to disk. The
# training log shows the library saving into outputs/best_model during
# evaluation. TODO: confirm whether an explicit save call was intended here.
model.save