In [None]:
# Report the GPU model, driver/CUDA version and current memory use of this runtime.
!nvidia-smi

Sat Jul  8 10:35:18 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   47C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!pip install simpletransformers==0.60.9

Collecting simpletransformers==0.60.9
  Downloading simpletransformers-0.60.9-py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.7/206.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.2.0 (from simpletransformers==0.60.9)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m123.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers==0.60.9)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting tensorboardx (from simpletransformers==0.60.9)
  Downloading tensorboardX-2.6.1-py2.py3-none-any.whl (101 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.6/101.6 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting to

In [None]:
import warnings
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [None]:
# The "transformers" logger only reports ERROR and above; the root logger is
# configured at INFO.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.INFO)

### Data Loading Function

In [None]:
def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Read a tab-separated file and keep only the rows carrying a given label.

    Args:
        file_path: Path of the tab-separated file to read.
        input_text_column: Column that is renamed to ``input_text``.
        target_text_column: Column that is renamed to ``target_text``.
        label_column: Column compared against ``keep_label``.
        keep_label: Rows whose label equals this value are kept (default 1).

    Returns:
        A DataFrame with the columns ``input_text``, ``target_text`` and
        ``prefix`` (always ``"paraphrase"``). The index of the kept rows is
        preserved from the file.
    """
    # `error_bad_lines=False` is deprecated since pandas 1.3 and removed in 2.0.
    # on_bad_lines="warn" keeps the old behaviour: malformed rows are dropped
    # and reported instead of aborting the read.
    df = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")
    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    # .assign returns a new frame, so no column is written into a slice of
    # the filtered data.
    return df[["input_text", "target_text"]].assign(prefix="paraphrase")

### Data Cleaning Operations

In [None]:
def clean_unnecessary_spaces(out_string):
    """Remove the space that precedes punctuation and English contractions.

    Non-string input triggers a warning and is converted with ``str()``
    before cleaning.

    Args:
        out_string: Text to clean (any object; non-strings are stringified).

    Returns:
        The cleaned string.
    """
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)

    # Applied one after another in exactly this order.
    spaced_to_tight = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    cleaned = out_string
    for spaced, tight in spaced_to_tight:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned

In [None]:
# Colab-only: mount Google Drive at /content/drive so the dataset folder used by
# the following cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# One-time setup: uncomment on the first run to extract the dataset archive.
#!unzip /content/drive/MyDrive/nlp_para/RephraseitDev.zip

In [None]:
# Work from the dataset folder so the relative file names used by later cells resolve.
%cd /content/drive/MyDrive/nlp_para/RephraseitDev

/content/drive/MyDrive/nlp_para/RephraseitDev


### Google PAWS Dataset

In [None]:
# Google Data
def _read_paws_paraphrases(tsv_name):
    """Read one PAWS split and return only the rows labelled "1".

    Every column is cast to ``str`` on load, which is why the label is
    compared against the string "1". The result has the columns
    ``input_text``, ``target_text`` and ``prefix``.
    """
    frame = pd.read_csv(tsv_name, sep="\t").astype(str)
    frame = frame.loc[frame["label"] == "1"]
    frame = frame.rename(
        columns={"sentence1": "input_text", "sentence2": "target_text"}
    )
    frame = frame[["input_text", "target_text"]]
    frame["prefix"] = "paraphrase"
    return frame


train_df = _read_paws_paraphrases("train.tsv")
eval_df = _read_paws_paraphrases("dev.tsv")

print(train_df)
print("-------------------------------------------------------------")
print(eval_df)

                                              input_text  \
1      The NBA season of 1975 -- 76 was the 30th seas...   
3      When comparable rates of flow can be maintaine...   
4      It is the seat of Zerendi District in Akmola R...   
5      William Henry Henry Harman was born on 17 Febr...   
7      With a discrete amount of probabilities Formul...   
...                                                  ...   
49384  The Romanesque language , Galician ( Galego ) ...   
49390  Note that k is a vector consisting of three in...   
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...   
49395  He was considered an active member of the coun...   
49397  She was in Cork on June 24 and arrived on 8 Ju...   

                                             target_text      prefix  
1      The 1975 -- 76 season of the National Basketba...  paraphrase  
3      The results are high when comparable flow rate...  paraphrase  
4      It is the seat of the district of Zerendi in A...  paraphra

### MSRP Data Loading

In [None]:
# MSRP Data
# NOTE: this cell appends to train_df / eval_df, so running it twice duplicates
# the MSRP rows.
msrp_columns = ("#1 String", "#2 String", "Quality")

msrp_train = load_data("msr_paraphrase_train.txt", *msrp_columns)
train_df = pd.concat([train_df, msrp_train])

msrp_eval = load_data("msr_paraphrase_test.txt", *msrp_columns)
eval_df = pd.concat([eval_df, msrp_eval])




  df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)
Skipping line 102: expected 5 fields, saw 6
Skipping line 656: expected 5 fields, saw 6
Skipping line 867: expected 5 fields, saw 6
Skipping line 880: expected 5 fields, saw 6
Skipping line 980: expected 5 fields, saw 6
Skipping line 1439: expected 5 fields, saw 6
Skipping line 1473: expected 5 fields, saw 6
Skipping line 1822: expected 5 fields, saw 6
Skipping line 1952: expected 5 fields, saw 6
Skipping line 2009: expected 5 fields, saw 6
Skipping line 2230: expected 5 fields, saw 6
Skipping line 2506: expected 5 fields, saw 6
Skipping line 2523: expected 5 fields, saw 6
Skipping line 2809: expected 5 fields, saw 6
Skipping line 2887: expected 5 fields, saw 6
Skipping line 2920: expected 5 fields, saw 6
Skipping line 2944: expected 5 fields, saw 6
Skipping line 3241: expected 5 fields, saw 6
Skipping line 3358: expected 5 fields, saw 6
Skipping line 3459: expected 5 fields, saw 6
Skipping line 3491: expected 5 fields, s

In [None]:
# Print both frames, separated by a dashed rule.
print(
    train_df,
    "-------------------------------------------------------------",
    eval_df,
    sep="\n",
)

                                             input_text  \
1     The NBA season of 1975 -- 76 was the 30th seas...   
3     When comparable rates of flow can be maintaine...   
4     It is the seat of Zerendi District in Akmola R...   
5     William Henry Henry Harman was born on 17 Febr...   
7     With a discrete amount of probabilities Formul...   
...                                                 ...   
3931  Knox County Health Department is following nat...   
3932  The new rules will allow a single company to o...   
3933  At this point, Mr. Brando announced: 'Somebody...   
3935  We have concluded that the outlook for price s...   
3936  The notification was first reported Friday by ...   

                                            target_text      prefix  
1     The 1975 -- 76 season of the National Basketba...  paraphrase  
3     The results are high when comparable flow rate...  paraphrase  
4     It is the seat of the district of Zerendi in A...  paraphrase  
5     Willi

### Quora Dataset

In [None]:
# Quora Data

# The Quora dataset is not separated into train/test, so we split it here.
df = load_data(
    "quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
)
# Fixed seed: without it every kernel restart produces a different train/test
# split, so results are not comparable between runs.
q_train, q_test = train_test_split(df, random_state=42)



  df = pd.read_csv(file_path, sep="\t", error_bad_lines=False)


In [None]:
# Display the Quora training split.
q_train

Unnamed: 0,input_text,target_text,prefix
27865,Why has flipkart acquired Jabong although it a...,Why did Flipkart acquired Jabong though they h...,paraphrase
281514,Is there any karma rule is applied in real life?,What is the best example of Karma in your life?,paraphrase
80099,What is the best time for doing meditation?,Which is the best time for meditation ?,paraphrase
238378,What is the best way make extra money?,What are some interesting ways to make money?,paraphrase
43770,How can you get your Quora question or answer ...,How do you get your Quora question to go viral?,paraphrase
...,...,...,...
19102,How should I prepare for the GATE 2018 for CSE?,How do I prepare for gate 2018 (CSE)?,paraphrase
183135,What is the best way to take a screenshot on a...,How do I take a screenshot on a samsung galaxy...,paraphrase
193331,How can I enhance my English writing skills?,How can I improve my English vocabulary and wr...,paraphrase
114293,How to start preparing for UPSC 2018?,How and when should I start my preparations fo...,paraphrase


### ParaBank Dataset

In [None]:
# ParaBank data
# The file has no header row, so the columns are labelled 0 and 1.
# `error_bad_lines=False` is deprecated since pandas 1.3 and removed in 2.0;
# on_bad_lines="warn" keeps the same behaviour (rows with an unexpected number
# of fields are dropped and reported). Use "skip" to drop them silently.
paradata = pd.read_csv("parabank_5m.tsv", sep="\t", header=None, on_bad_lines="warn")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Skipping line 1387401: expected 2 fields, saw 3
Skipping line 1387414: expected 2 fields, saw 3
Skipping line 1388204: expected 2 fields, saw 3
Skipping line 1388693: expected 2 fields, saw 3
Skipping line 1389233: expected 2 fields, saw 3
Skipping line 1389248: expected 2 fields, saw 4
Skipping line 1389441: expected 2 fields, saw 3
Skipping line 1389636: expected 2 fields, saw 3
Skipping line 1390851: expected 2 fields, saw 3
Skipping line 1391404: expected 2 fields, saw 3
Skipping line 1392552: expected 2 fields, saw 3
Skipping line 1392865: expected 2 fields, saw 3
Skipping line 1393098: expected 2 fields, saw 3
Skipping line 1393413: expected 2 fields, saw 3
Skipping line 1394616: expected 2 fields, saw 3
Skipping line 1394964: expected 2 fields, saw 4
Skipping line 1395353: expected 2 fields, saw 3
Skipping line 1398921: expected 2 fields, saw 3
Skipping line 1399615: expected 2 fields, saw 3
Skipping line 1401546: 

In [None]:
# Name the two positional columns and add the constant "paraphrase" prefix.
# rename/assign return a new frame instead of mutating in place; the resulting
# column order is input_text, target_text, prefix.
paradata = paradata.rename(columns={0: "input_text", 1: "target_text"}).assign(
    prefix="paraphrase"
)
# Fixed seed so the ParaBank train/test split is the same on every run.
para_train, para_test = train_test_split(paradata, random_state=42)

### Data Transformation & Cleaning

In [None]:
def _merge_and_clean(frames):
    """Stack the given frames, drop rows with missing values and tidy both text columns.

    The result has the columns ``prefix``, ``input_text`` and ``target_text``,
    in that order.
    """
    merged = pd.concat(frames)
    merged = merged[["prefix", "input_text", "target_text"]]
    merged = merged.dropna()
    for text_column in ("input_text", "target_text"):
        merged[text_column] = merged[text_column].apply(clean_unnecessary_spaces)
    return merged


# NOTE: train_df / eval_df are rebuilt from themselves, so running this cell
# twice appends the Quora and ParaBank rows again.
train_df = _merge_and_clean([train_df, q_train, para_train])
eval_df = _merge_and_clean([eval_df, q_test, para_test])

In [None]:
# Display the merged and cleaned evaluation frame.
eval_df

Unnamed: 0,prefix,input_text,target_text
1,paraphrase,They were there to enjoy us and they were ther...,They were there for us to enjoy and they were ...
2,paraphrase,"After the end of the war in June 1902, Higgins...","In August, after the end of the war in June 19..."
3,paraphrase,From the merger of the Four Rivers Council and...,Shawnee Trails Council was formed from the mer...
4,paraphrase,The group toured extensively and became famous...,The group toured extensively and was famous in...
5,paraphrase,Kathy and her husband Pete Beale ( Peter Dean ...,Kathy and her husband Peter Dean ( Pete Beale ...
...,...,...,...
2437080,paraphrase,"Once you find something good, Max, you have to...","If you find something good, Max, you should ta..."
4369734,paraphrase,Can I come in?,May I come in?
3198172,paraphrase,"Here, you take these to the garage.","Here, take this to the garage."
4615396,paraphrase,What are the rules relating to reorganisation ...,What are the rules for reorganisation proceedi...


# Dataset Ready: Let's Start Training

### Simple Transformers

In [None]:
model_args = Seq2SeqArgs()

# --- Training ---
model_args.num_train_epochs = 10
model_args.train_batch_size = 8
model_args.learning_rate = 5e-5
model_args.fp16 = False
model_args.max_seq_length = 128
model_args.use_multiprocessing = False

# --- Evaluation during training ---
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True
model_args.eval_batch_size = 16

# --- Text generation (sampling enabled, no beam count set) ---
model_args.do_sample = True
model_args.num_beams = None
model_args.top_k = 50
model_args.top_p = 0.95
model_args.num_return_sequences = 3
model_args.max_length = 128

# --- Output and checkpoints ---
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1

# --- Experiment tracking ---
model_args.wandb_project = "Paraphrasing with BART in FSDS BOOTCAMP Class"


# Create a BART encoder-decoder model from the "facebook/bart-large" checkpoint,
# configured with model_args.
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

# Train on train_df; eval_df is supplied as the evaluation data.
model.train_model(train_df, eval_data=eval_df)

# Build the prediction inputs for the evaluation set as "<prefix>: <input_text>".
eval_inputs = eval_df["input_text"].tolist()
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_inputs)
]
truth = eval_df["target_text"].tolist()

preds = model.predict(to_predict)

# Saving the predictions if needed
os.makedirs("predictions", exist_ok=True)

# str(datetime.now()) contains spaces and colons, which are awkward in shells and
# invalid in file names on some file systems; use a compact, sortable stamp instead.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

# Explicit UTF-8 so non-ASCII text is written the same way regardless of locale.
with open(f"predictions/predictions_{timestamp}.txt", "w", encoding="utf-8") as f:
    # One record per evaluation row: input text, reference, then every
    # generated candidate for that row.
    for text, target, candidates in zip(eval_inputs, truth, preds):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(str(target) + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

  0%|          | 0/3641250 [00:00<?, ?it/s]



Epoch:   0%|          | 0/10 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Running Epoch 0 of 10:   0%|          | 0/455157 [00:00<?, ?it/s]

  0%|          | 0/1210318 [00:00<?, ?it/s]

--- Logging error ---
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router_sock.py", line 27, in _read_message
    resp = self._sock_client.read_server_response(timeout=1)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/sock_client.py", line 285, in read_server_response
    data = self._read_packet_bytes(timeout=timeout)
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/lib/sock_client.py", line 269, in _read_packet_bytes
    raise SockClientClosedError
wandb.sdk.lib.sock_client.SockClientClosedError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router.py", line 70, in message_loop
    msg = self._read_message()
  File "/usr/local/lib/python3.10/dist-packages/wandb/sdk/interface/router_sock.py", line 29, in _read_message
    raise MessageRouterClosedError
wandb.sdk.interface.router.M

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/IPython/core/interactiveshell.py", line 3553, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-18-b7be7fe033b4>", line 31, in <cell line: 31>
    model.train_model(train_df, eval_data=eval_df)
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/seq2seq/seq2seq_model.py", line 310, in train_model
    global_step, training_details = self.train(
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/seq2seq/seq2seq_model.py", line 625, in train
    results = self.eval_model(
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/seq2seq/seq2seq_model.py", line 829, in eval_model
    result = self.evaluate(eval_dataset, output_dir, verbose=verbose, silent=silent, **kwargs)
  File "/usr/local/lib/python3.10/dist-packages/simpletransformers/seq2seq/seq2seq_model.py", line 895, in evaluate
    with open(output_eval_file, "w") as writ

In [None]:
# Report the GPU model, driver/CUDA version and current memory use of this runtime.
!nvidia-smi

Sun Jul 17 11:02:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 300W |      2MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
#!conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.1 -c pytorch -y