# Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from tqdm import tqdm

In [3]:
df=pd.read_csv('../Dataset/Flight_Timings.csv')

In [4]:
df.head()

Unnamed: 0,From,To,Departure Time,Arrival Time
0,Rwanda,Switzerland,02:51 AM,8:13 AM
1,United Kingdom,United States Minor Outlying Islands,09:58 AM,3:39 PM
2,United Arab Emirates,Bangladesh,06:22 PM,11:55 PM
3,Mexico,Australia,09:06 AM,12:02 PM
4,Hungary,Angola,08:55 PM,12:20 AM


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 249 entries, 0 to 248
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   From            249 non-null    object
 1   To              249 non-null    object
 2   Departure Time  249 non-null    object
 3   Arrival Time    249 non-null    object
dtypes: object(4)
memory usage: 7.9+ KB


In [6]:
text = []
for r in df.iterrows():
    row = r[1]
    active_sentence = "The Bus starts from " + row['From'] + " at " + row['Departure Time'] + " and goes to " + row['To'] + " reaching at " + row['Arrival Time']
    passive_sentence = "The bus arrives at " + row['To'] + " by " + row['Arrival Time'] + " after departing from " + row['From'] + " at " + row["Arrival Time"]
    text.append(active_sentence)
    text.append(passive_sentence)

In [7]:
print("Active Voice:",text[0])
print("Passive voice:", text[1])

Active Voice: The Bus starts from Rwanda at 02:51 AM and goes to Switzerland reaching at 8:13 AM
Passive voice: The bus arrives at Switzerland by 8:13 AM after departing from Rwanda at 8:13 AM


In [8]:
len(text[0])

82

In [9]:
len(text)

498

In [10]:
text.remove("The Bus starts from French Guiana at 02:04 PM and goes to Yemen reaching at 4:15 PM")

In [11]:
len(text)

497

In [12]:
text.remove("The bus arrives at Yemen by 4:15 PM after departing from French Guiana at 4:15 PM")
len(text)

496

In [13]:
text.index("The Bus starts from Palau at 04:15 PM and goes to Sweden reaching at 4:45 PM")

450

In [14]:
text.remove("The Bus starts from Palau at 04:15 PM and goes to Sweden reaching at 4:45 PM")
text.remove("The bus arrives at Sweden by 4:45 PM after departing from Palau at 4:45 PM")
len(text)

494

### Paraphrasing the data

In [15]:
import torch
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [16]:
model_name = 'tuner007/pegasus_paraphrase'
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [17]:
torch_device

'cuda'

In [18]:
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

Old Apporach

In [13]:
def get_response(input_text, num_return_sequences):
    input_len = len(input_text)
    batch = tokenizer.prepare_seq2seq_batch([input_text],truncation=True,padding='longest',max_length=100, return_tensors="pt").to(torch_device)
    translated = model.generate(**batch, max_length=100, num_beams=10, num_return_sequences=num_return_sequences, temperature=1)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    return tgt_text

New approach

In [19]:
def get_response(input_text, num_return_sequences):
    input_len = len(input_text)
    batch = tokenizer([input_text], truncation=True, padding='longest', max_length=100, return_tensors="pt").to(torch_device)
    translated = model.generate(**batch, max_length=100, num_beams=10, num_return_sequences=num_return_sequences, temperature=1)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    return tgt_text

In [14]:
batch = tokenizer([text[0]], truncation=True, padding='longest', max_length=100, return_tensors="pt").to(torch_device)

In [15]:
batch

{'input_ids': tensor([[  139,  7588,  2171,   135, 23321,   134, 73057,   740,  3887,   111,
          1168,   112,  7317,  4379,   134,   110, 88831,  3887,     1]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

In [16]:
translated = model.generate(**batch, max_length=100, num_beams=5, num_return_sequences=3)
translated

tensor([[    0,   139,  2238,  1168,   135, 23321,   112,  7317,   134,   110,
         88831,  3887,   107,     1,     0,     0,     0,     0,     0,     0],
        [    0,   139,  2238,  2163, 23321,   134, 73057,   740,  3887,   111,
          1168,   112,  7317,   134,   110, 88831,  3887,   107,     1,     0],
        [    0,   139,  2238,  2163,   135, 23321,   134, 73057,   740,  3887,
           111,  1168,   112,  7317,   134,   110, 88831,  3887,   107,     1]],
       device='cuda:0')

In [17]:
tokenizer.batch_decode(translated, skip_special_tokens=True)

['The bus goes from Rwanda to Switzerland at 8:13 AM.',
 'The bus leaves Rwanda at 02:51 AM and goes to Switzerland at 8:13 AM.',
 'The bus leaves from Rwanda at 02:51 AM and goes to Switzerland at 8:13 AM.']

In [20]:
text[0]

'The Bus starts from Rwanda at 02:51 AM and goes to Switzerland reaching at 8:13 AM'

In [21]:
get_response(text[0], 3)

['The bus goes from Rwanda to Switzerland at 8:13 AM.',
 'The bus leaves Rwanda at 02:51 AM and goes to Switzerland at 8:13 AM.',
 'The bus leaves from Rwanda at 02:51 AM and goes to Switzerland at 8:13 AM.']

In [18]:
len(text[0])

82

In [19]:
82 - 25

57

In [20]:
len('The bus goes from Rwanda to Switzerland at 8:13 AM.')

51

In [22]:
paraphrased_text = []
for sentence in tqdm(text[450:]):
    try:
        paraphrased_sentences = get_response(sentence, 3)
        for p_sent in paraphrased_sentences:
            if(len(p_sent) > len(sentence)-25):
                paraphrased_text.append(p_sent)
    except:
        print(sentence)
        continue

100%|██████████████████████████████████████████████████████████████████████████████████| 44/44 [00:29<00:00,  1.50it/s]


In [23]:
len(paraphrased_text)

1068

In [29]:
final_text = []
final_text += text
final_text += paraphrased_text

In [31]:
len(final_text)

1564

In [38]:
final_text.append("The Bus starts from French Guiana at 02:04 PM and goes to Yemen reaching at 4:15 PM")
final_text.append("The bus arrives at Yemen by 4:15 PM after departing from French Guiana at 4:15 PM")

In [39]:
len(final_text)

1566

In [40]:
paraphrased_df = pd.DataFrame({'text': final_text})

In [41]:
paraphrased_df.head()

Unnamed: 0,text
0,The Bus starts from Rwanda at 02:51 AM and goe...
1,The bus arrives at Switzerland by 8:13 AM afte...
2,The Bus starts from United Kingdom at 09:58 AM...
3,The bus arrives at United States Minor Outlyin...
4,The Bus starts from United Arab Emirates at 06...


In [42]:
paraphrased_df.to_csv("../Dataset/paraphrased_data.csv", index=False)

In [24]:
len(paraphrased_text)

100

In [32]:
final_text = []
final_text += text[450:]
final_text += paraphrased_text

In [33]:
len(final_text)

144

In [34]:
final_text.append("The Bus starts from Palau at 04:15 PM and goes to Sweden reaching at 4:45 PM")
final_text.append("The bus arrives at Sweden by 4:45 PM after departing from Palau at 4:45 PM")

In [35]:
final_text

['The Bus starts from Holy See (Vatican City State) at 01:30 PM and goes to Bouvet Island reaching at 4:46 PM',
 'The bus arrives at Bouvet Island by 4:46 PM after departing from Holy See (Vatican City State) at 4:46 PM',
 'The Bus starts from Bangladesh at 06:22 PM and goes to Afghanistan reaching at 8:01 PM',
 'The bus arrives at Afghanistan by 8:01 PM after departing from Bangladesh at 8:01 PM',
 'The Bus starts from Mozambique at 09:27 AM and goes to Bouvet Island reaching at 11:19 AM',
 'The bus arrives at Bouvet Island by 11:19 AM after departing from Mozambique at 11:19 AM',
 'The Bus starts from Jamaica at 11:36 PM and goes to New Caledonia reaching at 3:02 AM',
 'The bus arrives at New Caledonia by 3:02 AM after departing from Jamaica at 3:02 AM',
 'The Bus starts from Holy See (Vatican City State) at 08:13 AM and goes to Montserrat reaching at 12:07 PM',
 'The bus arrives at Montserrat by 12:07 PM after departing from Holy See (Vatican City State) at 12:07 PM',
 'The Bus star

In [36]:
len(final_text)

146

In [37]:
paraphrased_df = pd.DataFrame({'text': final_text})

In [38]:
paraphrased_df

Unnamed: 0,text
0,The Bus starts from Holy See (Vatican City Sta...
1,The bus arrives at Bouvet Island by 4:46 PM af...
2,The Bus starts from Bangladesh at 06:22 PM and...
3,The bus arrives at Afghanistan by 8:01 PM afte...
4,The Bus starts from Mozambique at 09:27 AM and...
...,...
141,The bus leaves Hungary at 10:50 AM and arrives...
142,The bus leaves Hungary at 10:50 AM and arrives...
143,The bus leaves Hungary at 10:50 AM and arrives...
144,The Bus starts from Palau at 04:15 PM and goes...


In [39]:
df = pd.read_csv('../Dataset/paraphrased_data.csv')

In [40]:
df

Unnamed: 0,text
0,The Bus starts from Rwanda at 02:51 AM and goe...
1,The bus arrives at Switzerland by 8:13 AM afte...
2,The Bus starts from United Kingdom at 09:58 AM...
3,The bus arrives at United States Minor Outlyin...
4,The Bus starts from United Arab Emirates at 06...
...,...
1561,The bus leaves Ethiopia at 9:54 AM and arrives...
1562,The bus leaves Ethiopia at 9:54 AM and arrives...
1563,The bus arrives at Switzerland at 9:54 AM afte...
1564,The Bus starts from French Guiana at 02:04 PM ...


In [42]:
final_df = pd.concat([df, paraphrased_df])

In [44]:
final_df.to_csv('../Dataset/final_paraphrased_df.csv', index=False)

In [45]:
final_df

Unnamed: 0,text
0,The Bus starts from Rwanda at 02:51 AM and goe...
1,The bus arrives at Switzerland by 8:13 AM afte...
2,The Bus starts from United Kingdom at 09:58 AM...
3,The bus arrives at United States Minor Outlyin...
4,The Bus starts from United Arab Emirates at 06...
...,...
141,The bus leaves Hungary at 10:50 AM and arrives...
142,The bus leaves Hungary at 10:50 AM and arrives...
143,The bus leaves Hungary at 10:50 AM and arrives...
144,The Bus starts from Palau at 04:15 PM and goes...
