In [2]:
import pandas as pd
import numpy as np
from pickle import dump,load

import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim

import tqdm

import sys
sys.path.append('../../')

In [2]:
df=pd.read_csv('../data/raw/english2french.csv')

In [3]:
df

Unnamed: 0,english,french
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !
...,...,...
145432,A carbon footprint is the amount of carbon dio...,Une empreinte carbone est la somme de pollutio...
145433,Death is something that we're often discourage...,La mort est une chose qu'on nous décourage sou...
145434,Since there are usually multiple websites on a...,Puisqu'il y a de multiples sites web sur chaqu...
145435,If someone who doesn't know your background sa...,Si quelqu'un qui ne connaît pas vos antécédent...


In [4]:
#dropping any null present in data
df.dropna(inplace=True)

In [5]:
#Converting columns to lists before cleaning
english=df['english'].to_list()
french=df['french'].to_list()

In [6]:
# Lets import our DfCleaner module
%autoreload 2
from headline_generator.src.preprocessing.cleaning import DfCleaner

In [7]:
cleaner=DfCleaner()

In [8]:
# Cleaning the articles
cleaned_english=cleaner.clean(english,stem=False)


10000 examples cleaned out of 145437
20000 examples cleaned out of 145437
30000 examples cleaned out of 145437
40000 examples cleaned out of 145437
50000 examples cleaned out of 145437
60000 examples cleaned out of 145437
70000 examples cleaned out of 145437
80000 examples cleaned out of 145437
90000 examples cleaned out of 145437
100000 examples cleaned out of 145437
110000 examples cleaned out of 145437
120000 examples cleaned out of 145437
130000 examples cleaned out of 145437
140000 examples cleaned out of 145437
Cleaning Done


In [10]:
# Cleaning the headline
cleaned_french=cleaner.clean(french,stem=False)

10000 examples cleaned out of 145437
20000 examples cleaned out of 145437
30000 examples cleaned out of 145437
40000 examples cleaned out of 145437
50000 examples cleaned out of 145437
60000 examples cleaned out of 145437
70000 examples cleaned out of 145437
80000 examples cleaned out of 145437
90000 examples cleaned out of 145437
100000 examples cleaned out of 145437
110000 examples cleaned out of 145437
120000 examples cleaned out of 145437
130000 examples cleaned out of 145437
140000 examples cleaned out of 145437
Cleaning Done


In [12]:
df1=cleaned_english
df2=cleaned_french

In [15]:
%%time
# Remove frequent and rare words (This might take some time to run)
cleaned_english =cleaner.remove_frequent_rare(cleaned_english,frequent=True,rare=True)
cleaned_french =cleaner.remove_frequent_rare(cleaned_french,frequent=True,rare=True)

10000 examples cleaned out of 145437
20000 examples cleaned out of 145437
30000 examples cleaned out of 145437
40000 examples cleaned out of 145437
50000 examples cleaned out of 145437
60000 examples cleaned out of 145437
70000 examples cleaned out of 145437
80000 examples cleaned out of 145437
90000 examples cleaned out of 145437
100000 examples cleaned out of 145437
110000 examples cleaned out of 145437
120000 examples cleaned out of 145437
130000 examples cleaned out of 145437
140000 examples cleaned out of 145437
Cleaning Done
10000 examples cleaned out of 145437
20000 examples cleaned out of 145437
30000 examples cleaned out of 145437
40000 examples cleaned out of 145437
50000 examples cleaned out of 145437
60000 examples cleaned out of 145437
70000 examples cleaned out of 145437
80000 examples cleaned out of 145437
90000 examples cleaned out of 145437
100000 examples cleaned out of 145437
110000 examples cleaned out of 145437
120000 examples cleaned out of 145437
130000 examples 

In [24]:
# We might have empty strings in our data right now so lets remove them by making a dataframe and replace empty string
# with NaN and then using the dropna() function

df_temp=pd.DataFrame(list(zip(cleaned_english,cleaned_french)),columns=['english','french'])
df_temp['english'].replace('',np.nan,inplace=True)
df_temp['french'].replace('',np.nan,inplace=True)
df_temp.dropna(inplace=True)

cleaned_english=df['english'].tolist()
cleaned_french=df['french'].tolist()

In [27]:
# Dumping the interm data to reuse it and not have to run the above code again
dump(cleaned_english, open('../data/interim/cleaned_english.pkl', 'wb'))
dump(cleaned_french, open('../data/interim/cleaned_french.pkl', 'wb'))

In [3]:
# Loading the data (from now on we will load this data and use it further)
cleaned_english=load(open('../data/interim/cleaned_english.pkl', 'rb'))
cleaned_french=load(open('../data/interim/cleaned_french.pkl', 'rb'))

In [4]:
# Creating the dataframe
data=pd.DataFrame(list(zip(cleaned_english,cleaned_french)),columns=['english','french'])

In [5]:
data.head()

Unnamed: 0,english,french
0,Go.,Va !
1,Run!,Cours !
2,Run!,Courez !
3,Wow!,Ça alors !
4,Fire!,Au feu !


In [6]:
from headline_generator.src.preprocessing.preprocess import PrepareTheData

In [7]:
dataset=PrepareTheData(data,max_vocab=1000)

Tokenization Done
Replacing Done
token2idx created
Added Start and End tokens
tokens_to_indices created
Presprocessing done


In [8]:
# No of examples in our final dataset
len(dataset)

145437

In [9]:
train_size = int(0.999 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

In [10]:
# To clear any cache memory
torch.cuda.empty_cache()

In [None]:
# # You can run this code to see the pytorch memory usage, this can be useful to display periodically during training, or when handling out-of-memory exceptions.
# print(torch.cuda.memory_summary())

In [11]:

# define the device to work on
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:

def collate(batch):
    inputs = [torch.LongTensor(item[0]) for item in batch]
    targets = [torch.LongTensor(item[1]) for item in batch]
    
    # Pad sequencse so that they are all the same length (within one minibatch)
    padded_inputs = pad_sequence(inputs, padding_value=dataset.token2idx_targets[dataset.padding_token], batch_first=True)
    padded_targets = pad_sequence(targets, padding_value=dataset.token2idx_targets[dataset.padding_token], batch_first=True)
    
    # Sort by length for CUDA optimizations
    lengths = torch.LongTensor([len(x) for x in inputs])
    lengths, permutation = lengths.sort(dim=0, descending=True)

    return padded_inputs[permutation].to(device), padded_targets[permutation].to(device), lengths.to(device)


batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate)
test_loader = DataLoader(test_dataset, batch_size=1, collate_fn=collate)

In [27]:
from headline_generator.src.models.encoder_decoder import EncoderDecoder

In [27]:
model = EncoderDecoder(
    inputs_vocab_size=len(dataset.token2idx_inputs),
    targets_vocab_size=len(dataset.token2idx_targets),
    hidden_size=256,
    embedding_dim=100, 
    batch_size=batch_size, 
    targets_start_idx=dataset.token2idx_targets[dataset.start_of_sequence_token],
    targets_stop_idx=dataset.token2idx_targets[dataset.end_of_sequence_token],
).to(device)

optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad], lr=0.001)

# Training loop
model.train()
for epoch in range(10):
    total_loss = total = 0
    progress_bar = tqdm.tqdm_notebook(train_loader, desc='Training', leave=False,total=len(train_dataloader))
    for inputs, targets, lengths in progress_bar:
        # Clean old gradients
        optimizer.zero_grad()

        # Forwards pass
        loss = model(inputs, targets, lengths)

        # Perform gradient descent, backwards pass
        loss.backward()

        # Take a step in the right direction
        optimizer.step()

        # Record metrics
        total_loss += loss.item()
        total += targets.size(1)

    train_loss = total_loss / total
    tqdm.write(f'epoch #{epoch + 1:3d}\ttrain_loss: {train_loss:.2e}\n')

Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  1	train_loss: 4.14e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  2	train_loss: 2.42e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  3	train_loss: 1.90e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  4	train_loss: 1.68e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  5	train_loss: 1.55e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  6	train_loss: 1.45e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  7	train_loss: 1.38e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  8	train_loss: 1.32e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch #  9	train_loss: 1.27e-02



Training:   0%|          | 0/568 [00:00<?, ?it/s]

epoch # 10	train_loss: 1.23e-02



In [29]:
torch.save(model.state_dict(), '../models/model')

In [35]:
# Let's Predict from our test data
model.eval()
total_loss = total = 0
with torch.no_grad():
    for inputs, _, lengths in test_loader:
        print('>', ' '.join([
            dataset.idx2token_inputs[idx] if dataset.idx2token_inputs[idx]!="<UNK>" else ''
            for idx in inputs.cpu()[0].numpy()[1:-1]
        ]))

        # Forwards pass
        outputs = model.predict(inputs, lengths)
        print(' '.join([
            dataset.idx2token_targets[idx] if dataset.idx2token_targets[idx]!="<UNK>" else ''
            for idx in outputs[:-1]
        ]))
        
        print()

> it s been a while hasn t it
cela fait un peu de papier ne l on donc nous sommes pas

> which  are you 
quelle  d  tu 

> tom is a good 
tom est un bon 

> he went out of the room without being  by anyone
il est allé dans la pièce sans lire ne  personne

> there s still so much left to do
il beaucoup encore pour faire

> i don t have enough 
je n ai pas assez d une vie

> what do you plan to do
que tu t  à faire

> that s your half
c est votre photo

> if you act like a child you will be  as such
si tu te prie comme un enfant tu seras du petit

> i felt sorry for you
je  être à votre part

> i  we change clothes first
je n ai  que nous  dimanche à premier premier

> i hate it here
je déteste ça

> she didn t want him to play 
elle n a pas voulu qu il joue au lit

> can you take a little break
est ce que tu  prendre une 

> i have some  in my left hand
je prends des  dans ma main 

> tom is  ever on time
tom  jamais sur le temps

> you can smoke in this room
vous pouvez fumer dans cinq