In [1]:
# References:
# https://github.com/bentrevett/pytorch-seq2seq/blob/master/1%20-%20Sequence%20to%20Sequence%20Learning%20with%20Neural%20Networks.ipynb

In [2]:
import torch
from torch import nn, optim
from torchtext.legacy.data import Field, BucketIterator, TabularDataset
from sklearn.model_selection import train_test_split
import numpy as np
import unicodedata
import re
import pandas as pd
import os
import random
import spacy
import shutil

In [3]:
# Prerequisite (run once per environment to install the spaCy pipelines):
#   !python -m spacy download fr_core_news_sm
#   !python -m spacy download en_core_web_sm
# The French pipeline is loaded first, then the English one.
spacy_fr, spacy_eng = (
    spacy.load(model_name)
    for model_name in ('fr_core_news_sm', 'en_core_web_sm')
)

In [4]:
# Folder that holds the dataset CSV. It defaults to the original local path but
# can be overridden with the ENG_FR_DATA_DIR environment variable, so the
# notebook can run on another machine without editing this cell.
path = os.environ.get("ENG_FR_DATA_DIR", "D:/Datasets/Eng-French Translation")
# Every relative path used later in the notebook is resolved against this folder.
os.chdir(path)

In [5]:
# Read the sentence-pair table; the relative file name is resolved against the
# working directory selected by the os.chdir call earlier in the notebook.
df = pd.read_csv('eng_-french.csv')
# Replace the file's own header with the two labels the rest of the notebook
# indexes by. NOTE(review): this assumes the CSV has exactly two columns,
# English first and French second - confirm against the file.
df.columns = ['english', 'french']

In [6]:
def unicode2Ascii(s):
    """Remove accents from ``s`` by dropping Unicode combining marks.

    The string is decomposed with NFD normalization, so an accented letter
    becomes a base letter followed by its combining mark(s); every character
    whose Unicode category is ``Mn`` is then discarded. Characters that have
    no such decomposition are returned unchanged.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def normalizeString(s):
    """Normalize one sentence into lowercase, space-separated plain text.

    Steps, in order:
      1. lowercase, strip surrounding whitespace and remove accents
         (via ``unicode2Ascii``);
      2. put a space in front of every ``.``, ``!`` or ``?`` so each
         punctuation mark is separated from the preceding word;
      3. replace every run of characters other than letters and ``. ! ?``
         with a single space;
      4. strip the leading/trailing space that steps 2-3 can leave behind.

    Parameters
    ----------
    s : str
        Raw sentence.

    Returns
    -------
    str
        The normalized sentence.
    """
    s = unicode2Ascii(s.lower().strip())
    # The replacement must contain the space: r"\1" alone substitutes each
    # match with itself and leaves the string unchanged.
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    # Without this, a sentence that starts or ends with a removed character
    # keeps a stray space, which yields an empty piece under split(' ').
    return s.strip()

In [7]:
# Normalize every sentence in both language columns, one cell value at a time.
df['english'] = df['english'].apply(normalizeString)
df['french'] = df['french'].apply(normalizeString)

In [8]:
# A sentence is kept only if it splits into fewer than this many pieces.
MAX_LENGTH = 10

def filter_sentence(rows):
    """Return ``rows`` when both sentences are short enough, otherwise NaN.

    ``rows`` is indexed with the keys ``'english'`` and ``'french'``; each
    value is split on single spaces and the number of pieces is compared with
    ``MAX_LENGTH``. The French sentence is only inspected when the English
    one passes.

    Returns
    -------
    The same ``rows`` object if both counts are below ``MAX_LENGTH``,
    ``np.nan`` otherwise.
    """
    too_long = (
        len(rows['english'].split(' ')) >= MAX_LENGTH
        or len(rows['french'].split(' ')) >= MAX_LENGTH
    )
    return np.nan if too_long else rows

# Keep only the rows that filter_sentence accepts.
# filter_sentence hands back the row itself (a Series) for a kept pair and a
# scalar NaN for a rejected one. Turning that into one boolean per row gives
# DataFrame.apply a uniform return type, instead of leaving the shape of a
# mixed Series/NaN result to pandas' inference.
# Rejected rows are dropped here rather than blanked out with NaN, so running
# this cell a second time works (a NaN cell has no .split method); the
# dropna() in the following cell still yields the same frame.
keep_row = df.apply(lambda row: isinstance(filter_sentence(row), pd.Series), axis = 'columns')
df = df[keep_row]

In [9]:
# Discard any row that contains a missing value, then renumber the remaining
# rows from 0 (the old index is thrown away, not kept as a column).
df_sample = df.dropna()
df_sample = df_sample.reset_index(drop = True)

In [10]:
# Number of sentence pairs to keep for this experiment.
n_samples = 1500

# Seed NumPy, Python's random module and PyTorch with the same value.
# The NumPy seed is set before the sample() call below, which is given no
# random_state of its own.
np.random.seed(1234)
random.seed(1234)
torch.manual_seed(1234)

# Draw n_samples rows without replacement and renumber them from 0.
df_sample = df_sample.sample(n = n_samples).reset_index(drop = True)

In [11]:
# tokenizers
def french_tokenizer(text):
    """Tokenize ``text`` with the ``spacy_fr`` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_fr.tokenizer(text):
        pieces.append(token.text)
    return pieces

def english_tokenizer(text):
    """Tokenize ``text`` with the ``spacy_eng`` tokenizer and return the token strings as a list."""
    pieces = []
    for token in spacy_eng.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [12]:
# Both languages use the same Field configuration and differ only in the
# tokenizer: sequential text, lower-cased, wrapped in <sos> ... <eos> markers.
# The English field is built first, then the French one.
ENGLISH_TEXT, FRENCH_TEXT = (
    Field(
        sequential = True,
        tokenize = language_tokenizer,
        lower = True,
        init_token = "<sos>",
        eos_token = "<eos>",
    )
    for language_tokenizer in (english_tokenizer, french_tokenizer)
)

In [13]:
# Train / validation split: a quarter of the sampled pairs is held out for
# validation. Rows are shuffled with a fixed random_state before splitting.
train, valid = train_test_split(
    df_sample,
    test_size = 0.25,
    random_state = 1234,
    shuffle = True,
)

# Report the (rows, columns) shape of each split.
print("Train : ", train.shape)
print("Valid : ", valid.shape)

Train :  (1125, 2)
Valid :  (375, 2)


In [14]:
# writing train and valid files into folder
def _write_split_csv(frame, filename, folder = "inputs"):
    """Save ``frame`` as a CSV inside ``folder`` and report each step.

    The file is first written (without the index column) to ``filename`` in
    the current working directory and then moved to ``folder/filename`` with
    ``shutil.move``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to save.
    filename : str
        File name of the CSV, e.g. ``"train.csv"``.
    folder : str
        Existing directory, relative to the working directory, that receives
        the file.
    """
    frame.to_csv(filename, index = False)
    print(f"{filename} written successfully")
    shutil.move(filename, f"{folder}/{filename}")
    print(f"{filename} moved successfully")


# Create the output folder on first use. Whether or not it already existed,
# both splits are written below so the files always match the current
# train/valid frames. (The previous first-run guard tested "/inputs/...",
# which is a path at the filesystem root rather than inside this folder.)
if not os.path.exists("inputs"):
    os.mkdir("inputs")
    print("inputs folder created succesfully")
else:
    print("Folder already exists")

_write_split_csv(train, "train.csv")
_write_split_csv(valid, "valid.csv")

Folder already exists
train.csv written successfully
train.csv moved successfully
valid.csv written successfully
valid.csv moved successfully
