# English To French Translation

- [Dataset Link](https://www.kaggle.com/datasets/soupmonster/english-french)

## Prepare the Notebook

In [4]:
# Install Packages

!pip install pytorch-lightning torchmetrics torchviz -q

[0m

In [33]:
# Import Packages

import re
import os
import string
import numpy as np
import pandas as pd
from string import digits
from unicodedata import normalize

import torch
import torch.nn as nn

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every Python warning for the rest of the session (this also hides
# deprecation notices from the libraries imported above).
warnings.filterwarnings('ignore')

# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Global plot styling: seaborn dark grid, (12, 6) figure size, font size 18.
sns.set_style('darkgrid')
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['font.size'] = 18

In [13]:
# Load Dataset

# CSV location under the Kaggle input directory.
filePath = '/kaggle/input/en-fr-translation-dataset/en-fr.csv'

# Read only the first 100,000 rows, then preview the top five of them.
df = pd.read_csv(filePath, nrows=100_000)
df.head()

Unnamed: 0,en,fr
0,Changing Lives | Changing Society | How It Wor...,Il a transformé notre vie | Il a transformé la...
1,Site map,Plan du site
2,Feedback,Rétroaction
3,Credits,Crédits
4,Français,English


## Exploratory Data Analysis

In [15]:
# Column dtypes and non-null counts; a count below the number of entries
# means that column contains missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   en      99999 non-null   object
 1   fr      100000 non-null  object
dtypes: object(2)
memory usage: 1.5+ MB


In [25]:
# Show the rows whose English text is missing (NaN).
df.loc[df.en.isna(), 'en']

2807    NaN
Name: en, dtype: object

In [27]:
# Drop every row that has a missing value in any column, then re-check the
# non-null counts. Note: `df` is rebound here, so the index keeps its gaps.
df = df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 99999 entries, 0 to 99999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   en      99999 non-null  object
 1   fr      99999 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


## Preprocessing

In [34]:
# Get Device - (GPU or CPU)


def getDevice() -> torch.device:
    """Return the torch device to run on.

    Returns:
        ``torch.device('cuda')`` when CUDA is available, otherwise
        ``torch.device('cpu')``.
    """
    if torch.cuda.is_available():
        return torch.device('cuda')
    return torch.device('cpu')

# Resolve the device once and keep it as a notebook-wide constant.
DEVICE = getDevice()

print(f"Default Device: {DEVICE}")

Default Device: cuda


In [44]:
# Clean Lines

from typing import List

def cleanLinesProcess(lines) -> List:
    """Normalise an iterable of text lines into lowercase ASCII words.

    Each entry is converted with ``str()``, decomposed with Unicode NFD and
    encoded to ASCII while ignoring unencodable characters (so accents are
    removed), then split on whitespace. Every token is lowercased, stripped
    of ``string.punctuation`` and of any character outside
    ``string.printable``, and kept only if it is purely alphabetic.

    Args:
        lines: Iterable of values; each one is passed through ``str()``.

    Returns:
        A list with one cleaned, space-joined string per input entry.
    """
    nonPrintable = re.compile('[^%s]' % re.escape(string.printable))
    punctuationRemover = str.maketrans('', '', string.punctuation)

    def _cleanOne(entry) -> str:
        # NFD splits accented letters into base letter + combining mark; the
        # ASCII round-trip then discards the combining marks.
        asciiText = normalize('NFD', str(entry)).encode('ascii', 'ignore').decode('UTF-8')

        keptTokens = []
        for token in asciiText.split():
            token = token.lower().translate(punctuationRemover)
            token = nonPrintable.sub('', token)
            # Tokens containing digits (or left empty) are discarded.
            if token.isalpha():
                keptTokens.append(token)
        return ' '.join(keptTokens)

    return [_cleanOne(entry) for entry in lines]

# Run both language columns through the same cleaning routine.
df['en'] = cleanLinesProcess(df['en'])
df['fr'] = cleanLinesProcess(df['fr'])

In [45]:
df.sample(10)

Unnamed: 0,en,fr
28283,the influx of tourists to spain during the sum...,lafflux de touristes en espagne durant lete et...
15384,two other countries provided cad worth of prod...,vingt et un autres pays ont fourni pour millio...
69663,their families at home rarely had access to ba...,leurs familles a la maison avaient rarement ac...
80751,expanded health hazard alert various brands of...,extension danger pour la sante presence possib...
12783,canadian companies requiring additional inform...,les entreprises canadiennes qui desirent obten...
73471,public hearings see citizen participation,radiocommunication voir television
28657,key contacts canadian contacts canadian embass...,principaux contacts au canada ambassade du can...
40600,the bangkok post vietnam,the bangkok post vietnam
14800,importers are required to obtain special autho...,les importateurs doivent obtenir lautorisation...
39192,item perfect fry deep fryer,article friteuse de comptoir perfect fry


In [47]:
# Split into Training and Validation Sets

# Fraction of rows held out for validation; the remainder is used for training.
valFrac = 0.2
valSplitIdx = int(len(df) * valFrac)

# Shuffle the positional row indices with a fixed seed so the split is the
# same after every kernel restart.
SPLIT_SEED = 42
dataIdx = list(range(len(df)))
np.random.default_rng(SPLIT_SEED).shuffle(dataIdx)

# Validation takes the first `valFrac` share of the shuffled indices and
# training takes the remaining (1 - valFrac) share.
valIdx = dataIdx[:valSplitIdx]
trainIdx = dataIdx[valSplitIdx:]

In [48]:
# Select each split by position and renumber its rows from 0. drop=True
# discards the old index directly, so this does not depend on the old index
# being unnamed (which is what produces a column literally called 'index').
trainData = df.iloc[trainIdx].reset_index(drop=True)
valData = df.iloc[valIdx].reset_index(drop=True)

In [50]:
# Reserved tokens mapped to fixed integer ids.
SPECIAL_CHAR = {
    '<UNX>': 0,  # presumably the unknown-word token (often spelled <UNK>) — confirm against usage
    '<SOS>': 1,  # presumably start-of-sequence — confirm against usage
    '<EOS>': 2,  # presumably end-of-sequence — confirm against usage
    '<PAD>': 3,  # presumably padding — confirm against usage
}