In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so that
# files stored on Drive can be read through ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
import numpy as np
import pandas as pd

import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split as tts


### READING THE DATASET

In [3]:
# Location of the Hindi-English corpus CSV on the mounted Google Drive.
DATASET_CSV = "/content/drive/MyDrive/VIT SEM/WINTER20-21/E1_CSE1015_MACHINELEARNINGESSENTIALS/PROJECT/END/Hindi_English_Truncated_Corpus.csv"

# Read the corpus as UTF-8 so the Devanagari text is decoded correctly.
lines = pd.read_csv(DATASET_CSV, encoding='utf-8')

### PREPROCESSING

In [4]:
# Count how many rows carry each distinct value of the `source` column.
lines['source'].value_counts()

tides        50000
ted          39881
indic2012    37726
Name: source, dtype: int64

WE HAVE 3 SOURCES OF INPUT

In [5]:
# Show the first rows of the freshly loaded frame.
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


#### CLEANING THE DATASET

In [6]:
# Number of missing values in every column.
lines.isna().sum()

source              0
english_sentence    2
hindi_sentence      0
dtype: int64

2 ENGLISH SENTENCES HAVE NULL VALUES


In [7]:
# Keep only the rows that actually have an English sentence.
lines = lines.dropna(subset=['english_sentence'])
lines.shape

(127605, 3)

ROWS WITH NULL VALUES ARE DROPPED


In [8]:
# Discard fully duplicated rows; rebinding the name instead of mutating in
# place gives the same frame contents.
lines = lines.drop_duplicates()
lines.shape

(124827, 3)

DUPLICATE SENTENCES ARE DROPPED

In [9]:
# Lowercase every character in both sentence columns.
for col in ('english_sentence', 'hindi_sentence'):
    lines[col] = lines[col].str.lower()

In [10]:
# Remove quotes: the ASCII apostrophe and both curly double quotes are
# deleted from each sentence column, one pattern at a time.
for col in ('english_sentence', 'hindi_sentence'):
    for quote in ("'", "“", "”"):
        lines[col] = lines[col].apply(lambda text: re.sub(quote, '', text))

In [11]:
# Set of all special characters.
# string.punctuation only contains ASCII punctuation, so the Devanagari
# sentence marks danda (U+0964) and double danda (U+0965) are added
# explicitly; otherwise they stay glued to the last word of a Hindi
# sentence and every such word becomes a separate vocabulary entry.
exclude = set(string.punctuation) | {'\u0964', '\u0965'}

# Remove all the special characters.
# A translation table deletes every excluded character in one pass,
# the same idiom the notebook uses for digit removal.
strip_special = str.maketrans('', '', ''.join(exclude))
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.translate(strip_special))
lines['hindi_sentence'] = lines['hindi_sentence'].apply(lambda x: x.translate(strip_special))

In [12]:
# Remove digits: ASCII digits are deleted from both columns through a
# translation table.
remove_digits = str.maketrans('', '', digits)
for col in ('english_sentence', 'hindi_sentence'):
    lines[col] = lines[col].apply(lambda text: text.translate(remove_digits))

# Devanagari digits are additionally stripped from the Hindi column only.
lines['hindi_sentence'] = lines['hindi_sentence'].apply(
    lambda text: re.sub("[२३०८१५७९४६]", "", text)
)

# Remove extra spaces: trim surrounding whitespace first, then collapse
# each run of spaces into a single space.
for col in ('english_sentence', 'hindi_sentence'):
    lines[col] = lines[col].apply(lambda text: re.sub(" +", " ", text.strip()))

In [13]:
# Wrap every target (Hindi) sentence in the START_ / _END marker tokens.
lines['hindi_sentence'] = 'START_ ' + lines['hindi_sentence'] + ' _END'
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...
1,ted,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...
2,indic2012,this percentage is even greater than the perce...,START_ यह प्रतिशत भारत में हिन्दुओं प्रतिशत से...
3,ted,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...
4,indic2012,the ending portion of these vedas is called up...,START_ इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता...


### EXTRACTING UNIQUE WORDS

In [14]:
### Get English and Hindi Vocabulary
# Each vocabulary is the set of distinct whitespace-separated words that
# occur anywhere in the corresponding sentence column.
all_eng_words = {
    word
    for sentence in lines['english_sentence']
    for word in sentence.split()
}

all_hindi_words = {
    word
    for sentence in lines['hindi_sentence']
    for word in sentence.split()
}

In [15]:
# Word count of every sentence.
# Bare split() is used (instead of split(" ")) so the counts agree with the
# tokenization used when the vocabularies are built: split(" ") reports a
# length of 1 for an empty sentence and counts an empty piece between
# consecutive spaces, which overstates the number of words.
# The Hindi count includes the START_ and _END marker tokens.
lines['length_eng_sentence'] = lines['english_sentence'].str.split().str.len()
lines['length_hin_sentence'] = lines['hindi_sentence'].str.split().str.len()

#### LENGTH VISUALIZATION

In [16]:
# Shape of the subset whose English sentence has more than 100 words.
lines.loc[lines['length_eng_sentence'].gt(100)].shape

(299, 5)

In [17]:
# Shape of the subset whose English sentence has more than 70 words.
lines.loc[lines['length_eng_sentence'].gt(70)].shape

(852, 5)

In [18]:
# Shape of the subset whose English sentence has fewer than 3 words.
lines.loc[lines['length_eng_sentence'].lt(3)].shape

(5213, 5)

In [19]:
# Shape of the subset whose Hindi sentence (START_/_END markers included)
# has more than 20 words.
lines.loc[lines['length_hin_sentence'].gt(20)].shape

(41124, 5)

CHOOSING AN OPTIMUM MAXIMUM SENTENCE LENGTH HELPS TO REDUCE THE MODEL TRAINING TIME

In [20]:
# Report the longest sentence, in words, for each language.
longest_hin = lines['length_hin_sentence'].max()
longest_eng = lines['length_eng_sentence'].max()
print("maximum length of Hindi Sentence ", longest_hin)
print("maximum length of English Sentence ", longest_eng)

maximum length of Hindi Sentence  419
maximum length of English Sentence  398


In [21]:
# Shuffle the rows with a fixed seed so that a fresh "Restart & Run All"
# yields the same row order (and therefore the same downstream results).
SEED = 42
lines = shuffle(lines, random_state=SEED)
lines.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
47381,tides,and so the dreaded kabuliwalla becomes in tago...,START_ इसी तरह एक कठोर काबुलीवाला इसी नाम से र...,22,29
36616,tides,parkinson s disease,START_ पार्किन्सन रोग _END,3,4
117340,ted,until we fix citizenship,START_ जब तक हम नागरिकता को प्रशस्त नहीं करते ...,4,10
90922,indic2012,emission of carbon monoxide dropped from lakh ...,START_ कार्बन मोनोऑक्साइड उत्सर्जन लाख टन से ग...,11,13
65798,indic2012,kalpana was the certified flying trainer and h...,START_ कल्पना जी को हवाईजहाज़ों ग्लाइडरों व व्...,18,21
