In [1]:
pip install indic-nlp-library

Note: you may need to restart the kernel to use updated packages.


In [1]:
import numpy as np
import pandas as pd
import indicnlp
import string
import sys
import re
from unicodedata import normalize
from indicnlp import common
from indicnlp import loader
from indicnlp.normalize.indic_normalize import IndicNormalizerFactory
from indicnlp.tokenize import indic_tokenize

In [2]:
# Quick check of the ASCII-folding recipe: after NFC the accented letter is a
# single non-ASCII code point, so encoding with errors='ignore' drops it and
# an empty string is printed.
line = 'á'
line = normalize('NFC', line)
line = line.encode('ASCII', 'ignore').decode('utf-8')
print(line)




In [3]:
df = pd.read_csv('raw_data.csv',encoding='utf-8')

In [4]:
df.head()

Unnamed: 0,English,Hindi
0,Sharaabi,शराबी
1,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
2,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
3,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
4,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते


In [5]:
df.shape

(327637, 2)

In [6]:
df.isna().sum()

English    0
Hindi      0
dtype: int64

In [7]:
df['English'][1234]

'Username and/or password invalid. Please try again'

In [8]:
df['Hindi'][1234]

'उपयोक्तानाम और/या कूटशब्द अवैध. कृपया फिर कोशिश करें'

In [9]:
def clean_text(text):
    """Strip punctuation and ASCII digits from `text` and tidy its whitespace.

    Steps, in order:
      1. Word separators (comma, full stop, pipe, danda, slash, parentheses,
         '!', '"', '_', '?', backslash) become a single space so the words on
         either side stay apart.
      2. ';', ':' and '-' are removed outright (neighbouring text is joined).
      3. Doubled curly quotes are replaced by a space.
      4. Every remaining ASCII punctuation character and ASCII digit is removed.
      5. Runs of spaces are collapsed and surrounding whitespace is stripped.

    Whitespace is normalised last: removing punctuation afterwards would leave
    double or trailing spaces behind (e.g. 'a & b' -> 'a  b').

    Parameters:
        text (str): raw sentence, English or Hindi.

    Returns:
        str: the cleaned sentence.
    """
    for separator in (',', '.', '|', '।', '/', '(', ')', '!', '"', '_', '?', '\\'):
        text = text.replace(separator, ' ')
    for joiner in (';', ':', '-'):
        text = text.replace(joiner, '')
    text = text.replace(u"‘‘", ' ')
    text = text.replace(u"’’", ' ')

    # Apostrophes, arithmetic signs, '%' and all other ASCII punctuation,
    # plus the ASCII digits 0-9, are dropped without leaving a gap.
    unwanted = set(string.punctuation) | set(string.digits)
    text = ''.join(ch for ch in text if ch not in unwanted)

    text = re.sub(' +', ' ', text)  # collapse runs of spaces between words
    return text.strip()  # remove leading and trailing whitespace

In [10]:
clean_text('उपयोक्तानाम और/या कूटशब्द अवैध. कृपया फिर कोशिश करें|')

'उपयोक्तानाम और या कूटशब्द अवैध कृपया फिर कोशिश करें'

In [11]:
clean_text('Username and/or password invalid. Please try again')

'Username and or password invalid Please try again'

In [12]:
def preprocess_english(line):
    """Lower-case, clean and ASCII-fold one English sentence.

    The sentence is lower-cased, passed through `clean_text`, reduced to ASCII,
    and finally filtered so that only purely alphabetic tokens remain.

    Parameters:
        line (str): raw English sentence.

    Returns:
        str: the surviving tokens joined by single spaces (may be empty).
    """
    line = clean_text(line.lower())
    # NFD splits an accented letter into base letter + combining mark, so the
    # ASCII encode below keeps the base letter ('café' -> 'cafe'). NFC would
    # keep the letter composed and the whole character would be dropped.
    # Characters with no ASCII base (e.g. Greek letters) are still ignored.
    line = normalize('NFD', line).encode('ASCII', 'ignore')
    line = line.decode('utf-8')
    # isalpha() is False for any token containing a non-letter, so such
    # tokens are discarded entirely.
    return ' '.join(word for word in line.split() if word.isalpha())

In [13]:
preprocess_english('Username and/or password invalid. Please try again')

'username and or password invalid please try again'

In [14]:
_HINDI_NORMALIZER = None


def _get_hindi_normalizer():
    """Create the Hindi normalizer on first use and reuse it afterwards."""
    global _HINDI_NORMALIZER
    if _HINDI_NORMALIZER is None:
        factory = IndicNormalizerFactory()
        _HINDI_NORMALIZER = factory.get_normalizer(language = "hi",remove_nuktas=False)
    return _HINDI_NORMALIZER


def preprocess_hindi(line):
    """Clean, normalize and tokenize one Hindi sentence and add boundary markers.

    Latin letters are removed, the text goes through `clean_text`, is
    normalized with the Indic NLP Hindi normalizer (nuktas kept), tokenized,
    and any token containing a digit is dropped.

    Parameters:
        line (str): raw Hindi sentence.

    Returns:
        str: 'START_ <tokens joined by spaces> _END'.
    """
    line = re.sub('[a-zA-Z]','',line)
    line = clean_text(line)
    # The normalizer is built once instead of once per sentence.
    line = _get_hindi_normalizer().normalize(line)
    tokens = list(indic_tokenize.trivial_tokenize(line,lang="hi"))
    # \d also matches non-ASCII digits, which clean_text does not remove.
    tokens = [word for word in tokens if not re.search(r'\d', word)]
    return 'START_ '+ ' '.join(tokens) + ' _END'

In [15]:
preprocess_hindi('यह एक छोटी तीन गुम्बद वाली तराशे हुए श्वेत संगमर्मर से निर्मित है।')

'START_ यह एक छोटी तीन गुम्बद वाली तराशे हुए श्वेत संगमर्मर से निर्मित है _END'

In [16]:
def process_text(df):
    """Preprocess every sentence pair in `df`.

    Rows are walked positionally, so the frame's index does not need to be
    0..n-1 (label lookups such as df['English'][i] fail or pick the wrong row
    on a filtered or shuffled frame).

    Parameters:
        df (pandas.DataFrame): frame with 'English' and 'Hindi' string columns.

    Returns:
        list[tuple[str, str]]: (cleaned English, cleaned Hindi) per row, in row order.
    """
    return [
        (preprocess_english(eng_raw), preprocess_hindi(hin_raw))
        for eng_raw, hin_raw in zip(df['English'], df['Hindi'])
    ]

In [17]:
# Clean every sentence pair, then wrap the (english, hindi) tuples in a frame;
# the columns are positional (0 and 1) at this point.
clean_data = process_text(df)

cleaned_df = pd.DataFrame(data=clean_data)

In [18]:
cleaned_df.head()

Unnamed: 0,0,1
0,sharaabi,START_ शराबी _END
1,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...
2,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...
3,this percentage is even greater than the perce...,START_ यह प्रतिशत भारत में हिन्दुओं प्रतिशत से...
4,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...


In [19]:
# Give the positional columns descriptive names.
cleaned_df = cleaned_df.rename(columns={0: 'english', 1: 'hindi'})

In [20]:
cleaned_df.head()

Unnamed: 0,english,hindi
0,sharaabi,START_ शराबी _END
1,politicians do not have permission to do what ...,START_ राजनीतिज्ञों के पास जो कार्य करना चाहिए...
2,id like to tell you about one such child,START_ मई आपको ऐसे ही एक बच्चे के बारे में बता...
3,this percentage is even greater than the perce...,START_ यह प्रतिशत भारत में हिन्दुओं प्रतिशत से...
4,what we really mean is that theyre bad at not ...,START_ हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...


In [21]:
cleaned_df.isna().sum()

english    0
hindi      0
dtype: int64

In [22]:
# Spot-check rows 100-109 of the cleaned data: English line, then its Hindi pair.
for i in range(100, 110):
    for column in ('english', 'hindi'):
        print(cleaned_df[column][i])

list of bollywood films of
START_ की बॉलीवुड फिल्में _END
stack overflow
START_ स्टैक ओवरफ़्लो _END
humans destroyed the commons that they depended on
START_ मानवों ने उन ही साझे संसाधनों को नष्ट किया जिन पर वो आधारित थे _END
almost goes to e but otherwise the play would be over
START_ रचना करीब करीब ई तक जाती है मगर तब तो नाटक ख़त्म हो जाएगा _END
failed to activate configuration server s
START_ विन्यास सर्वर को सक्रिय करने में विफल _END
aryans did not make any statues or temples for deities
START_ आर्य देवताओं की कोई मूर्ति या मन्दिर नहीं बनाते थे _END
your system uses an arm cpu that is older than the armv architecture all packages in karmic were built with optimizations requiring armv as the minimal architecture it is not possible to upgrade your system to a new elementary os release with this hardware
START_ आपका तंत्र सीपीयु का उपयोग कर रहा है जो संरचना से ज्यादा पुराना है कार्मिक के सभी पैकेज इस आशावादिता के साथ बनाए गए है कि न्यूनतम संरचना की आवश्यकता होगी इन हार्डवेयर के साथ उब

In [23]:
# The same rows 100-109 from the raw data, for comparison with the cleaned text.
for i in range(100, 110):
    for column in ('English', 'Hindi'):
        print(df[column][i])

List of Bollywood films of 2013
2013 की बॉलीवुड फिल्में
Stack overflow
स्टैक ओवरफ़्लो
humans destroyed the commons that they depended on.
मानवों ने उन ही साझे संसाधनों को नष्ट किया जिन पर वो आधारित थे।
Almost goes to E, but otherwise the play would be over.
रचना करीब करीब ई तक जाती है, मगर तब तो नाटक ख़त्म हो जाएगा.
Failed to activate configuration server: %s 
विन्यास सर्वर को सक्रिय करने में विफल: %s
Aryans did not make any statues or temples for deities.
आर्य देवताओं की कोई मूर्ति या मन्दिर नहीं बनाते थे।
Your system uses an ARM CPU that is older than the ARMv6 architecture. All packages in karmic were built with optimizations requiring ARMv6 as the minimal architecture. It is not possible to upgrade your system to a new elementary OS release with this hardware.
आपका तंत्र ARM सीपीयु का उपयोग कर रहा है जो ARMv6 संरचना से ज्यादा पुराना है. 'कार्मिक' के सभी पैकेज इस आशावादिता के साथ बनाए गए है कि न्यूनतम संरचना ARMv6 की आवश्यकता होगी. इन हार्डवेयर के साथ उबुन्टू के नए प्रकाशन द्वारा अप

In [24]:
df['English'][107]

'2021-01-01 00:00:00'

In [25]:
len(cleaned_df['english'][107].split())

0

In [26]:
cleaned_df.shape

(327637, 2)

In [34]:
# Collect the index labels of sentence pairs that are unusable for training:
# either side is empty after cleaning, or either side is longer than the cap.
MAX_SENTENCE_WORDS = 100

total_size = len(cleaned_df)

# Word counts are computed once per column instead of re-splitting each
# sentence several times inside a Python loop.
eng_word_counts = cleaned_df['english'].str.split().str.len()
hin_word_counts = cleaned_df['hindi'].str.split().str.len()

is_empty = (eng_word_counts == 0) | (hin_word_counts == 0)
is_too_long = (eng_word_counts > MAX_SENTENCE_WORDS) | (hin_word_counts > MAX_SENTENCE_WORDS)

drop_rows = cleaned_df.index[(is_empty | is_too_long).to_numpy()].tolist()

In [35]:
cleaned_df.shape

(327636, 2)

In [36]:
# Remove the flagged rows; the index keeps its gaps until it is reset.
cleaned_df = cleaned_df.drop(index=drop_rows)

In [37]:
# Renumber the rows 0..n-1 and discard the old index rather than keeping it as a column.
cleaned_df = cleaned_df.reset_index(drop=True)

In [38]:
cleaned_df.shape

(324562, 2)

In [39]:
def _unique_words(sentences):
    """Return the distinct whitespace-separated words of `sentences` in first-seen order."""
    # dict keys give constant-time membership checks and keep insertion order;
    # testing `word not in some_list` rescans the list for every token.
    return list(dict.fromkeys(
        word for sentence in sentences for word in sentence.split()
    ))


eng_vocab = _unique_words(cleaned_df['english'])
hin_vocab = _unique_words(cleaned_df['hindi'])

In [40]:
print(len(eng_vocab))
print(len(hin_vocab))

102460
126135


In [41]:
# Longest sentence, in whitespace-separated words, on each side (0 if there are no rows).
max_eng_len = max((len(sentence.split()) for sentence in cleaned_df['english']), default=0)
max_hin_len = max((len(sentence.split()) for sentence in cleaned_df['hindi']), default=0)

In [42]:
print(max_eng_len)
print(max_hin_len)

100
100


In [42]:
# Index <-> word lookup tables for both languages.
#   eng_tokens / hin_tokens         : index -> word
#   rev_eng_tokens / rev_hin_tokens : word  -> index
# Indices start at 1. The batch arrays in this notebook are zero-initialised,
# so 0 acts as the padding value, and the vocabulary sizes are computed as
# len(vocab) + 1. Starting at 0 would make the first word of each vocabulary
# indistinguishable from padding.
eng_tokens = dict(enumerate(eng_vocab, start=1))
rev_eng_tokens = {word: ind for ind, word in eng_tokens.items()}

hin_tokens = dict(enumerate(hin_vocab, start=1))
rev_hin_tokens = {word: ind for ind, word in hin_tokens.items()}

In [43]:
# Vocabulary sizes with one extra slot each
# (presumably for the padding index 0 -- confirm against the model definition).
encoder_tokens = 1 + len(eng_vocab)
decoder_tokens = 1 + len(hin_vocab)

In [46]:
from sklearn.model_selection import train_test_split

In [47]:
# Hold out 10% of the sentence pairs for testing. A fixed random_state makes
# the split identical on every Restart & Run All.
X, y = cleaned_df.english,cleaned_df.hindi
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
X_train.shape, X_test.shape

((292105,), (32457,))

In [48]:
def generate_batch(X = X_train, y = y_train, batch_size = 12):
    """Yield encoder/decoder training batches forever.

    Uses the lookup tables and limits built earlier in this notebook:
    `rev_eng_tokens` / `rev_hin_tokens` (word -> index), `max_eng_len` /
    `max_hin_len` (sequence lengths) and `decoder_tokens` (one-hot width).

    Parameters:
        X: English sentences (sliceable sequence of str).
        y: Hindi sentences wrapped in START_ / _END, aligned with X.
        batch_size (int): number of sentence pairs per batch.

    Yields:
        ([encoder_input_data, decoder_input_data], decoder_target_data)
        - encoder_input_data: (batch_size, max_eng_len) word indices.
        - decoder_input_data: (batch_size, max_hin_len) word indices of the
          target without its last token.
        - decoder_target_data: (batch_size, max_hin_len, decoder_tokens)
          one-hot target without its first token, i.e. the decoder input
          shifted one step ahead.
        Unused positions stay 0. The final batch of a pass may contain fewer
        real rows than batch_size; the remaining rows are all zeros.
    """
    while True:
        # range(start, stop, step)
        for j in range(0, len(X), batch_size):
            encoder_input_data = np.zeros((batch_size, max_eng_len),dtype='float32')
            decoder_input_data = np.zeros((batch_size, max_hin_len),dtype='float32')
            decoder_target_data = np.zeros((batch_size, max_hin_len, decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(X[j:j+batch_size], y[j:j+batch_size])):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = rev_eng_tokens[word] # encoder input seq
                target_words = target_text.split()
                last_position = len(target_words) - 1
                for t, word in enumerate(target_words):
                    if t < last_position:
                        decoder_input_data[i, t] = rev_hin_tokens[word] # decoder input seq
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, rev_hin_tokens[word]] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

In [49]:
generate_batch(batch_size=1)

<generator object generate_batch at 0x00000178274385C8>

In [50]:
cleaned_df.to_csv('cleaned_df.csv',index=False)