In [1]:
import os
import pickle

import pandas as pd

In [2]:
# Load the training split (path is relative to the notebook's working directory).
# NOTE(review): read_csv treats the first row as the header by default, and later
# cells index this frame by 'shastragaar' / 'शस्त्रागार', which look like data values
# rather than real column titles — confirm whether the file has a header row.
df_train=pd.read_csv('aksharantar_sampled/hin/hin_train.csv')

In [3]:
# Load the validation split (path is relative to the notebook's working directory).
# NOTE(review): later code indexes this frame by 'jaisawal' / 'जयसवाल', which look like
# the first data row being consumed as the header — confirm.
df_valid=pd.read_csv('aksharantar_sampled/hin/hin_valid.csv')

In [4]:
# Load the test split (path is relative to the notebook's working directory).
# NOTE(review): later code indexes this frame by 'thermax' / 'थरमैक्स', which look like
# the first data row being consumed as the header — confirm.
df_test=pd.read_csv('aksharantar_sampled/hin/hin_test.csv')

In [5]:
def hindi_tokenizer(hindi_word):
    """Split a word into a list of its individual characters.

    Each Unicode code point becomes its own token, so combining signs are
    emitted separately from the character they follow.
    """
    return [character for character in hindi_word]

In [6]:
def hindi_alphabet_vocab(df, column='शस्त्रागार'):
    """Build the Hindi character vocabulary from one column of ``df``.

    Parameters
    ----------
    df : mapping of column name -> iterable of words (e.g. a pandas DataFrame)
    column : str
        Name of the column holding the Hindi words. Defaults to the column
        name the training frame is indexed by elsewhere in this notebook.

    Returns
    -------
    list of str
        Distinct characters in first-seen order, followed by the special
        tokens '<' (start), '>' (end) and '.' (padding). Every entry is
        unique, so position in the list can safely be used as an index.
    """
    # A dict keeps insertion order and gives O(1) membership checks, unlike
    # scanning a growing list for every character.
    seen = {}
    for word in df[column]:
        for alphabet in hindi_tokenizer(word):
            seen.setdefault(alphabet, None)

    # Special tokens are only added when absent: a duplicate entry would make
    # the alphabet->index and index->alphabet tables disagree.
    for special_token in ('<', '>', '.'):
        seen.setdefault(special_token, None)
    return list(seen)
        
    
    
        

In [7]:
# Sanity check: size of the Hindi character vocabulary built from the training frame
# (the count includes the special tokens appended by hindi_alphabet_vocab).
len(hindi_alphabet_vocab(df_train))

67

In [8]:
def english_tokenizer(english_word):
    """Break a word into a list containing one entry per character."""
    return [*english_word]

In [9]:
def english_alphabet_vocab(df, column='shastragaar'):
    """Build the English (romanised) character vocabulary from one column of ``df``.

    Parameters
    ----------
    df : mapping of column name -> iterable of words (e.g. a pandas DataFrame)
    column : str
        Name of the column holding the English words. Defaults to the column
        name the training frame is indexed by elsewhere in this notebook.

    Returns
    -------
    list of str
        Distinct characters in first-seen order, followed by the special
        tokens '<' (start), '>' (end) and '.' (padding). Every entry is
        unique, so position in the list can safely be used as an index.
    """
    # A dict keeps insertion order and gives O(1) membership checks, unlike
    # scanning a growing list for every character.
    seen = {}
    for word in df[column]:
        for alphabet in english_tokenizer(word):
            seen.setdefault(alphabet, None)

    # Special tokens are only added when absent: a duplicate entry would make
    # the alphabet->index and index->alphabet tables disagree.
    for special_token in ('<', '>', '.'):
        seen.setdefault(special_token, None)
    return list(seen)

In [10]:
def encode(do_onehot=True):
    """Return a function that encodes a single character.

    Parameters
    ----------
    do_onehot : bool
        True  -> the returned encoder produces a one-hot list.
        False -> the returned encoder produces a plain integer index.

    Returns
    -------
    callable
        ``encoder(alphabet, alphabet_to_index)``. Both encoders look the
        character up in lower-cased form and map characters missing from
        ``alphabet_to_index`` to the extra index ``len(alphabet_to_index)``.
    """
    def one_hot_encode_alphabet(alphabet, alphabet_to_index):
        """One-hot encode ``alphabet``; the last slot flags an unseen character."""
        # One slot per known character plus one trailing slot for unknowns.
        one_hot_vector = [0] * (len(alphabet_to_index) + 1)
        key = alphabet.lower()
        if key in alphabet_to_index:
            one_hot_vector[alphabet_to_index[key]] = 1
        else:
            one_hot_vector[-1] = 1
        return one_hot_vector

    def simple(alphabet, alphabet_to_index):
        """Integer-encode ``alphabet``; unseen characters get ``len(alphabet_to_index)``."""
        try:
            return alphabet_to_index[alphabet.lower()]
        except KeyError:
            # Only a missing vocabulary entry is treated as "unknown"; any other
            # error (e.g. a non-string input) surfaces, as in the one-hot encoder.
            return len(alphabet_to_index)

    return one_hot_encode_alphabet if do_onehot else simple
        

In [11]:
def encode_word(word, alphabet_to_index, encoder=None, max_len=30):
    """Encode a word as a fixed-length sequence of per-character encodings.

    The word is wrapped as '<' + word + '>' and right-padded with '.' up to
    ``max_len`` characters. Words too long to fit are truncated so that the
    start and end markers are kept and the result always has exactly
    ``max_len`` entries (a rectangular shape is needed when the encoded
    words are stacked into one array).

    Parameters
    ----------
    word : str
    alphabet_to_index : dict mapping character -> integer index
    encoder : callable or None
        ``encoder(letter, alphabet_to_index)``; when None, the default
        one-hot encoder returned by ``encode()`` is used.
    max_len : int
        Total output length including the two markers (default 30).

    Returns
    -------
    list
        ``max_len`` encoded characters.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for the markers).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold the '<' and '>' markers")
    if encoder is None:
        # Resolved at call time so the default does not depend on definition order.
        encoder = encode()

    # Reserve two positions for the markers, then pad with the '.' token.
    framed = '<' + word[:max_len - 2] + '>'
    padded = framed.ljust(max_len, '.')
    return [encoder(letter, alphabet_to_index) for letter in padded]

In [12]:
import numpy as np

In [13]:
def _encode_pairs(df, english_column, hindi_column,
                  english_alphabet_to_index, hindi_alphabet_to_index, encoder):
    """Encode every (English, Hindi) word pair of ``df``; returns ``(inputs, outputs)`` lists."""
    inputs, outputs = [], []
    if df is None:
        return inputs, outputs
    for eng_word, hindi_word in zip(df[english_column], df[hindi_column]):
        inputs.append(encode_word(eng_word, english_alphabet_to_index, encoder=encoder))
        outputs.append(encode_word(hindi_word, hindi_alphabet_to_index, encoder=encoder))
    return inputs, outputs


def dataset_generator(df_train, df_val=None, df_test=None, do_onehot=True):
    """Encode the train/validation/test frames and save them as ``.npy`` files.

    The character vocabularies are built from ``df_train`` only; characters
    that appear only in the other splits are handled by the encoder's
    "unknown" index.

    Parameters
    ----------
    df_train : DataFrame with the training word pairs.
    df_val, df_test : DataFrame or None
        Optional splits; when None, empty arrays are saved for that split.
    do_onehot : bool
        True  -> one-hot encoding, arrays written to ``one_hot_data/`` and the
                 vocabulary lookup tables pickled into ``vocab_tools/``.
        False -> integer-index encoding, arrays written to ``simple_data/``
                 (the vocabulary pickles are not written in this mode).

    Returns
    -------
    None. The results are the files written to disk.
    """
    hindi_vocab = hindi_alphabet_vocab(df_train)
    english_vocab = english_alphabet_vocab(df_train)

    # One-hot encoding avoids implying that characters with nearby integer
    # ids (e.g. a=1, b=2) are "closer" to each other than distant ones.
    hindi_alphabet_to_index = {alphabet: index for index, alphabet in enumerate(hindi_vocab)}
    index_to_hindi_alphabet = dict(enumerate(hindi_vocab))

    english_alphabet_to_index = {alphabet: index for index, alphabet in enumerate(english_vocab)}
    index_to_english_alphabet = dict(enumerate(english_vocab))

    if do_onehot:
        # Create the target directory so a fresh checkout does not fail on open().
        os.makedirs('vocab_tools', exist_ok=True)
        lookup_tables = {
            'hindi_alphabet_to_index': hindi_alphabet_to_index,
            'index_to_hindi_alphabet': index_to_hindi_alphabet,
            'english_alphabet_to_index': english_alphabet_to_index,
            'index_to_english_alphabet': index_to_english_alphabet,
        }
        for table_name, table in lookup_tables.items():
            with open(f'vocab_tools/{table_name}.pickle', 'wb') as f:
                pickle.dump(table, f)

    # Build the per-character encoder once instead of once per word.
    encoder = encode(do_onehot)

    # NOTE(review): each frame is indexed by a different pair of column names,
    # which look like the first data row of each CSV being used as its header
    # — confirm against how the frames are loaded.
    training_input, training_output = _encode_pairs(
        df_train, 'shastragaar', 'शस्त्रागार',
        english_alphabet_to_index, hindi_alphabet_to_index, encoder)
    valid_input, valid_output = _encode_pairs(
        df_val, 'jaisawal', 'जयसवाल',
        english_alphabet_to_index, hindi_alphabet_to_index, encoder)
    test_input, test_output = _encode_pairs(
        df_test, 'thermax', 'थरमैक्स',
        english_alphabet_to_index, hindi_alphabet_to_index, encoder)

    out_dir = 'one_hot_data' if do_onehot else 'simple_data'
    os.makedirs(out_dir, exist_ok=True)

    encoded_splits = {
        'train': (training_input, training_output),
        'val': (valid_input, valid_output),
        'test': (test_input, test_output),
    }
    for split_name, (inputs, outputs) in encoded_splits.items():
        np.save(f'{out_dir}/X_{split_name}.npy', inputs)
        np.save(f'{out_dir}/y_{split_name}.npy', outputs)

    return

    
    
    
    
    
    

    
    
    
    
    
    
    

In [14]:
def get_alphabet_from_encode(one_hot_vector_or_index, index_to_alphabet, do_onehot=True):
    """Decode one encoded character back to its alphabet.

    Parameters
    ----------
    one_hot_vector_or_index : sequence of 0/1 values (when ``do_onehot``) or int
        For one-hot input, the position of the first entry equal to 1 is used.
        Any iterable is accepted (list, tuple, numpy array, ...).
    index_to_alphabet : mapping (or list) from index to character
    do_onehot : bool
        Whether the first argument is a one-hot vector or already an index.

    Returns
    -------
    str
        The decoded character, or a single space ``' '`` when the index has no
        entry in ``index_to_alphabet`` (e.g. the "unknown character" slot).

    Raises
    ------
    ValueError
        If a one-hot vector contains no entry equal to 1.
    """
    if do_onehot:
        # list(...) so inputs without an .index method (numpy arrays) also work.
        index = list(one_hot_vector_or_index).index(1)
    else:
        index = one_hot_vector_or_index

    try:
        return index_to_alphabet[index]
    except (KeyError, IndexError):
        # Only a missing entry means "unknown"; other errors are not hidden.
        return ' '



In [15]:
def word_from_vecs(vecs, index_to_alphabet, do_onehot=True):
    """Decode a sequence of encoded characters back into a word.

    The sequence must start with the '<' marker. Characters are collected
    until the '>' marker is met; anything after it is ignored.

    Parameters
    ----------
    vecs : sequence of one-hot vectors (``do_onehot=True``) or integer indices
    index_to_alphabet : mapping from index to character
    do_onehot : bool
        Encoding used by ``vecs``; forwarded to ``get_alphabet_from_encode``.

    Returns
    -------
    str or None
        The decoded word. If no '>' marker is present, every character after
        '<' is joined and returned (still as a string). Returns None, after
        printing "Invalid Word", when ``vecs`` is empty or does not start
        with '<'.
    """
    # len() rather than truthiness so numpy arrays are handled too.
    if len(vecs) == 0 or get_alphabet_from_encode(vecs[0], index_to_alphabet, do_onehot) != '<':
        print("Invalid Word")
        return None

    letters = []
    for position in range(1, len(vecs)):
        letter = get_alphabet_from_encode(vecs[position], index_to_alphabet, do_onehot)
        if letter == '>':
            break
        letters.append(letter)
    # Always a string, including when the end marker was never found.
    return ''.join(letters)
            
    

In [16]:
# Encode all three splits with the default one-hot encoding; this writes the
# one_hot_data/*.npy arrays and the vocab_tools/*.pickle lookup tables.
dataset_generator(df_train,df_valid,df_test)

In [17]:
# Encode the same splits as integer indices; this writes the simple_data/*.npy
# arrays (the vocabulary pickles are only written in the one-hot run).
dataset_generator(df_train,df_valid,df_test,do_onehot=False)