In [1]:
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from tqdm import tqdm
# Select the compute device: CUDA when torch reports it is available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
def read_and_split_data(train_file, test_file, val_file):
    """Load the three header-less CSV splits and separate their two columns.

    Column 0 of every file is returned as the "english" series and column 1
    as the "marathi" series.

    Every cell is read as a plain string with NA detection disabled, so words
    that happen to spell a pandas missing-value marker ("nan", "null", "NA",
    ...) or that look numeric are kept as text instead of being converted to
    float NaN / int, which would break later ``len(word)`` and per-character
    iteration.

    Args:
        train_file: Path or file-like object for the training split.
        test_file: Path or file-like object for the test split.
        val_file: Path or file-like object for the validation split.

    Returns:
        A 6-tuple of pandas Series in the order
        (english_train, marathi_train, english_test, marathi_test,
        english_val, marathi_val).
    """
    def _load(source):
        # dtype=str + na_filter=False: keep every cell exactly as written.
        return pd.read_csv(source, header=None, dtype=str, na_filter=False)

    # Read data
    train_data = _load(train_file)
    test_data = _load(test_file)
    val_data = _load(val_file)

    # Split into English (column 0) and Marathi (column 1) words
    english_train, marathi_train = train_data.iloc[:, 0], train_data.iloc[:, 1]
    english_test, marathi_test = test_data.iloc[:, 0], test_data.iloc[:, 1]
    english_val, marathi_val = val_data.iloc[:, 0], val_data.iloc[:, 1]

    return (english_train, marathi_train, english_test, marathi_test, english_val, marathi_val)



In [3]:
# File paths
# Single place to change the dataset location; each split is stored at
# <DATA_ROOT>/<lang>/<lang>_<split>.csv
DATA_ROOT = "/content/drive/MyDrive/DeepLearning/aksharantar_sampled"
lang = "mar"
train_file = f"{DATA_ROOT}/{lang}/{lang}_train.csv"
test_file = f"{DATA_ROOT}/{lang}/{lang}_test.csv"
val_file = f"{DATA_ROOT}/{lang}/{lang}_valid.csv"

# Load the three splits; each comes back as an (english, marathi) pair of Series.
english_train, marathi_train, english_test, marathi_test, english_val, marathi_val = read_and_split_data(train_file, test_file, val_file)

In [4]:
def create_char_list(words):
    """Return the sorted unique characters of ``words`` and the longest word length.

    Args:
        words: Re-iterable collection of strings (it is traversed twice).

    Returns:
        (alphabet, longest) where ``alphabet`` is a sorted list of every
        distinct character seen and ``longest`` is the maximum ``len(word)``.

    Raises:
        ValueError: If ``words`` is empty (``max`` of an empty sequence).
    """
    seen = set()
    for word in words:
        seen.update(word)
    alphabet = sorted(seen)
    longest = max(map(len, words))
    return alphabet, longest


def find_max_length(word_list):
    """Return the length of the longest word in ``word_list``.

    An empty ``word_list`` yields -1.
    """
    return max((len(word) for word in word_list), default=-1)

# Character inventories are built from the training words only; the training
# maximum word length is the starting value for each language.
english_chars, english_max_len = create_char_list(english_train)
marathi_chars, marathi_max_len = create_char_list(marathi_train)

# Extend each maximum so it also covers the validation and test words.
english_max_len = max(*map(find_max_length, (english_val, english_test)), english_max_len)
marathi_max_len = max(*map(find_max_length, (marathi_val, marathi_test)), marathi_max_len)

In [5]:
def word_to_vector(word, lang):
    """Encode ``word`` as a fixed-length list of integer character ids.

    Layout of the returned list (length ``max_len + 2``):
      * position 0 holds the special value ``len(chars) + 1``;
      * following positions hold ``index_in_chars + 1`` for each character of
        ``word`` that is present in the language's character list;
      * all remaining positions stay 0.

    Characters that are not in the character list are skipped and do not
    consume a position.

    Args:
        word: The string to encode.
        lang: ``"english"`` selects ``english_chars`` / ``english_max_len``;
            any other value selects ``marathi_chars`` / ``marathi_max_len``.

    Returns:
        A list of ints of length ``max_len + 2``.

    Raises:
        IndexError: If ``word`` has more known characters than the vector has
            free positions.
    """
    if lang == "english":
        chars, max_len = english_chars, english_max_len
    else:
        chars, max_len = marathi_chars, marathi_max_len

    # One dict lookup per character instead of scanning the whole alphabet.
    # Assumes ``chars`` has no duplicates (it is built from a sorted set).
    char_id = {ch: idx for idx, ch in enumerate(chars, start=1)}

    vector = [0] * (max_len + 2)  # max length + 2 (for special tokens)
    vector[0] = len(chars) + 1

    position = 1
    for ch in word:
        idx = char_id.get(ch)
        if idx is not None:
            vector[position] = idx
            position += 1

    return vector


In [6]:
# Spot-check: show the entry stored under key 13 of the Marathi training series.
print(marathi_train[13])

रुद्रांची


In [7]:
# Encode the entry under key 13 of marathi_train using the non-"english" (Marathi) tables.
vec = word_to_vector(marathi_train[13],"marathi")

In [8]:
# Display the encoding: slot 0 is len(marathi_chars) + 1, unused trailing slots remain 0.
vec

[64, 42, 54, 33, 63, 42, 51, 2, 21, 53, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [9]:
# creating matrix of representation
def word_matrix(words, language):
    """Encode every word with ``word_to_vector`` and stack the rows into a tensor.

    Args:
        words: Iterable of words to encode.
        language: Passed through unchanged as the ``lang`` argument of
            ``word_to_vector``.

    Returns:
        ``torch.tensor`` built from the list of per-word vectors, in input order.
    """
    rows = [word_to_vector(entry, language) for entry in words]
    return torch.tensor(rows)

In [10]:
def prepare_word_matrices(train_data, val_data, test_data, language):
    """Build the encoded matrix for each of the three splits of one language.

    Args:
        train_data: Words of the training split.
        val_data: Words of the validation split.
        test_data: Words of the test split.
        language: Forwarded to ``word_matrix`` for every split.

    Returns:
        Tuple ``(train_matrix, val_matrix, test_matrix)``.
    """
    splits = (train_data, val_data, test_data)
    return tuple(word_matrix(split, language) for split in splits)

In [11]:
# Encode train / validation / test words for both languages.
(
    english_matrix,
    english_matrix_val,
    english_matrix_test,
) = prepare_word_matrices(english_train, english_val, english_test, "english")
(
    marathi_matrix,
    marathi_matrix_val,
    marathi_matrix_test,
) = prepare_word_matrices(marathi_train, marathi_val, marathi_test, "marathi")

In [12]:
# Inspect the encoded English training tensor (one row per training word).
english_matrix

tensor([[27,  6, 21,  ...,  0,  0,  0],
        [27,  2,  8,  ...,  0,  0,  0],
        [27, 22,  8,  ...,  0,  0,  0],
        ...,
        [27,  1, 14,  ...,  0,  0,  0],
        [27,  7,  8,  ...,  0,  0,  0],
        [27,  1, 20,  ...,  0,  0,  0]])