In [26]:
# Importing the Libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [41]:
# Load the original train / dev / test files: tab-separated, no header row.
# NOTE(review): on_bad_lines="skip" silently discards rows pandas cannot
# parse, so loaded row counts may be lower than the raw line counts — confirm.
train, validation, test = (
    pd.read_csv(split_path, delimiter="\t", header=None, on_bad_lines="skip")
    for split_path in ("odiencorp20-train", "odiencorp20-dev", "odiencorp20-test")
)

In [42]:
# Report (rows, columns) of each raw split, one per line.
print(
    f"Train Shape is: {train.shape}",
    f"Validation Shape is: {validation.shape}",
    f"Test Shape is: {test.shape}",
    sep="\n",
)

Train Shape is: (58170, 3)
Validation Shape is: (12098, 3)
Test Shape is: (12417, 3)


In [43]:
# Stack the three splits row-wise into one frame with a fresh 0..n-1 index,
# then label the three positional columns.
full_df = pd.concat(
    [train, validation, test], axis=0, ignore_index=True
).set_axis(["Source", "English", "Odia"], axis="columns")

In [44]:
# Preview the first rows of the combined frame to check the column labelling.
full_df.head()

Unnamed: 0,Source,English,Odia
0,odiencorp10,That they may keep thee from the strange woman...,ତବେେ ତାହା ତୁମ୍ଭକୁ ଅନ୍ୟ ପର ସ୍ତ୍ରୀଠାରୁ ରକ୍ଷା କରି...
1,odiencorp10,"And they shall dwell safely therein, and shall...",ସମାନେେ ତାହା ମଧିଅରେ ନିରାପଦ ରେ ବାସ କରିବେ। ସମାନେେ...
2,pmindia,The MoU aims to develop bilateral cooperation ...,"ଏହି ବୁଝାମଣାର ଉଦ୍ଦେଶ୍ୟ, ଦୁଗ୍ଧ ଉତ୍ପାଦନ ବିକାଶ ଏବଂ..."
3,dict,oxytocin,ଅକ୍ସିଟୋସିନ
4,books,But that night was special.,ଆଜି ରାତ୍ରିର କଥା କିନ୍ତୁ ସ୍ଵତନ୍ତ୍ର ।


In [45]:
# Preprocessing the Dataset
def preprocess(data, random_state=42):
    """Clean and shuffle the parallel corpus without modifying the input.

    Steps, in order: lower-case the ``English`` column, drop every row that
    has a missing value in any column (``Source`` included), remove the
    ``Source`` column, then shuffle the rows and renumber the index from 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``Source``, ``English`` and ``Odia`` columns. It is left
        untouched, so the cell calling this function can be re-run safely.
    random_state : int, default 42
        Seed forwarded to ``DataFrame.sample`` so the shuffle is reproducible.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only ``English`` and ``Odia``, shuffled, with a
        fresh ``RangeIndex``.

    Raises
    ------
    KeyError
        If ``data`` has no ``English`` or no ``Source`` column.
    """
    cleaned = (
        # assign() returns a copy, so the caller's frame is never written to.
        data.assign(English=data["English"].str.lower())
        .dropna()
        .drop(columns="Source")
    )
    # frac=1 samples every row, i.e. a full shuffle.
    return cleaned.sample(frac=1, random_state=random_state, ignore_index=True)

# Build the cleaned, shuffled English/Odia frame used by every later cell.
full_df_preprocess = preprocess(full_df)

In [46]:
# Check the column dtypes and non-null counts of the preprocessed frame.
full_df_preprocess.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82680 entries, 0 to 82679
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   English  82680 non-null  object
 1   Odia     82680 non-null  object
dtypes: object(2)
memory usage: 1.3+ MB


In [47]:
# Preview the first rows after lower-casing and shuffling.
full_df_preprocess.head()

Unnamed: 0,English,Odia
0,coral snake,ଶଙ୍ଖାମୁଠିଆ
1,"earlier, president mirziyoyev, leading a large...","ପୂର୍ବରୁ, ରାଷ୍ଟ୍ରପତି ମିର୍ଜିୟୋୟେଭ, ଏକ ବଡ଼ ଏବଂ ଉଚ୍..."
2,"and joseph said unto them, what deed is this t...","ଯୋଷଫେ ସମାନଙ୍କେୁ କହିଲେ, ""ତୁମ୍ଭେ କାହିଁକି ଏପରି କ..."
3,work is being carried out on this project keep...,ଏହି ପ୍ରକଳ୍ପ ଉପରେ ଏବେ କାର୍ଯ୍ୟ ଜାରି ରହିଛି ଏବଂ ଏଥ...
4,suddenly a stream of nectar flowed through my ...,ହଠାତ୍‌ ଅମ୍ପୃତର ଧାରାଟିଏ କର୍ଣ୍ଵପଟ ଦେଇ ପ୍ରବାହିତ ହ...


In [48]:
# Two-stage split into 85% train, 7.5% validation and 7.5% test:
# first hold out 15% of the rows, then cut that hold-out in half.
# random_state=42 is fixed in both calls so the split is reproducible.

train_df, test_val_df = train_test_split(full_df_preprocess, test_size=0.15, random_state=42)
val_df, test_df = train_test_split(test_val_df, test_size=0.5, random_state=42)

In [49]:
# Report (rows, columns) of each new split, one per line.
print(
    f"Train Shape is: {train_df.shape}",
    f"Validation Shape is: {val_df.shape}",
    f"Test Shape is: {test_df.shape}",
    sep="\n",
)

Train Shape is: (70278, 2)
Validation Shape is: (6201, 2)
Test Shape is: (6201, 2)


In [50]:
# Write each split to its own CSV file; index=False leaves the row index out.
for split_frame, out_path in (
    (train_df, "odiencorp20-train.csv"),
    (val_df, "odiencorp20-val.csv"),
    (test_df, "odiencorp20-test.csv"),
):
    split_frame.to_csv(out_path, index=False)