In [1]:
 from google.colab import drive

In [17]:
# Mount Google Drive at /content/drive; the corpus paths used later in this notebook live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import re


### Function to clean up Hindi text ###
def fixup_hi(input_text):
  """Strip annotation markup and Latin-script noise from one line of Hindi text.

  The following are removed, in this order:
    1. text enclosed in ``{...}``
    2. text enclosed between a pair of ``#`` characters
    3. ASCII digits 0-9 (Devanagari digits are kept)
    4. runs of ASCII letters a-z / A-Z
    5. text enclosed in ``(...)``
    6. the characters ‘ ’ / and –
    7. the space in front of a comma, and every ``.``
    8. byte-order marks (U+FEFF)
  Whitespace is then collapsed to single spaces and a Hindi full stop
  (danda, "।") is appended.

  Args:
    input_text: the raw line as a ``str``.

  Returns:
    The cleaned sentence, always ending in "।". Input that cleans down to
    nothing yields "।" on its own.
  """
  remove_chars = "‘’/"
  # Hindi full stop (danda)
  hifs = "।"

  # remove text between {}
  text = re.sub(r"\{.*?\}", "", input_text)

  # remove text between ##
  text = re.sub(r"#.*?#", "", text)

  # remove all the ASCII digits in the text
  text = re.sub(r"[0-9]", "", text)

  # Remove all English (ASCII-letter) words in one pass. Matching the letter
  # class directly avoids building a regex out of text taken from the input,
  # which could raise re.error on metacharacters and could leave fragments
  # behind when one word was a prefix of another ("a|ab" turns "ab" into "b").
  # The range is A-Z, not A-z: the latter also covers [ \ ] ^ _ and `.
  text = re.sub(r"[a-zA-Z]+", "", text)

  # remove all the text between ()
  text = re.sub(r"\(.*?\)", "", text)

  # remove unwanted characters
  text = re.sub("[" + remove_chars + "]", "", text)

  # remove en dash
  text = text.replace("–", "")

  # remove the space before ','
  text = text.replace(" ,", ",")

  # remove '.' from the text
  text = text.replace(".", "")

  # Remove byte-order marks: str.split() does not treat U+FEFF as whitespace,
  # so without this a BOM would survive as a stray leading token.
  text = text.replace("\ufeff", "")

  # collapse runs of whitespace (including the trailing newline) to single spaces
  text = ' '.join(text.split())

  # add the Hindi full stop at the end of the text and return it
  return text + hifs


### Function to clean up English text ###
def fixup_en(input_text):
  """Clean one line of annotated English text.

  Drops ``_<digits>`` suffixes, the characters ‘ ’ and /, and anything in
  parentheses; turns remaining underscores into spaces; removes the space
  in front of ',' and '.'; and collapses all whitespace runs to one space.

  Args:
    input_text: the raw line as a ``str``.

  Returns:
    The cleaned line with no leading or trailing whitespace.
  """
  unwanted = "‘’/"

  # strip the numeric ids glued to words with an underscore
  text = re.sub(r'_\d+', '', input_text)

  # strip the unwanted punctuation characters
  text = re.sub("[" + unwanted + "]", "", text)

  # any underscore still present becomes a space
  text = text.replace("_", " ")

  # strip parenthesised text
  text = re.sub(r"\(.*?\)", "", text)

  # pull ',' and then '.' up against the preceding word
  for spaced, tight in ((" ,", ","), (" .", ".")):
    text = text.replace(spaced, tight)

  # squeeze whitespace runs down to single spaces
  return ' '.join(text.split())

In [None]:
# Sample annotated English line: words carry "_<digits>" suffixes and punctuation is space-separated.
sample_dirty_en = "A paragraph_16314305  from the author_110632698  book_16325082  on cancer_114051451 , published_21730386  in 1973 , deserves_22621127  repetition_11004630  here_3109779 ."

# Show what fixup_en produces for the sample.
print(fixup_en(sample_dirty_en))

A paragraph from the author book on cancer, published in 1973, deserves repetition here.


In [None]:
# Sample annotated Hindi line: tokens carry '#'-separated annotation fields and '{...}' groups.
sample_dirty_hi = "﻿#﻿#u#w1#0 .#.#u#w2#0 कैन्सर#कैन्सर#n#w3#6559 के#के#u#w4#0 बारे#बार#u#w5#0 में#में#u#w6#0 लेखक#लेखक#n#w7#7790 की#{कर,की,करना}#u#w8#0 एक#एक#u#w9#0 पुस्तक#पुस्तक#n#w10#4352 ,#,#u#w11#0 जो#जो#u#w12#0 १९७३#१९७३#u#w13#0 में#में#u#w14#0 प्रकाशित#प्रकाशित#a#w15#9497 हुई#{है,होना}#u#w16#0 ,#,#u#w17#0 से#{सेना,से}#u#w18#0 एक#एक#n#w19#2929 अनुच्छेद#अनुच्छेद#n#w20#1695 यहाँ#यहाँ#n#w21#27070 दुहराए#{दुहराना,दुहरा}#u#w22#0 जाने#{जा,जान,जाना,जानना}#u#w23#0 योग्य#योग्य#a#w24#10186 है#है#u#w25#0"

# Show what fixup_hi produces for the sample.
print(fixup_hi(sample_dirty_hi))

﻿ कैन्सर के बारे में लेखक की एक पुस्तक, जो १९७३ में प्रकाशित हुई, से एक अनुच्छेद यहाँ दुहराए जाने योग्य है।


In [None]:
def fixup_file(input_file, output_file, lang):
  """Clean every relevant line of a corpus file and write the result.

  Args:
    input_file: path of the UTF-8 text file to read.
    output_file: path of the UTF-8 text file to (over)write; one cleaned
      sentence per line.
    lang: "en" to clean every line with ``fixup_en``; "hi" to clean only the
      lines at odd 0-based indices (2nd, 4th, ...) with ``fixup_hi``.
      NOTE(review): presumably the even-indexed lines of the Hindi files hold
      non-sentence content -- confirm against the corpus format.

  Raises:
    ValueError: if ``lang`` is neither "en" nor "hi". Raised before any file
      is opened, so an unsupported language never truncates ``output_file``.
  """
  if lang not in ("en", "hi"):
    raise ValueError("unsupported lang %r; expected 'en' or 'hi'" % (lang,))

  with open(input_file, 'r', encoding='utf-8') as readfile, open(output_file, 'w', encoding='utf-8') as writefile:
    lines = readfile.readlines()

    if lang == "en":
      for line in lines:
        writefile.write(fixup_en(line) + "\n")
    else:
      # Hindi: keep every second line, starting with the second one.
      for line in lines[1::2]:
        writefile.write(fixup_hi(line) + "\n")


In [None]:
## Google Drive locations of the unprocessed health-domain corpus.
## Both values end with "/" because later cells build file paths by plain string concatenation.

health_domain_en_files = "/content/drive/MyDrive/health_domain_en_hi_unprocessed/ENG-HEALTH/Corpus/"
health_domain_hi_files = "/content/drive/MyDrive/health_domain_en_hi_unprocessed/HIN-HEALTH/"

In [None]:
# Smoke-test the English cleaner on a single corpus file.
test_en = health_domain_en_files + "0003_eng_health.txt"

# Relative path: the cleaned file is written to the current working directory.
test_en_output = "cleaned_english.txt"

fixup_file(test_en, test_en_output, "en")

In [None]:
# Smoke-test the Hindi cleaner on a single corpus file.
test_hi = health_domain_hi_files + "0003_hin_health.txt"

# Relative path: the cleaned file is written to the current working directory.
test_hi_output = "cleaned_hindi.txt"

fixup_file(test_hi, test_hi_output, "hi")

In [None]:
import os
import shutil

# Directory listings of the raw corpus, in sorted order.
en_files = sorted(os.listdir(health_domain_en_files))
hi_files = sorted(os.listdir(health_domain_hi_files))

# The part of each Hindi file name before the first "_" identifies the document.
hi_files_numbers = sorted(x.split('_')[0] for x in hi_files)

# Only English files that have a Hindi counterpart are cleaned.
required_en_files = [x + "_eng_health.txt" for x in hi_files_numbers]

# Fail before writing anything if an English counterpart is missing, instead
# of crashing half-way through with a partially populated output directory.
missing_en_files = sorted(set(required_en_files) - set(en_files))
if missing_en_files:
  raise FileNotFoundError(
      "English counterparts missing in %s: %s" % (health_domain_en_files, ", ".join(missing_en_files)))

clean_hindi_files = "/content/domain_data/cleaned_hi_files/"
clean_english_files = "/content/domain_data/cleaned_english_files/"

# Start from an empty output tree. The same absolute paths are used for
# creating the directories and for writing into them, so the cell no longer
# depends on the current working directory being /content.
shutil.rmtree("/content/domain_data", ignore_errors=True)
os.makedirs(clean_english_files)
os.makedirs(clean_hindi_files)


## cleaning up Hindi files
for hi_file in hi_files:
  fixup_file(os.path.join(health_domain_hi_files, hi_file), os.path.join(clean_hindi_files, hi_file), "hi")

## cleaning up English files
for en_file in required_en_files:
  fixup_file(os.path.join(health_domain_en_files, en_file), os.path.join(clean_english_files, en_file), "en")


In [18]:
import os
import shutil

clean_hindi_files = "/content/drive/MyDrive/health_domain_cleaned/cleanzed/content/domain_data/cleaned_hi_files/"
clean_english_files = "/content/drive/MyDrive/health_domain_cleaned/cleanzed/content/domain_data/cleaned_english_files/"

unaligned_en = "/content/drive/MyDrive/health_domain_cleaned/cleanzed/content/domain_data/unaligned_cleaned_en_files/"
unaligned_hi = "/content/drive/MyDrive/health_domain_cleaned/cleanzed/content/domain_data/unaligned_cleaned_hi_files/"

# Exported so that shell commands started from the notebook can refer to these directories.
os.environ["unaligned_en"] = unaligned_en
os.environ["unaligned_hi"] = unaligned_hi

os.makedirs(unaligned_en, exist_ok=True)
os.makedirs(unaligned_hi, exist_ok=True)

chf = sorted(os.listdir(clean_hindi_files))
cef = sorted(os.listdir(clean_english_files))

# Files are paired purely by position in the two sorted listings.
# NOTE(review): zip() stops at the shorter listing, so an extra file on either
# side is silently ignored and shifts every later pair -- confirm both
# directories always hold matching sets.
equal = []    # (hindi_name, english_name) pairs with the same number of lines
unequal = []  # (hindi_name, hindi_line_count, english_name, english_line_count)
for h, e in zip(chf, cef):
  with open(clean_hindi_files + h, 'r', encoding='utf-8') as hfile, open(clean_english_files + e, 'r', encoding='utf-8') as efile:
    h_line_count = len(hfile.readlines())
    e_line_count = len(efile.readlines())

  # Both files are closed by now, so moving them is safe on every platform.
  if h_line_count == e_line_count:
    equal.append((h, e))
  else:
    unequal.append((h, h_line_count, e, e_line_count))
    shutil.move(clean_hindi_files + h, unaligned_hi + h)
    shutil.move(clean_english_files + e, unaligned_en + e)

print(len(unequal), "files mismatch and moved to unaligned directory")


0 files mismatch and moved to unaligned directory


In [96]:
# Build the parallel corpus: one "<english>\t<hindi>" row per aligned sentence pair.
# The output is opened once, in write mode, so re-running this cell rebuilds
# the file instead of appending a second copy of every row.
with open('aligned_file_en_hi.tsv', 'w', encoding="utf-8-sig") as wfile:
  for hindi_name, english_name in equal:
    with open(clean_hindi_files + hindi_name, 'r', encoding="utf-8-sig") as hfile, open(clean_english_files + english_name, 'r', encoding="utf-8-sig") as efile:
      # Each pair in `equal` has the same number of lines, so zip() drops nothing.
      for linee, lineh in zip(efile, hfile):
        wfile.write(linee.strip() + "\t" + lineh.strip() + "\n")

In [97]:
# pandas is imported here because no earlier cell of this notebook does so.
import pandas as pd

# Load the aligned sentence pairs. The file has no header row; column 0 is the
# English sentence and column 1 the Hindi one. The encoding matches the one
# used when the file was written.
# NOTE(review): default CSV quoting is active, so a sentence containing a
# double quote may be parsed unexpectedly -- consider quoting=csv.QUOTE_NONE.
df_domain_en_hi = pd.read_csv('aligned_file_en_hi.tsv', sep = '\t',  header = None, names = ['src_sentence', 'tgt_sentence'], encoding = "utf-8-sig")

In [98]:
# Preview the first rows of the aligned sentence pairs.
df_domain_en_hi.head()

Unnamed: 0,src_sentence,tgt_sentence
0,"A paragraph from the author book on cancer, pu...","कैन्सर के बारे में लेखक की एक पुस्तक, जो १९७३ ..."
1,Doing nothing neither diagnosing nor treating ...,कुछ भी न करना जब तक कैन्सर के मरीज के मर्ज़ से ...
2,It is refusing to interfere backing the refusa...,यह अस्वीकृति को एक सुयोग्य आश्वासन या सोच - सम...
3,"Agreed that there is never nothing to be done,...","माना कि कभी भी कुछ न करने की नौबत नहीं आती, पर..."
4,It cannot be overemphasized that a doctor is a...,इस पर अधिक ज़ोर देने की आवश्यकता हो ही नहीं सक...


In [99]:
# Show the row count, per-column non-null counts and dtypes.
df_domain_en_hi.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5468 entries, 0 to 5467
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   src_sentence  5468 non-null   object
 1   tgt_sentence  5468 non-null   object
dtypes: object(2)
memory usage: 85.6+ KB


In [104]:
# Pull the two columns out as plain Python lists, one sentence per element.
en_lines = df_domain_en_hi['src_sentence'].to_list()
hi_lines = df_domain_en_hi['tgt_sentence'].to_list()

def list_to_file(input_list, file_name):
  """Write one line per list entry to ``file_name``.

  For every entry only the text before its first tab character is written,
  followed by a newline. The file is created or overwritten and is encoded
  as "utf-8-sig".

  Args:
    input_list: iterable of ``str`` entries.
    file_name: path of the file to write.
  """
  leading_fields = (entry.split("\t")[0] for entry in input_list)
  with open(file_name, 'w', encoding = "utf-8-sig") as sink:
    sink.writelines(field + "\n" for field in leading_fields)

# Write the English and Hindi sides to separate files; relative paths, so they land in the current working directory.
list_to_file(en_lines, "health_data_1.en")
list_to_file(hi_lines, "health_data_1.hi")