# Importing required libraries

In [97]:
import numpy as np
import pandas as pd
import os
import re

In [98]:
# Silence pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None  # default='warn'

# Read source file

In [99]:
def read_file(sourcePath: str):
  '''
  Load the CSV file at `sourcePath` (decoded as UTF-8) into a DataFrame.

  Prints a one-line confirmation and returns the loaded DataFrame.
  '''
  frame = pd.read_csv(sourcePath, encoding='utf-8')
  print(f"Read {sourcePath} and created a DataFrame")
  return frame

In [100]:
# all_uttrs = read_file('./21_sorted.csv')

In [101]:
# all_uttrs.head()

In [102]:
# all_uttrs.describe()

# Unique Utterances

In [103]:
def find_unique_utterances(df: pd.DataFrame):
  '''
  Return the rows of `df` with duplicate 'utterance' values removed.

  The first row of every distinct utterance is kept (pandas' default),
  and the number of remaining rows is printed.
  '''
  deduplicated = df.drop_duplicates(subset=['utterance'])
  remaining = deduplicated.shape[0]
  print(f"Found {remaining} unique utterances")
  return deduplicated

In [104]:
# uniqueAll_0 = find_unique_utterances(all_uttrs)

In [105]:
# uniqueAll_0.head()

In [106]:
# uniqueAll_0.describe()

# Unique single words

In [107]:
def find_unique_single_words(df: pd.DataFrame, delimiter=" "):
  '''
  Collect the distinct words that occur in the 'utterance' column of `df`.

  Each utterance is stripped and split on `delimiter`; every resulting
  token is stripped again and empty tokens are ignored.

  Returns a DataFrame with a single 'original' column holding the unique
  words in ascending order, indexed 0..n-1. When no word is found the
  result is an empty DataFrame that still has the 'original' column.
  '''
  # Missing utterances (NaN) cannot be split into words, so they are skipped.
  token_lists = df['utterance'].str.strip().str.split(delimiter).dropna()

  # A set keeps each word once; tokens that are blank after stripping are dropped.
  words = {
      token.strip()
      for tokens in token_lists
      for token in tokens
      if token.strip() != ''
  }

  # Building the frame from a named column keeps 'original' present even when
  # `words` is empty, so callers can always rely on that column.
  uniqueSingle = pd.DataFrame({'original': sorted(words)})
  print(f"Found {uniqueSingle.shape[0]} unique words")

  return uniqueSingle

In [108]:
# uniqueSingle_0 = find_unique_single_words(uniqueAll_0)
# uniqueSingle_0.head()

# Unique Pairs of words

In [109]:
def find_unique_pairs(df: pd.DataFrame, delimiter=" "):
  '''
  Collect the distinct pairs of adjacent words in the 'utterance' column.

  Each utterance is stripped and split on `delimiter`. Two neighbouring
  tokens form a pair only when both are non-empty after stripping.

  Returns a DataFrame with the columns 'original' (the two words joined by
  one space), 'first' and 'second', de-duplicated on 'original', sorted by
  'original' and indexed 0..n-1. When no pair exists the result is an empty
  DataFrame that still has these three columns.
  '''
  # Missing utterances (NaN) cannot be split into words, so they are skipped.
  token_lists = df['utterance'].str.strip().str.split(delimiter).dropna()

  allPairs = []
  for tokens in token_lists:
    cleaned = [token.strip() for token in tokens]
    # zip(seq, seq[1:]) walks over every pair of neighbouring tokens.
    for w1, w2 in zip(cleaned, cleaned[1:]):
      if w1 != '' and w2 != '':
        allPairs.append({'original': f"{w1} {w2}", 'first': w1, 'second': w2})

  # Explicit columns keep the schema stable when `allPairs` is empty.
  non_unique_pairs = pd.DataFrame(allPairs, columns=['original', 'first', 'second'])

  # Chaining the non-inplace variants avoids modifying a derived frame in
  # place, so the global chained-assignment option does not need to be touched.
  uniquePairs = (
      non_unique_pairs
      .drop_duplicates(subset=['original'])
      .sort_values(by=['original'])
      .reset_index(drop=True)
  )

  print(f"Found {uniquePairs.shape[0]} unique pairs of words")
  return uniquePairs

In [110]:
# uniquePairs_0 = find_unique_pairs(uniqueAll_0)

# uniquePairs_0.head()

In [111]:
# uniquePairs_0.describe()

# Correction-related functions

In [112]:
def extract_single(uniqueSingle: pd.DataFrame, substring: str,
                   starts_with=False):
  '''
  Select the words of `uniqueSingle` that end with `substring`, or start
  with it when `starts_with` is true.

  Returns a new DataFrame holding the matching rows plus a 'correction'
  column that starts out as a copy of 'original'; it is meant to be
  overwritten with the corrected words. `uniqueSingle` is left untouched.
  '''
  words = uniqueSingle['original']
  if starts_with:
    mask = words.str.startswith(substring)
  else:
    mask = words.str.endswith(substring)

  # An explicit copy makes the column assignment below safe without having
  # to switch the global chained-assignment option off and on again.
  df = uniqueSingle[mask].copy()
  df['correction'] = df['original']

  print(f"Found {df.shape[0]} words with {substring}")
  return df

def extract_pairs(uniquePairs: pd.DataFrame, first='', second=''):
  '''
  Select the word pairs whose 'first' and/or 'second' word matches.

  An empty string for `first` or `second` disables that filter; with both
  empty every pair is returned.

  Returns a new DataFrame holding the matching rows plus a 'correction'
  column that starts out as a copy of 'original'. `uniquePairs` is left
  untouched.
  '''
  df = uniquePairs
  if first != '':
    df = df[df['first'] == first]
  if second != '':
    df = df[df['second'] == second]

  # Copy after filtering so that adding the column never writes into a slice
  # of the caller's frame; no global pandas option has to be changed for it.
  df = df.copy()
  df['correction'] = df['original']

  print(f"Found {df.shape[0]} pairs with first={first} and second={second}")
  return df

def write_to_csv(filename: str, df: pd.DataFrame, directory='.'):
  '''
  Save `df` as "<directory>/<filename>.csv" in UTF-8, without the index column.
  '''
  target = f"{directory}/{filename}.csv"
  df.to_csv(target, index=False, encoding='utf-8')

def apply_padding(original: pd.DataFrame, columns=None):
  '''
  Utility function to apply padding.

  In every selected column an "X" is put in front of and behind the text,
  and each space inside the text is replaced with "X", so that every word
  is delimited by "X" on both sides (e.g. "a b" becomes "XaXbX").

  `columns` lists the columns to pad; when it is None or empty, all columns
  are padded. Returns a padded copy; `original` is not modified.
  '''
  oriCopy = original.copy()
  if not columns:
    columns = list(oriCopy.columns)
  for col in columns:
    padded = "X" + oriCopy[col] + "X"
    # regex=False: the space is a literal, independent of the pandas version's default.
    oriCopy[col] = padded.str.replace(" ", "X", regex=False)
  return oriCopy

def remove_padding(original: pd.DataFrame, columns=None):
  '''
  Reverse of the padding utility.

  In every selected column each "X" is replaced with a space and the
  result is stripped at both ends (e.g. "XaXbX" becomes "a b"). Note that
  every "X" is converted, including any that was part of the text itself.

  `columns` lists the columns to process; when it is None or empty, all
  columns are processed. Returns a new DataFrame; `original` is not modified.
  '''
  oriCopy = original.copy()
  if not columns:
    columns = list(oriCopy.columns)
  for col in columns:
    # regex=False: "X" is a literal, independent of the pandas version's default.
    unpadded = oriCopy[col].str.replace("X", " ", regex=False)
    oriCopy[col] = unpadded.str.strip()
  return oriCopy

def apply_corrections(original: pd.DataFrame, corrections: pd.DataFrame):
  '''
  Apply every row of `corrections` to the 'utterance' column and return
  the corrected utterances as a new DataFrame.

  Utterances and the 'original'/'correction' columns are padded with "X"
  first, so a correction only matches text delimited by "X" (i.e. whole
  words or whole word sequences). The padding is removed again before
  returning. Neither argument is modified.

  NOTE(review): the padded 'original' text is used as a regular expression
  (regex=True), so regex metacharacters in a correction file are interpreted
  as such - confirm that this is intended.
  '''
  padded_uttrs = apply_padding(original.copy(), ['utterance'])
  padded_fixes = apply_padding(corrections, ['original', 'correction'])

  # Corrections are applied one after another, in file order.
  for wrong, right in zip(padded_fixes['original'], padded_fixes['correction']):
    padded_uttrs['utterance'] = padded_uttrs['utterance'].str.replace(wrong, right, regex=True)
  print(f"Applied {padded_fixes.shape[0]} corrections in Stage {STAGE}.")

  return remove_padding(padded_uttrs, ['utterance'])

def split_single(original: str, substring: str, toLeft=False):
  '''
  Split one word into two words separated by a single space.

  The split position is derived from the length of `substring` only:
  with `toLeft` the first len(substring) characters are cut off as the
  first word, otherwise the last len(substring) characters are cut off as
  the second word. The function does not check that `original` actually
  starts or ends with `substring`.

  `original` is returned unchanged when it is not longer than `substring`
  or when `substring` is empty (there is nothing to split off).
  '''
  cut = len(substring)
  # cut == 0 must be handled separately: original[:-0] is the empty string,
  # which would otherwise yield the word with a spurious leading space.
  if cut == 0 or len(original) <= cut:
    return original

  if toLeft:
    w1, w2 = original[:cut], original[cut:]
  else:
    w1, w2 = original[:-cut], original[-cut:]
  return f"{w1} {w2}"

# Correction Workflow Definitions

In [113]:
# --- Mutable workflow state ---
STAGE = 0           # number of the current correction stage
INITIALIZED = True  # when truthy, correction_complete() advances STAGE after finishing

# --- Directories that hold the CSV files of each kind ---
allDirectory = 'All_Utterances'
toCorrectDirectory = 'To_Correct'
correctedDirectory = 'Corrected'
toDeleteDirectory = 'To_Delete'
doubtCorrectedDirectory = 'Doubt_Corrected'

# --- File-name prefixes; the stage number (or a name) and ".csv" are appended ---
allFilePrefix = 'all_'
toCorrectFilePrefix = 'toCorrect_'
correctedFilePrefix = 'corrected_'
toDeleteFilePrefix = 'toDelete_'
doubtCorrectedFilePrefix = 'doubtCorrected_'
modificationPrefix = 'common_modifications_'

In [114]:
def correction_init(substring='', onPairs=False, starts_with=False,
                    first='', second='', defaultSuffix=None, justChecking=False):
  '''
  Phase 1 of the correction workflow: extract the words/pairs to correct.

  Reads "<allDirectory>/<allFilePrefix><suffix>.csv", where the suffix is
  `defaultSuffix` when one is given (any value other than None, including
  0) and the current STAGE otherwise. From its unique utterances it then
  extracts either
    * single words ending with `substring` (starting with it when
      `starts_with` is true), or
    * with `onPairs`, word pairs filtered by `first` and/or `second`.

  Unless `justChecking` is true, STAGE is incremented, the candidates are
  written to the toCorrect file of the new stage, and the utterances that
  were read are returned. With `justChecking` nothing is written, STAGE is
  left alone and None is returned.
  '''
  global STAGE

  # "is None" rather than truthiness, so that suffix 0 can be requested explicitly.
  suffix = STAGE if defaultSuffix is None else defaultSuffix
  allUttrs = read_file(f"{allDirectory}/{allFilePrefix}{suffix}.csv")
  uniqueUttrs = find_unique_utterances(allUttrs)

  if onPairs:
    candidates = extract_pairs(find_unique_pairs(uniqueUttrs), first, second)
  else:
    candidates = extract_single(find_unique_single_words(uniqueUttrs), substring, starts_with)

  if justChecking:
    return None

  STAGE += 1
  write_to_csv(f"{toCorrectFilePrefix}{STAGE}", candidates, toCorrectDirectory)
  print(f"INITIALIZED STAGE {STAGE}")
  return allUttrs

def correction_delete_obvious(original: pd.DataFrame):
  '''
  A preprocessing function before applying corrections:
  drop the obviously incorrect utterances.

  Reads the toDelete file of the current STAGE and returns the rows of
  `original` whose 'utterance_id' is not listed in it. The printed count is
  the number of rows in the toDelete file, not the number of rows removed.
  '''
  toDelete = read_file(f"{toDeleteDirectory}/{toDeleteFilePrefix}{STAGE}.csv")
  listed_for_deletion = original['utterance_id'].isin(toDelete['utterance_id'])
  remaining = original[~listed_for_deletion]
  print(f"Deleted {toDelete.shape[0]} utterances in Stage {STAGE}")
  return remaining

def correction_doubt_correct(original: pd.DataFrame):
  '''
  A post-processing function after applying corrections:
  replace doubtful occurrences with unambiguous words.

  Reads the doubtCorrected file of the current STAGE. Each of its rows
  names the affected utterances in 'utterance_ids' (ids joined by "-") and
  the 'original' -> 'correction' replacement, which is applied to those
  utterances only. Matching is done on "X"-padded text with regex=True, in
  the same way as the regular corrections.

  Returns a corrected copy; `original` is not modified.
  '''
  doubtCorrections = read_file(f"{doubtCorrectedDirectory}/{doubtCorrectedFilePrefix}{STAGE}.csv")

  oriCopy = apply_padding(original.copy(), ['utterance'])
  doubtCorrections = apply_padding(doubtCorrections, ['original', 'correction'])

  for _, row in doubtCorrections.iterrows():
    ids = row['utterance_ids'].strip().split('-')
    # One masked update through .loc covers all listed ids at once and never
    # writes into a temporary slice of the frame.
    targeted = oriCopy['utterance_id'].isin(ids)
    oriCopy.loc[targeted, 'utterance'] = oriCopy.loc[targeted, 'utterance'].str.replace(
        row['original'], row['correction'], regex=True)

  oriCopy = remove_padding(oriCopy, ['utterance'])
  print(f"Applied {doubtCorrections.shape[0]} Doubt Corrections in Stage {STAGE}.")
  return oriCopy

def correction_complete(original: pd.DataFrame):
  '''
  Phase 2 of the correction workflow: apply the corrections of the
  current STAGE to `original`.

  Steps:
    1. Read the corrected file of the stage; rows whose 'correction'
       equals 'original' are ignored.
    2. If a toDelete file exists for the stage, drop the listed utterances.
    3. Apply the corrections.
    4. If a doubtCorrected file exists for the stage, apply it as well.
    5. Write the result to "<allDirectory>/<allFilePrefix><STAGE>.csv".
    6. If INITIALIZED is truthy, advance STAGE by one.

  Returns the corrected DataFrame, or None (after printing an error) when
  the corrected file of the stage does not exist. Any other failure, e.g.
  while writing the result, is raised to the caller.
  '''
  global STAGE

  # Only the read of the correction file is guarded, so that a failure in a
  # later step is not misreported as a missing correction file.
  try:
    allCorrections = read_file(f"{correctedDirectory}/{correctedFilePrefix}{STAGE}.csv")
  except FileNotFoundError:
    print(f"ERROR: Correction file for Stage {STAGE} is unavailable!")
    return None

  corrections = allCorrections[allCorrections['original'] != allCorrections['correction']]
  done = original

  # Preprocessing: toDelete
  if os.path.exists(f"{toDeleteDirectory}/{toDeleteFilePrefix}{STAGE}.csv"):
    done = correction_delete_obvious(done)

  done = apply_corrections(done, corrections)

  # Post-processing: doubtCorrected
  if os.path.exists(f"{doubtCorrectedDirectory}/{doubtCorrectedFilePrefix}{STAGE}.csv"):
    done = correction_doubt_correct(done)

  write_to_csv(f"{allFilePrefix}{STAGE}", done, allDirectory)

  print(f"STAGE {STAGE} COMPLETE! Utterance Count={done.shape[0]}")
  if INITIALIZED:
    STAGE += 1
  return done

def correction_define_single(substring: str, toLeft=False):
  '''
  Intermediate phase of the correction workflow:
  programmatically define the corrected forms of single words.

  Reads the toCorrect file of the current STAGE, sets 'correction' to the
  'original' word split by split_single(original, substring, toLeft),
  writes the result to the corrected file of the stage and returns it.
  '''
  corrections = read_file(f"{toCorrectDirectory}/{toCorrectFilePrefix}{STAGE}.csv")

  # Mapping over the one column always yields a Series, also for an empty
  # file, and the frame is freshly read, so no pandas option needs changing.
  corrections['correction'] = corrections['original'].map(
      lambda word: split_single(word, substring, toLeft))

  write_to_csv(f"{correctedFilePrefix}{STAGE}", corrections, correctedDirectory)
  print(f"Defined corrections for {substring} at Stage {STAGE}")

  return corrections

def correction_define_pairs():
  '''
  Intermediate phase of the correction workflow:
  programmatically define the corrected forms of word pairs.

  Reads the toCorrect file of the current STAGE, sets 'correction' to the
  'first' and 'second' words joined without a space, writes the result to
  the corrected file of the stage and returns it.
  '''
  corrections = read_file(f"{toCorrectDirectory}/{toCorrectFilePrefix}{STAGE}.csv")

  # The frame is freshly read (not a slice), so this assignment cannot be a
  # chained assignment and the global pandas option is left as the user set it.
  corrections['correction'] = corrections['first'] + corrections['second']

  write_to_csv(f"{correctedFilePrefix}{STAGE}", corrections, correctedDirectory)
  print(f"Defined corrections for word pairs at Stage {STAGE}")

  return corrections

def correction_inspect_toCorrect():
  '''
  Intermediate phase of the correction workflow:
  print the first rows of the toCorrect CSV of the current STAGE.
  '''
  pending_path = f"{toCorrectDirectory}/{toCorrectFilePrefix}{STAGE}.csv"
  print(read_file(pending_path).head())

def correction_inspect_corrected():
  '''
  Intermediate phase of the correction workflow:
  print the first rows of the corrected CSV of the current STAGE.
  '''
  corrected_path = f"{correctedDirectory}/{correctedFilePrefix}{STAGE}.csv"
  print(read_file(corrected_path).head())

def force_change_stage(stage):
  '''
  Overwrite the global STAGE counter with `stage`, bypassing the normal
  workflow that advances it.
  '''
  global STAGE
  STAGE = stage

def correction_apply_common_modifications(name: str, original: pd.DataFrame):
  '''
  One-time correction: apply the common modifications stored in
  "<modificationPrefix><name>.csv" to `original`.

  The file is applied with apply_corrections(), i.e. as a list of
  'original' -> 'correction' replacements. Returns the modified utterances.
  '''
  comMod = read_file(f"{modificationPrefix}{name}.csv")
  result = apply_corrections(original, comMod)
  print(f"Applied {comMod.shape[0]} common modifications.")
  return result

# Actual Correction Workflows

**Online Unicode Inspector:** https://apps.timwhitlock.info/unicode/inspect

# **1. Single - Ending with ආ >>> Separate**

In [None]:
s_1 = '\u0D86'
all_0 = correction_init(s_1)

Read All_Utterances/all_0.csv and created a DataFrame
Found 98432 unique utterances
Found 63374 unique words
Found 1 words with ආ
INITIALIZED STAGE 1


In [None]:
corrections_1 = correction_define_single(s_1)

Read To_Correct/toCorrect_1.csv and created a DataFrame
Defined corrections for ආ at Stage 1


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_1.csv and created a DataFrame
  original correction
0        ආ          ආ


In [None]:
correction_inspect_corrected()

Read Corrected/corrected_1.csv and created a DataFrame
  original correction
0        ආ          ආ


In [None]:
all_1 = correction_complete(all_0)

Read Corrected/corrected_1.csv and created a DataFrame
STAGE 1 COMPLETE!


# **2. Pairs - Isolated අ as first >>> Join**

In [None]:
s_2 = '\u0D85'
all_1 = correction_init(onPairs=True, first=s_2)

Read All_Utterances/all_1.csv and created a DataFrame
Found 98432 unique utterances
Found 260685 unique pairs of words
Found 0 pairs with first=අ and second=
INITIALIZED STAGE 2


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_2.csv and created a DataFrame
Empty DataFrame
Columns: [original, first, second, correction]
Index: []


In [None]:
all_2 = correction_complete(all_1)

Read Corrected/corrected_2.csv and created a DataFrame
STAGE 2 COMPLETE!


# **3. Pairs - Isolated ඉ as first >>> Join**

In [None]:
s_3 = '\u0D89'
all_2 = correction_init(onPairs=True, first=s_3)

Read All_Utterances/all_2.csv and created a DataFrame
Found 98432 unique utterances
Found 260685 unique pairs of words
Found 0 pairs with first=ඉ and second=
INITIALIZED STAGE 3


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_3.csv and created a DataFrame
Empty DataFrame
Columns: [original, first, second, correction]
Index: []


In [None]:
correction_inspect_corrected()

Read Corrected/corrected_3.csv and created a DataFrame
Empty DataFrame
Columns: [original, first, second, correction]
Index: []


In [None]:
all_3 = correction_complete(all_2)

Read Corrected/corrected_3.csv and created a DataFrame
STAGE 3 COMPLETE!


# **4. Pairs - Isolated එ as first >>> Join**

In [None]:
s_4 = '\u0D91'
all_3 = correction_init(onPairs=True, first=s_4)

Read All_Utterances/all_3.csv and created a DataFrame
Found 98432 unique utterances
Found 260685 unique pairs of words
Found 0 pairs with first=එ and second=
INITIALIZED STAGE 4


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_4.csv and created a DataFrame
Empty DataFrame
Columns: [original, first, second, correction]
Index: []


In [None]:
all_4 = correction_complete(all_3)

Read Corrected/corrected_4.csv and created a DataFrame
STAGE 4 COMPLETE!


**Important:**
After `all_4.csv` was created, it was manually edited to make the following changes:


1.   Modified 5f6bc35485 after listening (උපරිමයි => උපරිමයයි)
2.   Removed ae3b50e2fa (එ වගෙ), 5590e25b5a (එ හමුදුරුවො) after listening

The modified `all_4.csv` has been (or if restarted, needs to be) uploaded.



In [None]:
all_4 = read_file('./All_Utterances/all_4.csv')

Read ./All_Utterances/all_4.csv and created a DataFrame


In [None]:
all_4.describe()

Unnamed: 0,utterance_id,speaker_id,utterance,gender
count,178407,178407,178407,178407
unique,178407,478,98432,2
top,5eba99b978,0b586,ජය වේවා,f
freq,1,798,16,96723


# **5. Single - Starting with ඒ >>> Manual**

In [None]:
s_5 = '\u0D92'
all_4 = correction_init(substring=s_5, starts_with=True)

Read All_Utterances/all_4.csv and created a DataFrame
Found 98432 unique utterances
Found 63374 unique words
Found 222 words with ඒ
INITIALIZED STAGE 5


**Important:** When analyzing the corrections for ඒ, many obviously wrong words were found, so the utterances containing them had to be removed. Therefore, upload `all_4_edited.csv` and read it into the `all_4` object.

In [None]:
all_4 = read_file('./All_Utterances/all_4_edited.csv')

Read ./All_Utterances/all_4_edited.csv and created a DataFrame


In [None]:
all_4.describe()

Unnamed: 0,utterance_id,speaker_id,utterance,gender
count,178383,178383,178383,178383
unique,178383,478,98420,2
top,5eba99b978,0b586,ජය වේවා,f
freq,1,798,16,96711


In [None]:
all_5 = correction_complete(all_4)

Read Corrected/corrected_5.csv and created a DataFrame
STAGE 5 COMPLETE!


# **6. Pairs - Isolated ක as second >>> Join**

In [None]:
s_6 = '\u0D9A'
all_5 = correction_init(onPairs=True, second=s_6)

Read All_Utterances/all_5.csv and created a DataFrame
Found 98416 unique utterances
Found 260624 unique pairs of words
Found 15 pairs with first= and second=ක
INITIALIZED STAGE 6


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_6.csv and created a DataFrame
   original   first second correction
0   අක්කර ක   අක්කර      ක    අක්කර ක
1      එය ක      එය      ක       එය ක
2    කෝටි ක    කෝටි      ක     කෝටි ක
3  ගවයින් ක  ගවයින්      ක   ගවයින් ක
4     පැය ක     පැය      ක      පැය ක


**Important:** When analyzing the corrections for ක, many obviously wrong words were found, so the utterances containing them had to be removed. Therefore, upload `all_5_edited.csv` and read it into the `all_5` object.

In [None]:
all_5 = read_file('./All_Utterances/all_5_edited.csv')

Read ./All_Utterances/all_5_edited.csv and created a DataFrame


In [None]:
all_6 = correction_complete(all_5)

Read Corrected/corrected_6.csv and created a DataFrame
STAGE 6 COMPLETE!


# **7. Pairs - Isolated ක් as second >>> Join**

In [None]:
s_7 = '\u0D9A\u0DCA'
all_6 = correction_init(onPairs=True, second=s_7)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 33 pairs with first= and second=ක්
INITIALIZED STAGE 7


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_7.csv and created a DataFrame
      original     first second   correction
0       400 ක්       400     ක්       400 ක්
1       අඩි ක්       අඩි     ක්       අඩි ක්
2  අඩිඅඟුල් ක්  අඩිඅඟුල්     ක්  අඩිඅඟුල් ක්
3   අවුරුදු ක්   අවුරුදු     ක්   අවුරුදු ක්
4   ඉක්මනටම ක්   ඉක්මනටම     ක්   ඉක්මනටම ක්


# **8. Pairs - Isolated කින් as second >>> Join**

In [None]:
s_8 = 'කින්'
all_7 = correction_init(onPairs=True, second=s_8, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 5 pairs with first= and second=කින්
INITIALIZED STAGE 8


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_8.csv and created a DataFrame
          original       first second       correction
0         අංග කින්         අංග   කින්         අංග කින්
1  අමාත්‍යාංශ කින්  අමාත්‍යාංශ   කින්  අමාත්‍යාංශ කින්
2         ඛිං කින්         ඛිං   කින්         ඛිං කින්
3        තනුව කින්        තනුව   කින්        තනුව කින්
4      දිනුම් කින්      දිනුම්   කින්      දිනුම් කින්


# **9. Pairs - Isolated කි as second >>> Join**


In [None]:
s_9 = 'කි'
all_8 = correction_init(onPairs=True, second=s_9, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 7 pairs with first= and second=කි
INITIALIZED STAGE 9


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_9.csv and created a DataFrame
       original      first second    correction
0      ආහාරය කි      ආහාරය     කි      ආහාරය කි
1        ඒකා කි        ඒකා     කි        ඒකා කි
2  ට්‍රිලියන කි  ට්‍රිලියන     කි  ට්‍රිලියන කි
3      ඩොලර් කි      ඩොලර්     කි      ඩොලර් කි
4       බැන් කි       බැන්     කි       බැන් කි


# **10. Pairs - Isolated කට as second >>> Join by meaning**

In [None]:
s_10 = 'කට'
all_9 = correction_init(onPairs=True, second=s_10, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 55 pairs with first= and second=කට
INITIALIZED STAGE 10


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_10.csv and created a DataFrame
      original     first second   correction
0       අංග කට       අංග     කට       අංග කට
1  අඩවියේදි කට  අඩවියේදි     කට  අඩවියේදි කට
2       අපි කට       අපි     කට       අපි කට
3   අවුරුදු කට   අවුරුදු     කට   අවුරුදු කට
4    ඇරෙන්න කට    ඇරෙන්න     කට    ඇරෙන්න කට


# **11. Pairs - Isolated ගේ as second >>> Join by meaning**

In [None]:
s_11 = 'ගේ'
all_10 = correction_init(onPairs=True, second=s_11, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 186 pairs with first= and second=ගේ
INITIALIZED STAGE 11


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_11.csv and created a DataFrame
            original           first second         correction
0           අංකල් ගේ           අංකල්     ගේ           අංකල් ගේ
1        අදිකාරම් ගේ        අදිකාරම්     ගේ        අදිකාරම් ගේ
2  අධ්‍යක්ෂකවරුන් ගේ  අධ්‍යක්ෂකවරුන්     ගේ  අධ්‍යක්ෂකවරුන් ගේ
3    අනුගාමිකයින් ගේ    අනුගාමිකයින්     ගේ    අනුගාමිකයින් ගේ
4           අනුන් ගේ           අනුන්     ගේ           අනුන් ගේ


# **12. Pairs - Isolated ගෙන් as second >>> Join by meaning**

In [None]:
s_12 = 'ගෙන්'
all_11 = correction_init(onPairs=True, second=s_12, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 30 pairs with first= and second=ගෙන්
INITIALIZED STAGE 12


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_12.csv and created a DataFrame
         original      first second      correction
0  උන්වහන්සේ ගෙන්  උන්වහන්සේ   ගෙන්  උන්වහන්සේ ගෙන්
1      එකෙක් ගෙන්      එකෙක්   ගෙන්      එකෙක් ගෙන්
2         එම ගෙන්         එම   ගෙන්         එම ගෙන්
3          ඒ ගෙන්          ඒ   ගෙන්          ඒ ගෙන්
4        කකා ගෙන්        කකා   ගෙන්        කකා ගෙන්


# **13. Pairs - Isolated ට as second >>> Join**

In [None]:
s_13 = 'ට'
all_12 = correction_init(onPairs=True, second=s_13, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 65 pairs with first= and second=ට
INITIALIZED STAGE 13


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_13.csv and created a DataFrame
    original    first second correction
0      අභී ට      අභී      ට      අභී ට
1  අයිතීන් ට  අයිතීන්      ට  අයිතීන් ට
2    අයියා ට    අයියා      ට    අයියා ට
3  අරවින්ද ට  අරවින්ද      ට  අරවින්ද ට
4  අවුරුදු ට  අවුරුදු      ට  අවුරුදු ට


# **14. Pairs - Isolated ත් as second >>> Join**

In [None]:
s_14 = 'ත්'
all_13 = correction_init(onPairs=True, second=s_14, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 11 pairs with first= and second=ත්
INITIALIZED STAGE 14


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_14.csv and created a DataFrame
   original  first second correction
0   අපිව ත්   අපිව     ත්    අපිව ත්
1  ඉතුරු ත්  ඉතුරු     ත්   ඉතුරු ත්
2  කතාවෙ ත්  කතාවෙ     ත්   කතාවෙ ත්
3  ගියාම ත්  ගියාම     ත්   ගියාම ත්
4     දී ත්     දී     ත්      දී ත්


# **15. Single - Ending with දී >>> Separate by meaning**

use the already corrected file

# **16. Single - Starting with නා >>> Separate by meaning**

In [None]:
s_16 = 'නා'
# Stage 15 reuses an already corrected file, so correction_init() was not run for it.
# Set STAGE to 15 by hand so that the init below increments it to 16.
force_change_stage(15)
all_15 = correction_init(substring=s_16, starts_with=True, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 252 words with නා
INITIALIZED STAGE 16


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_16.csv and created a DataFrame
    original correction
0         නා         නා
1  නාකයත්වයේ  නාකයත්වයේ
2     නාකයන්     නාකයන්
3      නාකයි      නාකයි
4       නාකි       නාකි


# **17. Pairs - Isolated නා as first >>> Join by meaning**

In [None]:
s_17 = 'නා'
all_16 = correction_init(onPairs=True, first=s_17, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 24 pairs with first=නා and second=
INITIALIZED STAGE 17


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_17.csv and created a DataFrame
  original first second correction
0  නා ගත්ත    නා   ගත්ත    නා ගත්ත
1    නා ගන    නා     ගන      නා ගන
2   නා ගස්    නා    ගස්     නා ගස්
3   නා දලු    නා    දලු     නා දලු
4    නා නා    නා     නා      නා නා


# **18. Pairs - Isolated නි as first >>> Join**

In [None]:
s_18 = 'නි'
all_17 = correction_init(onPairs=True, first=s_18, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 0 pairs with first=නි and second=
INITIALIZED STAGE 18


**NO CORRECTIONS NEEDED**

# **19. Pairs - Isolated නු as first >>> Join**

In [None]:
s_19 = 'නු'
# Pin STAGE to 18 so that correction_init() below registers this run as Stage 19.
# NOTE(review): this call filters on first=s_19 (matching the heading), but the recorded
# output of this cell reports the letter under `second=` - confirm which side is intended.
force_change_stage(18)
all_18 = correction_init(onPairs=True, first=s_19, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 0 pairs with first= and second=නු
INITIALIZED STAGE 19


**NO CORRECTIONS NEEDED**

# **20. Single - Ending with නේ >>> Separate by meaning**

In [None]:
s_20 = 'නේ'
all_19 = correction_init(substring=s_20, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 915 words with නේ
INITIALIZED STAGE 20


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_20.csv and created a DataFrame
     original  correction
0        අගනේ        අගනේ
1     අගයන්නේ     අගයන්නේ
2  අග්‍රවන්නේ  අග්‍රවන්නේ
3     අඩුගනනේ     අඩුගනනේ
4     අඩුගානේ     අඩුගානේ


# **21. Pairs - Isolated නේ as second >>> Join by meaning**

In [None]:
s_21 = 'නේ'
all_20 = correction_init(onPairs=True, second=s_21, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 75 pairs with first= and second=නේ
INITIALIZED STAGE 21


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_21.csv and created a DataFrame
        original       first second     correction
0       අඩවිය නේ       අඩවිය     නේ       අඩවිය නේ
1       අඩුයි නේ       අඩුයි     නේ       අඩුයි නේ
2  අසාධාරණයක් නේ  අසාධාරණයක්     නේ  අසාධාරණයක් නේ
3         ඇති නේ         ඇති     නේ         ඇති නේ
4       ඇහැකි නේ       ඇහැකි     නේ       ඇහැකි නේ


# **22. Pairs - Isolated ප්‍ර as first >>> Join**

In [None]:
s_22 = 'ප්‍ර'
force_change_stage(21)
all_22 = correction_init(onPairs=True, first=s_22, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 0 pairs with first=ප්‍ර and second=
INITIALIZED STAGE 22


**NO CORRECTIONS NEEDED**

# Attention:

**After 22, the method parameter `justChecking` in `correction_init()` function shall be used to avoid unnecessarily incrementing the `STAGE` and writing `toCorrect` files.**

# **23. Pairs - Isolated ප්‍රති as first >>> Join**

In [None]:
s_23 = 'ප්‍රති'
all_22 = correction_init(onPairs=True, first=s_23, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 6 pairs with first=ප්‍රති and second=
INITIALIZED STAGE 23


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_23.csv and created a DataFrame
             original   first       second          correction
0     ප්‍රති නිර්මාණය  ප්‍රති     නිර්මාණය     ප්‍රති නිර්මාණය
1  ප්‍රති නිර්මාණයක්ද  ප්‍රති  නිර්මාණයක්ද  ප්‍රති නිර්මාණයක්ද
2      ප්‍රති ප්‍රහාර  ප්‍රති      ප්‍රහාර      ප්‍රති ප්‍රහාර
3     ප්‍රති ප්‍රහාරය  ප්‍රති     ප්‍රහාරය     ප්‍රති ප්‍රහාරය
4      ප්‍රති විරුද්ධ  ප්‍රති      විරුද්ධ      ප්‍රති විරුද්ධ


# **24. Pairs - Isolated පිළි as first >>> Join**

In [None]:
s_24 = 'පිළි'
all_23 = correction_init(onPairs=True, first=s_24, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 8 pairs with first=පිළි and second=
INITIALIZED STAGE 24


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_24.csv and created a DataFrame
     original first second  correction
0    පිළි අරං  පිළි    අරං    පිළි අරං
1  පිළි ගත්තා  පිළි  ගත්තා  පිළි ගත්තා
2  පිළි ගන්නෙ  පිළි  ගන්නෙ  පිළි ගන්නෙ
3  පිළි ගන්නේ  පිළි  ගන්නේ  පිළි ගන්නේ
4  පිළි ගන්නෝ  පිළි  ගන්නෝ  පිළි ගන්නෝ


# **25. Pairs - Isolated පිරි as first >>> Join by meaning**

In [None]:
s_25 = 'පිරි'
all_24 = correction_init(onPairs=True, first=s_25, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 16 pairs with first=පිරි and second=
INITIALIZED STAGE 25


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_25.csv and created a DataFrame
         original first     second      correction
0  පිරි ඉතිහාසයක්  පිරි  ඉතිහාසයක්  පිරි ඉතිහාසයක්
1      පිරි උතුමෝ  පිරි      උතුමෝ      පිරි උතුමෝ
2        පිරි කතා  පිරි        කතා        පිරි කතා
3         පිරි කර  පිරි         කර         පිරි කර
4       පිරි කුඩා  පිරි       කුඩා       පිරි කුඩා


# **26. Pairs - Isolated පරි as first >>> Join**

In [None]:
s_26 = 'පරි'
all_25 = correction_init(onPairs=True, first=s_26, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 1 pairs with first=පරි and second=
INITIALIZED STAGE 26


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_26.csv and created a DataFrame
    original first second correction
0  පරි පීඩිත   පරි  පීඩිත  පරි පීඩිත


# **27. Pairs - Isolated මි as second >>> Join**

In [None]:
s_27 = 'මි'
all_26 = correction_init(onPairs=True, second=s_27, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 2 pairs with first= and second=මි
INITIALIZED STAGE 27


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_27.csv and created a DataFrame
        original       first second     correction
0  පුරාණෝක්ති මි  පුරාණෝක්ති     මි  පුරාණෝක්ති මි
1       හරකයි මි       හරකයි     මි       හරකයි මි


# **28. Pairs - Isolated මී as first >>> Join by meaning**

In [None]:
s_28 = 'මී'
all_27 = correction_init(onPairs=True, first=s_28, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 20 pairs with first=මී and second=
INITIALIZED STAGE 28


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_28.csv and created a DataFrame
    original first  second correction
0  මී කඩන්නට    මී  කඩන්නට  මී කඩන්නට
1  මී කැටකාල    මී  කැටකාල  මී කැටකාල
2     මී කුණ    මී     කුණ     මී කුණ
3   මී කුණක්    මී   කුණක්   මී කුණක්
4   මී ගවයන්    මී   ගවයන්   මී ගවයන්


# **29. Single - Starting with මී >>> Separate by meaning**

In [None]:
s_29 = 'මී'
all_28 = correction_init(substring=s_29, starts_with=True, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 48 words with මී
INITIALIZED STAGE 29


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_29.csv and created a DataFrame
    original correction
0         මී         මී
1  මීකිරිවලට  මීකිරිවලට
2      මීගමු      මීගමු
3     මීගමුව     මීගමුව
4    මීගමුවේ    මීගමුවේ


# **30. Pairs - Isolated මෙ as first >>> Join**

In [None]:
s_30 = 'මෙ'
all_29 = correction_init(onPairs=True, first=s_30, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 25 pairs with first=මෙ and second=
INITIALIZED STAGE 30


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_30.csv and created a DataFrame
    original first  second correction
0    මෙ ආයතන    මෙ    ආයතන    මෙ ආයතන
1     මෙ කතා    මෙ     කතා     මෙ කතා
2  මෙ කරත්තෙ    මෙ  කරත්තෙ  මෙ කරත්තෙ
3    මෙ කෑලේ    මෙ    කෑලේ    මෙ කෑලේ
4      මෙ කී    මෙ      කී      මෙ කී


# **31. Single - Ending with ය >>> Join by meaning**

In [None]:
s_31 = 'ය'
all_30 = correction_init(substring=s_31, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 4161 words with ය
INITIALIZED STAGE 31


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_31.csv and created a DataFrame
  original correction
0     අංකය       අංකය
1   අංකුරය     අංකුරය
2  අංකෙළිය    අංකෙළිය
3     අංගය       අංගය
4     අංශය       අංශය


# **32. Single - Starting with යා >>> Separate by meaning**

In [None]:
s_32 = 'යා'
all_31 = correction_init(substring=s_32, starts_with=True, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 148 words with යා
INITIALIZED STAGE 32


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_32.csv and created a DataFrame
  original correction
0       යා         යා
1    යාකරන      යාකරන
2   යාකලාම     යාකලාම
3      යාග        යාග
4     යාගත       යාගත


# **33. Single - Ending with ලු >>> Separate by meaning**

In [None]:
s_33 = 'ලු'
all_32 = correction_init(substring=s_33, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 183 words with ලු
INITIALIZED STAGE 33


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_33.csv and created a DataFrame
    original correction
0   අඟවන්නලු   අඟවන්නලු
1  අඩුවෙලාලු  අඩුවෙලාලු
2       අයලු       අයලු
3    අයෙක්ලු    අයෙක්ලු
4        අලු        අලු


# **34. Single - Ending with ලදි >>> Separate by meaning**

In [None]:
s_34 = 'ලදි'
all_33 = correction_init(substring=s_34, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 7 words with ලදි
INITIALIZED STAGE 34


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_34.csv and created a DataFrame
      original   correction
0   අවස්ථාවලදි   අවස්ථාවලදි
1     තැන්වලදි     තැන්වලදි
2        තුලදි        තුලදි
3  ප්‍රශ්නවලදි  ප්‍රශ්නවලදි
4        මුලදි        මුලදි


# **35. Single - Ending with ලදී >>> Separate by meaning**

In [None]:
s_35 = 'ලදී'
all_34 = correction_init(substring=s_35, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 58 words with ලදී
INITIALIZED STAGE 35


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_35.csv and created a DataFrame
        original     correction
0       අවධිවලදී       අවධිවලදී
1  අවස්ථාවන්වලදී  අවස්ථාවන්වලදී
2     අවස්ථාවලදී     අවස්ථාවලදී
3          අසලදී          අසලදී
4     ආත්මභවවලදී     ආත්මභවවලදී


# **36. Single - Ending with ලද >>> Separate by meaning**

In [None]:
s_36 = 'ලද'
all_35 = correction_init(substring=s_36, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 49 words with ලද
INITIALIZED STAGE 36


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_36.csv and created a DataFrame
  original correction
0    අදාලද      අදාලද
1     අසලද       අසලද
2  ආහාරවලද    ආහාරවලද
3     ඉදලද       ඉදලද
4  උගන්නලද    උගන්නලද


# **37. Single - Ending with වේවා >>> Separate by meaning**

In [None]:
s_37 = 'වේවා'
all_36 = correction_init(substring=s_37, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 13 words with වේවා
INITIALIZED STAGE 37


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_37.csv and created a DataFrame
      original   correction
0  ඕනියෝජයවේවා  ඕනියෝජයවේවා
1      කියවේවා      කියවේවා
2      ජයමවේවා      ජයමවේවා
3       ජයවේවා       ජයවේවා
4   දියුණුවේවා   දියුණුවේවා


use already corrected file

# **38. Pairs - Isolated ස as first >>> Join**

In [None]:
s_38 = 'ස'
all_37 = correction_init(onPairs=True, first=s_38, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 2 pairs with first=ස and second=
INITIALIZED STAGE 38


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_38.csv and created a DataFrame
  original first second correction
0   ස අකුර     ස   අකුර     ස අකුර
1   ස වැනි     ස   වැනි     ස වැනි


# **39. Pairs - Isolated සහ as first >>> Join by meaning**

In [None]:
s_39 = 'සහ'
all_38 = correction_init(onPairs=True, first=s_39, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 524 pairs with first=සහ and second=
INITIALIZED STAGE 39


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_39.csv and created a DataFrame
         original first       second      correction
0          සහ අංක    සහ          අංක          සහ අංක
1  සහ අගමැතිතුමාට    සහ  අගමැතිතුමාට  සහ අගමැතිතුමාට
2      සහ අජාසත්ත    සහ      අජාසත්ත      සහ අජාසත්ත
3        සහ අජිත්    සහ        අජිත්        සහ අජිත්
4          සහ අඩු    සහ          අඩු          සහ අඩු


# **40. Pairs - Isolated සු as first >>> Join**

In [None]:
s_40 = 'සු'
all_39 = correction_init(onPairs=True, first=s_40, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 1 pairs with first=සු and second=
INITIALIZED STAGE 40


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_40.csv and created a DataFrame
        original first      second     correction
0  සු මිච්චාචාරා    සු  මිච්චාචාරා  සු මිච්චාචාරා


# **41. Single - Ending with සේ >>> Separate by meaning**

In [None]:
s_41 = 'සේ'
all_40 = correction_init(substring=s_41, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 93 words with සේ
INITIALIZED STAGE 41


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_41.csv and created a DataFrame
    original correction
0  අකුරැස්සේ  අකුරැස්සේ
1      අදහසේ      අදහසේ
2      අස්සේ      අස්සේ
3       අහසේ       අහසේ
4      ආකාසේ      ආකාසේ


# **42. Single - Ending with සේක >>> Separate**

Use the already corrected file for this stage (`Corrected/corrected_42.csv`).

# **43. Pairs - Isolated හ as second >>> Join**

In [None]:
s_43 = 'හ'
# Stage 42 has no init cell of its own (its already corrected file is reused),
# so set the stage counter to 42 first; the correction_init call below then
# reports "INITIALIZED STAGE 43".
force_change_stage(42)
all_42 = correction_init(onPairs=True, second=s_43, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 6 pairs with first= and second=හ
INITIALIZED STAGE 43


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_43.csv and created a DataFrame
    original    first second correction
0     අකුර හ     අකුර      හ     අකුර හ
1  ඉගැන්වූ හ  ඉගැන්වූ      හ  ඉගැන්වූ හ
2       කළ හ       කළ      හ       කළ හ
3      වම් හ      වම්      හ      වම් හ
4       වූ හ       වූ      හ       වූ හ


# **44. Single - Ending with හා >>> Separate by meaning**

In [None]:
s_44 = 'හා'
all_43 = correction_init(substring=s_44, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 47 words with හා
INITIALIZED STAGE 44


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_44.csv and created a DataFrame
  original correction
0  අග්ගමහා    අග්ගමහා
1    අටමහා      අටමහා
2     අදහා       අදහා
3   අසූමහා     අසූමහා
4      අහා        අහා


# **45. Pairs - Isolated හි as second >>> Join**

In [None]:
s_45 = 'හි'
all_44 = correction_init(onPairs=True, second=s_45, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 23 pairs with first= and second=හි
INITIALIZED STAGE 45


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_45.csv and created a DataFrame
           original          first second        correction
0            අංක හි            අංක     හි            අංක හි
1      අවස්ථාවන් හි      අවස්ථාවන්     හි      අවස්ථාවන් හි
2         කියොතෝ හි         කියොතෝ     හි         කියොතෝ හි
3  කිරීමේකාර්යන් හි  කිරීමේකාර්යන්     හි  කිරීමේකාර්යන් හි
4           කේප් හි           කේප්     හි           කේප් හි


# **46. Pairs - Isolated හු as second >>> Join**

In [None]:
s_46 = 'හු'
all_45 = correction_init(onPairs=True, second=s_46, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 2 pairs with first= and second=හු
INITIALIZED STAGE 46


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_46.csv and created a DataFrame
  original first second correction
0   ගහල හු   ගහල     හු     ගහල හු
1  වේදී හු  වේදී     හු    වේදී හු


# **47. Single - Ending with හෝ >>> Separate by meaning**

Use the already corrected file for this stage (`Corrected/corrected_47.csv`).

# RED LABEL CORRECTIONS

# **48. Single - Ending with ද >>> Separate by meaning**

In [None]:
# Stage 47 reuses its already corrected file and has no init cell, so set the
# stage counter to 47; the next correction_init call then initializes Stage 48.
force_change_stage(47)

In [None]:
s_48 = 'ද'
all_47 = correction_init(substring=s_48, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 1822 words with ද
INITIALIZED STAGE 48


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_48.csv and created a DataFrame
    original correction
0  අංගම්පොරද  අංගම්පොරද
1   අංගම්මනද   අංගම්මනද
2    අංගවලටද    අංගවලටද
3     අංශයටද     අංශයටද
4     අගමකටද     අගමකටද


# **49. Pairs - Isolated ද as second >>> Join by meaning**

In [None]:
s_49 = 'ද'
all_48 = correction_init(onPairs=True, second=s_49, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 997 pairs with first= and second=ද
INITIALIZED STAGE 49


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_49.csv and created a DataFrame
       original       first second    correction
0        අංගය ද        අංගය      ද        අංගය ද
1  අංගයන්ගෙන් ද  අංගයන්ගෙන්      ද  අංගයන්ගෙන් ද
2     අංශයෙන් ද     අංශයෙන්      ද     අංශයෙන් ද
3     අකුරින් ද     අකුරින්      ද     අකුරින් ද
4      අකුසල් ද      අකුසල්      ද      අකුසල් ද


# **50. Single - Ending with දා >>> Separate by meaning**

In [None]:
force_change_stage(49)

In [None]:
s_50 = 'දා'
all_49 = correction_init(substring=s_50, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 126 words with දා
INITIALIZED STAGE 50


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_50.csv and created a DataFrame
        original     correction
0  අගොස්තුවැනිදා  අගොස්තුවැනිදා
1  අගෝස්තුවැනිදා  අගෝස්තුවැනිදා
2      අඟහරුවාදා      අඟහරුවාදා
3       අටවැනිදා       අටවැනිදා
4         අතනොදා         අතනොදා


# **51. Single - Ending with ම >>> Separate by meaning**

In [None]:
force_change_stage(50)

In [None]:
s_51 = 'ම'
all_50 = correction_init(substring=s_51, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 3207 words with ම
INITIALIZED STAGE 51


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_51.csv and created a DataFrame
    original correction
0     අංශයකම     අංශයකම
1  අංශයකින්ම  අංශයකින්ම
2      අංශෙම      අංශෙම
3    අකමැතිම    අකමැතිම
4      අකරගම      අකරගම


# **52. Single - Ending with ව >>> Separate by meaning**

In [None]:
s_52 = 'ව'
all_51 = correction_init(substring=s_52, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 1976 words with ව
INITIALIZED STAGE 52


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_52.csv and created a DataFrame
       original    correction
0       අකරුණාව       අකරුණාව
1       අකරුනාව       අකරුනාව
2  අක්කරේපත්තුව  අක්කරේපත්තුව
3         අක්කව         අක්කව
4        අක්කාව        අක්කාව


# **53. Pairs - Isolated ව as second >>> Join by meaning**

In [None]:
s_53 = 'ව'
all_52 = correction_init(onPairs=True, second=s_53, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 105 pairs with first= and second=ව
INITIALIZED STAGE 53


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_53.csv and created a DataFrame
      original      first second   correction
0  අතිසාර්ථක ව  අතිසාර්ථක      ව  අතිසාර්ථක ව
1       අදාළ ව       අදාළ      ව       අදාළ ව
2   අන්තර්ගත ව   අන්තර්ගත      ව   අන්තර්ගත ව
3    අප්‍රකට ව    අප්‍රකට      ව    අප්‍රකට ව
4       අමතක ව       අමතක      ව       අමතක ව


# **54. Pairs - Isolated වල as second >>> Join**

In [None]:
# Isolated word to look for as the second element of a pair in Stage 54.
s_54 = 'වල'
# Build the Stage 54 to-correct list from all_6. The result is stored as all_53,
# following the all_<stage-1> naming used by every other init cell; previously it
# was stored as all_54, the same name the Stage 55 init cell assigns.
all_53 = correction_init(onPairs=True, second=s_54, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 227 pairs with first= and second=වල
INITIALIZED STAGE 54


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_54.csv and created a DataFrame
       original      first second    correction
0      අකුරු වල      අකුරු     වල      අකුරු වල
1    අගෝස්තු වල    අගෝස්තු     වල    අගෝස්තු වල
2      අඟහරු වල      අඟහරු     වල      අඟහරු වල
3       අඩවි වල       අඩවි     වල       අඩවි වල
4  අත්දැකීම් වල  අත්දැකීම්     වල  අත්දැකීම් වල


# **55. Pairs - Isolated වලට as second >>> Join**

In [None]:
s_55 = 'වලට'
all_54 = correction_init(onPairs=True, second=s_55, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 263 pairs with first= and second=වලට
INITIALIZED STAGE 55


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_55.csv and created a DataFrame
    original  first second correction
0    අංශ වලට    අංශ    වලට    අංශ වලට
1  අකුණු වලට  අකුණු    වලට  අකුණු වලට
2  අඟුරු වලට  අඟුරු    වලට  අඟුරු වලට
3    අතු වලට    අතු    වලට    අතු වලට
4  අදහස් වලට  අදහස්    වලට  අදහස් වලට


# **56. Pairs - Isolated වලින් as second >>> Join**

In [None]:
s_56 = 'වලින්'
all_55 = correction_init(onPairs=True, second=s_56, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 260522 unique pairs of words
Found 252 pairs with first= and second=වලින්
INITIALIZED STAGE 56


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_56.csv and created a DataFrame
       original   first second    correction
0     අංක වලින්     අංක  වලින්     අංක වලින්
1   අකුරු වලින්   අකුරු  වලින්   අකුරු වලින්
2  අකුසල් වලින්  අකුසල්  වලින්  අකුසල් වලින්
3     අඩි වලින්     අඩි  වලින්     අඩි වලින්
4     අත් වලින්     අත්  වලින්     අත් වලින්


# **57. Single - Ending with වන >>> Separate by meaning**

In [None]:
s_57 = 'වන'
all_56 = correction_init(substring=s_57, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 260 words with වන
INITIALIZED STAGE 57


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_57.csv and created a DataFrame
  original correction
0  අකුළුවන    අකුළුවන
1     අඟවන       අඟවන
2     අටවන       අටවන
3    අඩුවන      අඩුවන
4  අතපසුවන    අතපසුවන


# **58. Single - Ending with වනු >>> Separate by meaning**

In [None]:
force_change_stage(57)

In [None]:
s_58 = 'වනු'
all_57 = correction_init(substring=s_58, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 51 words with වනු
INITIALIZED STAGE 58


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_58.csv and created a DataFrame
   original correction
0    අත්වනු     අත්වනු
1    අස්වනු     අස්වනු
2  ඉදිනොවනු   ඉදිනොවනු
3   උගන්වනු    උගන්වනු
4  උසුරුවනු   උසුරුවනු


# **59. Single - Ending with වී >>> Separate by meaning**

In [None]:
s_59 = 'වී'
all_58 = correction_init(substring=s_59, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 163 words with වී
INITIALIZED STAGE 59


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_59.csv and created a DataFrame
      original   correction
0        අජීවී        අජීවී
1        අඩුවී        අඩුවී
2        අත්වී        අත්වී
3      අතුගෑවී      අතුගෑවී
4  අතුරුදහන්වී  අතුරුදහන්වී


# **60. Single - Ending with වූ >>> Separate by meaning**

In [None]:
s_60 = 'වූ'
all_59 = correction_init(substring=s_60, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 152 words with වූ
INITIALIZED STAGE 60


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_60.csv and created a DataFrame
               original            correction
0               අග්‍රවූ               අග්‍රවූ
1  අග්‍රාමාත්‍යවරයෙකුවූ  අග්‍රාමාත්‍යවරයෙකුවූ
2           අතලොස්සක්වූ           අතලොස්සක්වූ
3                 අත්වූ                 අත්වූ
4               අනුගතවූ               අනුගතවූ


# **61. Single - Ending with වේ >>> Separate by meaning**

In [None]:
s_61 = 'වේ'
all_60 = correction_init(substring=s_61, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 490 words with වේ
INITIALIZED STAGE 61


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_61.csv and created a DataFrame
  original correction
0  අංගමුවේ    අංගමුවේ
1  අටුවාවේ    අටුවාවේ
2    අටුවේ      අටුවේ
3  අඩංගුවේ    අඩංගුවේ
4    අඩුවේ      අඩුවේ


# **62. Single - Ending with විය >>> Separate by meaning**

In [None]:
s_62 = 'විය'
all_61 = correction_init(substring=s_62, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 124 words with විය
INITIALIZED STAGE 62


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_62.csv and created a DataFrame
    original correction
0  අකමැතිවිය  අකමැතිවිය
1      අඩවිය      අඩවිය
2     අත්විය     අත්විය
3     අලෙවිය     අලෙවිය
4       අවිය       අවිය


# **63. Single - Ending with වෙයි >>> Separate by meaning**

In [None]:
s_63 = 'වෙයි'
all_62 = correction_init(substring=s_63, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 66 words with වෙයි
INITIALIZED STAGE 63


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_63.csv and created a DataFrame
    original correction
0    අඩුවෙයි    අඩුවෙයි
1    අත්වෙයි    අත්වෙයි
2  අමාරුවෙයි  අමාරුවෙයි
3    අසුවෙයි    අසුවෙයි
4    අහුවෙයි    අහුවෙයි


# **64. Single - Ending with වුණා >>> Separate by meaning**

In [None]:
s_64 = 'වුණා'
all_63 = correction_init(substring=s_64, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 29 words with වුණා
INITIALIZED STAGE 64


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_64.csv and created a DataFrame
  original correction
0  ඇතිවුණා    ඇතිවුණා
1  ඉටුවුණා    ඉටුවුණා
2  ඉදිවුණා    ඉදිවුණා
3  උදාවුණා    උදාවුණා
4  එක්වුණා    එක්වුණා


# **65. Single - Ending with වුනා >>> Separate and Change to වුණා by meaning**

In [None]:
s_65 = 'වුනා'
all_64 = correction_init(substring=s_65, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 30 words with වුනා
INITIALIZED STAGE 65


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_65.csv and created a DataFrame
  original correction
0  ඇතිවුනා    ඇතිවුනා
1  උදාවුනා    උදාවුනා
2  එපාවුනා    එපාවුනා
3  කතාවුනා    කතාවුනා
4  කියවුනා    කියවුනා


# **66. Single - Ending with වුණේ >>> Separate by meaning**

In [None]:
force_change_stage(65)

In [None]:
s_66 = 'වුණේ'
all_65 = correction_init(substring=s_66, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 21 words with වුණේ
INITIALIZED STAGE 66


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_66.csv and created a DataFrame
    original correction
0    ඇතිවුණේ    ඇතිවුණේ
1  ඉගැන්වුණේ  ඉගැන්වුණේ
2   එකතුවුණේ   එකතුවුණේ
3    කියවුණේ    කියවුණේ
4   කියැවුණේ   කියැවුණේ


# **67. Single - Ending with වුනේ >>> Separate and Change to වුණේ**

In [None]:
s_67 = 'වුනේ'
all_66 = correction_init(substring=s_67, defaultSuffix=6)

Read All_Utterances/all_6.csv and created a DataFrame
Found 98407 unique utterances
Found 63549 unique words
Found 16 words with වුනේ
INITIALIZED STAGE 67


In [None]:
correction_inspect_toCorrect()

Read To_Correct/toCorrect_67.csv and created a DataFrame
    original correction
0    අහුවුනේ    අහුවුනේ
1    ඇතිවුනේ    ඇතිවුනේ
2      ඔවුනේ      ඔවුනේ
3    කියවුනේ    කියවුනේ
4  ජීවත්වුනේ  ජීවත්වුනේ


# APPLYING BULK CORRECTIONS @ 2021-12-03

Today, we completed all 67 correction definitions ('corrected_' files). Now, starting from 'all_6', we are applying the corrections sequentially. The workflow is as follows:

First apply *common_modifications*. Then, apply corrections for each stage, with *toDelete* and *doubtCorrected*.

We are categorizing the rules into two classes:
1.    **More Ambiguous:** Single-character words/suffixes which are generally used in isolation as well as combined with another word. Hence, although we have defined rules for these, their application is ambiguous. The selected words/suffixes and the corresponding stages are: ය (31), ද (48, 49), ම (51), ව (52, 53), වී (59), වූ (60), වේ (61)
2.    **Less Ambiguous:** All other corrections which we can apply as strict rules.

First we shall apply *Less Ambiguous* corrections, and lastly we shall apply the *More Ambiguous* corrections.

# **Common modifications**

In [None]:
force_change_stage(6)

In [None]:
all_6 = read_file('./All_Utterances/all_6.csv')
all_6.shape

Read ./All_Utterances/all_6.csv and created a DataFrame


(178372, 4)

In [None]:
all_6 = correction_apply_common_modifications('nilmani', all_6)

Read common_modifications_nilmani.csv and created a DataFrame
Applied 15 corrections in Stage 6.
Applied 15 common modifications.


In [None]:
all_6 = correction_apply_common_modifications('disura', all_6)
write_to_csv('all_6_modified', all_6, 'All_Utterances')

Read common_modifications_disura.csv and created a DataFrame
Applied 131 corrections in Stage 6.
Applied 131 common modifications.


# **Less Ambiguous Corrections**

In [None]:
force_change_stage(7)

In [21]:
# Silence pandas' chained-assignment (SettingWithCopy) warning, i.e. the
# "A value is trying to be set on a copy of a slice from a DataFrame" message
# printed by the Stage 7 and Stage 8 runs of correction_complete.
# NOTE(review): this only hides the warning; the assignment that triggers it lives
# in code outside this cell - consider switching that assignment to .loc instead.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Stage 7
all_7 = correction_complete(all_6)

Read Corrected/corrected_7.csv and created a DataFrame
Read To_Delete/toDelete_7.csv and created a DataFrame
Deleted 36 utterances in Stage 7
Applied 11 corrections in Stage 7.
Read Doubt_Corrected/doubtCorrected_7.csv and created a DataFrame


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


STAGE 7 COMPLETE!


In [None]:
# Stage 8
all_8 = correction_complete(all_7)

Read Corrected/corrected_8.csv and created a DataFrame
Read To_Delete/toDelete_8.csv and created a DataFrame
Deleted 1 utterances in Stage 8
Applied 2 corrections in Stage 8.
Read Doubt_Corrected/doubtCorrected_8.csv and created a DataFrame


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


STAGE 8 COMPLETE!


In [None]:
# Stage 9
all_9 = correction_complete(all_8)

Read Corrected/corrected_9.csv and created a DataFrame
Applied 4 corrections in Stage 9.
Read Doubt_Corrected/doubtCorrected_9.csv and created a DataFrame
STAGE 9 COMPLETE!


In [None]:
# Stage 10
all_10 = correction_complete(all_9)

Read Corrected/corrected_10.csv and created a DataFrame
Read To_Delete/toDelete_10.csv and created a DataFrame
Deleted 4 utterances in Stage 10
Applied 6 corrections in Stage 10.
Read Doubt_Corrected/doubtCorrected_10.csv and created a DataFrame
Applied 5 Doubt Corrections in Stage 10.
STAGE 10 COMPLETE!


In [None]:
# Stage 11
all_11 = correction_complete(all_10)

Read Corrected/corrected_11.csv and created a DataFrame
Read To_Delete/toDelete_11.csv and created a DataFrame
Deleted 7 utterances in Stage 11
Applied 157 corrections in Stage 11.
Read Doubt_Corrected/doubtCorrected_11.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 11.
STAGE 11 COMPLETE!


In [None]:
# Stage 12
all_12 = correction_complete(all_11)

Read Corrected/corrected_12.csv and created a DataFrame
Applied 30 corrections in Stage 12.
STAGE 12 COMPLETE!


In [None]:
# Stage 13
all_13 = correction_complete(all_12)

Read Corrected/corrected_13.csv and created a DataFrame
Read To_Delete/toDelete_13.csv and created a DataFrame
Deleted 17 utterances in Stage 13
Applied 52 corrections in Stage 13.
STAGE 13 COMPLETE!


In [None]:
# Stage 14
all_14 = correction_complete(all_13)

Read Corrected/corrected_14.csv and created a DataFrame
Applied 11 corrections in Stage 14.
STAGE 14 COMPLETE! Utterance Count=178307


In [None]:
# Stage 15
all_15 = correction_complete(all_14)

Read Corrected/corrected_15.csv and created a DataFrame
Applied 492 corrections in Stage 15.
STAGE 15 COMPLETE! Utterance Count=178307


In [None]:
# Stage 16
all_16 = correction_complete(all_15)

Read Corrected/corrected_16.csv and created a DataFrame
Read To_Delete/toDelete_16.csv and created a DataFrame
Deleted 1 utterances in Stage 16
Applied 27 corrections in Stage 16.
STAGE 16 COMPLETE! Utterance Count=178306


In [None]:
# Stage 17
all_17 = correction_complete(all_16)

Read Corrected/corrected_17.csv and created a DataFrame
Applied 1 corrections in Stage 17.
STAGE 17 COMPLETE! Utterance Count=178306


In [None]:
# Stage 18
all_18 = correction_complete(all_17)

ERROR: Correction file for Stage 18 is unavailable!


In [None]:
### NO CORRECTIONS NEEDED IN STAGE 18
# correction_complete reported that the Stage 18 correction file is unavailable,
# so the Stage 17 result is carried forward unchanged. This is a plain assignment:
# all_18 and all_17 refer to the same object, not to independent copies.
all_18 = all_17

In [None]:
# Stage 19
# The Stage 18 call ended with an error, so the stage counter is set explicitly
# here (presumably the failed call did not advance it - TODO confirm).
force_change_stage(19)
all_19 = correction_complete(all_18)

ERROR: Correction file for Stage 19 is unavailable!


In [None]:
### NO CORRECTIONS NEEDED IN STAGE 19
# correction_complete reported that the Stage 19 correction file is unavailable,
# so the Stage 18 result is carried forward unchanged. This is a plain assignment:
# all_19 and all_18 refer to the same object, not to independent copies.
all_19 = all_18

In [None]:
# Stage 20
# The Stage 19 call ended with an error, so the stage counter is set explicitly
# here (presumably the failed call did not advance it - TODO confirm).
force_change_stage(20)
all_20 = correction_complete(all_19)

Read Corrected/corrected_20.csv and created a DataFrame
Read To_Delete/toDelete_20.csv and created a DataFrame
Deleted 28 utterances in Stage 20
Applied 391 corrections in Stage 20.
Read Doubt_Corrected/doubtCorrected_20.csv and created a DataFrame
Applied 7 Doubt Corrections in Stage 20.
STAGE 20 COMPLETE! Utterance Count=178278


In [None]:
# Stage 21
all_21 = correction_complete(all_20)

Read Corrected/corrected_21.csv and created a DataFrame
Applied 11 corrections in Stage 21.
Read Doubt_Corrected/doubtCorrected_21.csv and created a DataFrame
Applied 3 Doubt Corrections in Stage 21.
STAGE 21 COMPLETE! Utterance Count=178278


In [None]:
# Stage 22
all_22 = correction_complete(all_21)

ERROR: Correction file for Stage 22 is unavailable!


In [None]:
### NO CORRECTIONS NEEDED IN STAGE 22
# correction_complete reported that the Stage 22 correction file is unavailable,
# so the Stage 21 result is carried forward unchanged. This is a plain assignment:
# all_22 and all_21 refer to the same object, not to independent copies.
all_22 = all_21

In [None]:
# Stage 23
# The Stage 22 call ended with an error, so the stage counter is set explicitly
# here (presumably the failed call did not advance it - TODO confirm).
force_change_stage(23)
all_23 = correction_complete(all_22)

Read Corrected/corrected_23.csv and created a DataFrame
Applied 6 corrections in Stage 23.
STAGE 23 COMPLETE! Utterance Count=178278


In [None]:
# Stage 24
all_24 = correction_complete(all_23)

Read Corrected/corrected_24.csv and created a DataFrame
Applied 8 corrections in Stage 24.
STAGE 24 COMPLETE! Utterance Count=178278


In [None]:
# Stage 25
all_25 = correction_complete(all_24)

Read Corrected/corrected_25.csv and created a DataFrame
Applied 1 corrections in Stage 25.
STAGE 25 COMPLETE! Utterance Count=178278


In [None]:
# Stage 26
all_26 = correction_complete(all_25)

Read Corrected/corrected_26.csv and created a DataFrame
Applied 1 corrections in Stage 26.
STAGE 26 COMPLETE! Utterance Count=178278


In [None]:
# Stage 27
all_27 = correction_complete(all_26)

Read Corrected/corrected_27.csv and created a DataFrame
Read To_Delete/toDelete_27.csv and created a DataFrame
Deleted 1 utterances in Stage 27
Applied 0 corrections in Stage 27.
STAGE 27 COMPLETE! Utterance Count=178277


**IMPORTANT:** හරකයි මි is left unchanged in Stage 27; it should be changed to හරකයි මී (9d4947a774). This can be applied at the end, after the other corrections.

In [None]:
# Stage 28
all_28 = correction_complete(all_27)

Read Corrected/corrected_28.csv and created a DataFrame
Applied 2 corrections in Stage 28.
STAGE 28 COMPLETE! Utterance Count=178277


In [None]:
# Stage 29
all_29 = correction_complete(all_28)

Read Corrected/corrected_29.csv and created a DataFrame
Applied 8 corrections in Stage 29.
STAGE 29 COMPLETE! Utterance Count=178277


In [None]:
# Stage 30
all_30 = correction_complete(all_29)

Read Corrected/corrected_30.csv and created a DataFrame
Applied 24 corrections in Stage 30.
STAGE 30 COMPLETE! Utterance Count=178277


In [None]:
# Stage 32
# Stage 31 (ය) belongs to the "More Ambiguous" class and is deferred until the
# "Less Ambiguous" stages are done, so jump the stage counter from 30 to 32 and
# continue from the Stage 30 result.
force_change_stage(32)
all_32 = correction_complete(all_30)

Read Corrected/corrected_32.csv and created a DataFrame
Read To_Delete/toDelete_32.csv and created a DataFrame
Deleted 1 utterances in Stage 32
Applied 42 corrections in Stage 32.
STAGE 32 COMPLETE! Utterance Count=178276


In [None]:
# Stage 33
all_33 = correction_complete(all_32)

Read Corrected/corrected_33.csv and created a DataFrame
Applied 106 corrections in Stage 33.
STAGE 33 COMPLETE! Utterance Count=178276


In [None]:
# Stage 34
all_34 = correction_complete(all_33)

Read Corrected/corrected_34.csv and created a DataFrame
Applied 0 corrections in Stage 34.
STAGE 34 COMPLETE! Utterance Count=178276


In [None]:
# Stage 35
all_35 = correction_complete(all_34)

Read Corrected/corrected_35.csv and created a DataFrame
Applied 57 corrections in Stage 35.
STAGE 35 COMPLETE! Utterance Count=178276


In [None]:
# Stage 36
all_36 = correction_complete(all_35)

Read Corrected/corrected_36.csv and created a DataFrame
Read To_Delete/toDelete_36.csv and created a DataFrame
Deleted 1 utterances in Stage 36
Applied 48 corrections in Stage 36.
STAGE 36 COMPLETE! Utterance Count=178275


In [None]:
# Stage 37
all_37 = correction_complete(all_36)

Read Corrected/corrected_37.csv and created a DataFrame
Applied 10 corrections in Stage 37.
STAGE 37 COMPLETE! Utterance Count=178275


In [None]:
# Stage 38
all_38 = correction_complete(all_37)

Read Corrected/corrected_38.csv and created a DataFrame
Applied 1 corrections in Stage 38.
STAGE 38 COMPLETE! Utterance Count=178275


In [None]:
# Stage 39
all_39 = correction_complete(all_38)

Read Corrected/corrected_39.csv and created a DataFrame
Read To_Delete/toDelete_39.csv and created a DataFrame
Deleted 4 utterances in Stage 39
Applied 49 corrections in Stage 39.
STAGE 39 COMPLETE! Utterance Count=178271


In [None]:
# Stage 40
all_40 = correction_complete(all_39)

Read Corrected/corrected_40.csv and created a DataFrame
Applied 1 corrections in Stage 40.
STAGE 40 COMPLETE! Utterance Count=178271


In [None]:
# Stage 41
# all_41 = correction_complete(all_40)

**IMPORTANT:** Executing the cell above raised a runtime error caused by an asterisk (*) in one row of `corrected_41.csv`. The file was fixed only after Stage 67 had been applied, so Stage 41 will be applied after Stage 67.

In [None]:
# Stage 42
# Stage 41 is deferred (its correction file raised a runtime error), so the
# stage counter is forced to 42 and the Stage 40 result is used as the input.
force_change_stage(42)
all_42 = correction_complete(all_40)

Read Corrected/corrected_42.csv and created a DataFrame
Applied 5 corrections in Stage 42.
STAGE 42 COMPLETE! Utterance Count=178271


In [None]:
# Stage 43
all_43 = correction_complete(all_42)

Read Corrected/corrected_43.csv and created a DataFrame
Applied 4 corrections in Stage 43.
STAGE 43 COMPLETE! Utterance Count=178271


In [None]:
# Stage 44
all_44 = correction_complete(all_43)

Read Corrected/corrected_44.csv and created a DataFrame
Read To_Delete/toDelete_44.csv and created a DataFrame
Deleted 1 utterances in Stage 44
Applied 13 corrections in Stage 44.
STAGE 44 COMPLETE! Utterance Count=178270


In [None]:
# Stage 45
all_45 = correction_complete(all_44)

Read Corrected/corrected_45.csv and created a DataFrame
Read To_Delete/toDelete_45.csv and created a DataFrame
Deleted 1 utterances in Stage 45
Applied 22 corrections in Stage 45.
STAGE 45 COMPLETE! Utterance Count=178269


In [None]:
# Stage 46
all_46 = correction_complete(all_45)

Read Corrected/corrected_46.csv and created a DataFrame
Applied 1 corrections in Stage 46.
STAGE 46 COMPLETE! Utterance Count=178269


In [None]:
# Stage 47
all_47 = correction_complete(all_46)

Read Corrected/corrected_47.csv and created a DataFrame
Applied 7 corrections in Stage 47.
STAGE 47 COMPLETE! Utterance Count=178269


In [None]:
# Stage 50
# Stages 48 and 49 are applied separately in the "More Ambiguous Corrections"
# section, so the stage counter is forced from 47 to 50 and the Stage 47
# result is used as the input.
force_change_stage(50)
all_50 = correction_complete(all_47)

Read Corrected/corrected_50.csv and created a DataFrame
Applied 51 corrections in Stage 50.
STAGE 50 COMPLETE! Utterance Count=178269


In [None]:
# Stage 54
# Stages 51-53 are applied separately in the "More Ambiguous Corrections"
# section, so the stage counter is forced to 54 and the Stage 50 result is
# used as the input.
force_change_stage(54)
all_54 = correction_complete(all_50)

Read Corrected/corrected_54.csv and created a DataFrame
Applied 224 corrections in Stage 54.
Read Doubt_Corrected/doubtCorrected_54.csv and created a DataFrame
Applied 4 Doubt Corrections in Stage 54.
STAGE 54 COMPLETE! Utterance Count=178269


In [None]:
# Stage 55
all_55 = correction_complete(all_54)

Read Corrected/corrected_55.csv and created a DataFrame
Applied 261 corrections in Stage 55.
Read Doubt_Corrected/doubtCorrected_55.csv and created a DataFrame
Applied 1 Doubt Corrections in Stage 55.
STAGE 55 COMPLETE! Utterance Count=178269


In [None]:
# Stage 56
all_56 = correction_complete(all_55)

Read Corrected/corrected_56.csv and created a DataFrame
Applied 252 corrections in Stage 56.
STAGE 56 COMPLETE! Utterance Count=178269


In [None]:
# Stage 57
all_57 = correction_complete(all_56)

ERROR: Correction file for Stage 57 is unavailable!


**IMPORTANT:** The corrections for Stage 57 are not available at the moment. They should be applied at the end.

In [None]:
# The Stage 57 correction file was unavailable, so nothing was applied for it:
# force the stage counter on to 58 and alias the Stage 56 result as all_57 so
# that the Stage 58 cell can consume all_57 as usual.
force_change_stage(58)
all_57 = all_56

In [None]:
# Stage 58
all_58 = correction_complete(all_57)

Read Corrected/corrected_58.csv and created a DataFrame
Applied 37 corrections in Stage 58.
STAGE 58 COMPLETE! Utterance Count=178269


In [None]:
# Stage 62
# Stages 59-61 are applied separately in the "More Ambiguous Corrections"
# section, so the stage counter is forced to 62 and the Stage 58 result is
# used as the input.
force_change_stage(62)
all_62 = correction_complete(all_58)

Read Corrected/corrected_62.csv and created a DataFrame
Applied 66 corrections in Stage 62.
STAGE 62 COMPLETE! Utterance Count=178269


In [None]:
# Stage 63
all_63 = correction_complete(all_62)

Read Corrected/corrected_63.csv and created a DataFrame
Applied 51 corrections in Stage 63.
STAGE 63 COMPLETE! Utterance Count=178269


In [None]:
# Stage 64
all_64 = correction_complete(all_63)

Read Corrected/corrected_64.csv and created a DataFrame
Applied 25 corrections in Stage 64.
STAGE 64 COMPLETE! Utterance Count=178269


In [None]:
# Stage 65
all_65 = correction_complete(all_64)

Read Corrected/corrected_65.csv and created a DataFrame
Applied 29 corrections in Stage 65.
STAGE 65 COMPLETE! Utterance Count=178269


In [None]:
# Stage 66
all_66 = correction_complete(all_65)

Read Corrected/corrected_66.csv and created a DataFrame
Applied 13 corrections in Stage 66.
STAGE 66 COMPLETE! Utterance Count=178269


In [None]:
# Stage 67
all_67 = correction_complete(all_66)

Read Corrected/corrected_67.csv and created a DataFrame
Applied 16 corrections in Stage 67.
STAGE 67 COMPLETE! Utterance Count=178269


**Applying 41 after removing the unnecessary asterisk.**

In [None]:
# Deferred Stage 41: applied out of order, on top of the Stage 67 result, so
# the stage counter has to be forced back to 41 first.
force_change_stage(41)
all_41 = correction_complete(all_67)

Read Corrected/corrected_41.csv and created a DataFrame
Read To_Delete/toDelete_41.csv and created a DataFrame
Deleted 1 utterances in Stage 41
Applied 31 corrections in Stage 41.
STAGE 41 COMPLETE! Utterance Count=178268


In [None]:
# Stage 41 was applied on top of Stage 67, so rebind all_67 to that result and
# save it under a name that records the order in which the stages were applied.
all_67 = all_41
write_to_csv('all_67_41', all_67, allDirectory)

**Applying 57**

In [None]:
# Deferred Stage 57: applied out of order, on top of the Stage 67 (+41) result.
# This rebinds all_57, which until now was only an alias of the Stage 56 frame.
force_change_stage(57)
all_57 = correction_complete(all_67)

Read Corrected/corrected_57.csv and created a DataFrame
Read To_Delete/toDelete_57.csv and created a DataFrame
Deleted 8 utterances in Stage 57
Applied 135 corrections in Stage 57.
STAGE 57 COMPLETE! Utterance Count=178260


In [None]:
# Persist the result of the deferred Stage 57 run. The stage output is bound to
# all_57 (all_67 is only the input to that stage), so all_57 is the frame that
# must be written under the 'all_67_41_57' name.
write_to_csv('all_67_41_57', all_57, allDirectory)

**Did some manual modifications to the above `all_67_41_57` utterance file:**


1.   Changed හරකයි මි to හරකයි මී in `9d4947a774`
2.   Removed `2385e88524` because it is a Pali Gatha
3.   Changed උත්සාහ වත්වන to උත්සාහවත් වන in `764aee61d5`
4.   Applied common_modifications by Lakshan: 5 modifications and 2 removals (removed `09f841d8b3` and `63190b0d59`)





**Finalized Less Ambiguous Utterances:**

In [39]:
all_LA = read_file('./All_Utterances/all_less_ambiguous.csv')

Read ./All_Utterances/all_less_ambiguous.csv and created a DataFrame


In [40]:
all_LA.shape

(178266, 4)

# **More Ambiguous Corrections**

In [41]:
# Stage 31
# First of the "more ambiguous" stages: it starts from the finalized
# less-ambiguous utterances, with the stage counter forced to 31.
force_change_stage(31)
all_31 = correction_complete(all_LA)

Read Corrected/corrected_31.csv and created a DataFrame
Read To_Delete/toDelete_31.csv and created a DataFrame
Deleted 96 utterances in Stage 31
Applied 1519 corrections in Stage 31.
Read Doubt_Corrected/doubtCorrected_31.csv and created a DataFrame
Applied 11 Doubt Corrections in Stage 31.
STAGE 31 COMPLETE! Utterance Count=178170


In [23]:
# Stage 48
# Stages 32-47 belong to the less-ambiguous pass, so the stage counter is
# forced from 31 to 48 and the Stage 31 result is used as the input.
force_change_stage(48)
all_48 = correction_complete(all_31)

Read Corrected/corrected_48.csv and created a DataFrame
Read To_Delete/toDelete_48.csv and created a DataFrame
Deleted 24 utterances in Stage 48
Applied 1476 corrections in Stage 48.
Read Doubt_Corrected/doubtCorrected_48.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 48.
STAGE 48 COMPLETE! Utterance Count=178147


In [25]:
# Stage 49
all_49 = correction_complete(all_48)

Read Corrected/corrected_49.csv and created a DataFrame
Read To_Delete/toDelete_49.csv and created a DataFrame
Deleted 3 utterances in Stage 49
Applied 125 corrections in Stage 49.
Read Doubt_Corrected/doubtCorrected_49.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 49.
STAGE 49 COMPLETE! Utterance Count=178144


In [None]:
# Stage 51
# Stage 50 belongs to the less-ambiguous pass, so the stage counter is forced
# from 49 to 51 and the Stage 49 result is used as the input.
force_change_stage(51)
all_51 = correction_complete(all_49)

Read Corrected/corrected_51.csv and created a DataFrame
Read To_Delete/toDelete_51.csv and created a DataFrame
Deleted 4 utterances in Stage 51
Applied 2099 corrections in Stage 51.
Read Doubt_Corrected/doubtCorrected_51.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 51.
STAGE 51 COMPLETE! Utterance Count=178140


In [None]:
# Stage 52
all_52 = correction_complete(all_51)

Read Corrected/corrected_52.csv and created a DataFrame
Read To_Delete/toDelete_52.csv and created a DataFrame
Deleted 38 utterances in Stage 52
Applied 941 corrections in Stage 52.
Read Doubt_Corrected/doubtCorrected_52.csv and created a DataFrame
Applied 4 Doubt Corrections in Stage 52.
STAGE 52 COMPLETE! Utterance Count=178102


In [None]:
# Stage 53
all_53 = correction_complete(all_52)

Read Corrected/corrected_53.csv and created a DataFrame
Applied 13 corrections in Stage 53.
STAGE 53 COMPLETE! Utterance Count=178102


In [None]:
# Stage 59
# Stages 54-58 belong to the less-ambiguous pass, so the stage counter is
# forced from 53 to 59 and the Stage 53 result is used as the input.
force_change_stage(59)
all_59 = correction_complete(all_53)

Read Corrected/corrected_59.csv and created a DataFrame
Applied 112 corrections in Stage 59.
Read Doubt_Corrected/doubtCorrected_59.csv and created a DataFrame
Applied 3 Doubt Corrections in Stage 59.
STAGE 59 COMPLETE! Utterance Count=178102


In [None]:
# Stage 60
all_60 = correction_complete(all_59)

Read Corrected/corrected_60.csv and created a DataFrame
Applied 142 corrections in Stage 60.
STAGE 60 COMPLETE! Utterance Count=178102


In [None]:
# Stage 61
all_61 = correction_complete(all_60)

Read Corrected/corrected_61.csv and created a DataFrame
Read To_Delete/toDelete_61.csv and created a DataFrame
Deleted 1 utterances in Stage 61
Applied 97 corrections in Stage 61.
STAGE 61 COMPLETE! Utterance Count=178102


**Finished More Ambiguous corrections**

In [None]:
write_to_csv('all_more_ambiguous', all_61, allDirectory)

# **Applying all corrections in one loop**

# CONSIDER THIS AS THE CORRECT OPERATION

There was a bug in the `apply_corrections` and `correction_doubt_correct` functions which caused erroneous text replacements. The bug was fixed by adding the `apply_padding` and `remove_padding` functions.

Now, we shall apply all the corrections from STAGE 1, in one loop. However, we shall do *less ambiguous* and *more ambiguous* corrections separately.

In [115]:
# Stage numbers of the "less ambiguous" correction pass, in the order they are
# applied. Stages 18, 19 and 22 appear in neither list.
LA_stages = [
  1, 2, 3, 4, 5, 6, 7, 8, 9,
  10, 11, 12, 13, 14, 15, 16, 17,
  20, 21, 23, 24, 25, 26, 27, 28, 29,
  30, 32, 33, 34, 35, 36, 37, 38, 39,
  40, 41, 42, 43, 44, 45, 46, 47,
  50, 54, 55, 56, 57, 58,
  62, 63, 64, 65, 66, 67,
]

# Stage numbers of the "more ambiguous" correction pass, applied afterwards.
MA_stages = [
  31,
  48, 49,
  51, 52, 53,
  59, 60, 61,
]

In [116]:
all_0 = read_file(f"{allDirectory}/all_0.csv")

Read All_Utterances/all_0.csv and created a DataFrame


In [117]:
all_0.describe()

Unnamed: 0,utterance_id,speaker_id,utterance,gender
count,178409,178409,178409,178409
unique,178409,478,98435,2
top,66a2912a84,0b586,ජය වේවා,f
freq,1,798,16,96724


In [118]:
# Apply the common-modification files to the raw utterances before any
# per-stage corrections are run.
print("Applying Common modifications")
print("======================================")

# One entry per common-modification file.
# NOTE(review): presumably each name selects common_modifications_<name>.csv
# inside correction_apply_common_modifications — confirm against the helper.
names = ['disura', 'nilmani']

for n in names:
  # Each call takes the current frame and its return value is rebound to
  # all_0, so the modifications accumulate in list order.
  all_0 = correction_apply_common_modifications(n, all_0)
  print("======================================")

# Save the result so later runs can start from the common-modified utterances.
write_to_csv('all_0_common_modified', all_0, allDirectory)

Applying Common modifications
Read common_modifications_disura.csv and created a DataFrame
Applied 132 corrections in Stage 0.
Applied 132 common modifications.
Read common_modifications_nilmani.csv and created a DataFrame
Applied 15 corrections in Stage 0.
Applied 15 common modifications.


In [120]:
# Chain of per-stage results; element 0 is the common-modified input that the
# first less-ambiguous stage will consume.
all_objects = [all_0]
i = 0

# NOTE(review): presumably resets state read by the correction helpers before
# the pass starts — confirm against their definitions.
INITIALIZED = False

In [121]:
# Run every less-ambiguous stage in order. Each stage consumes the result of
# the previous one; all intermediate results are kept in all_objects so the
# final frame can be read back as all_objects[-1].
print("Starting Less Ambiguous Corrections")
print("======================================")

for stage in LA_stages:
  # Stage numbers are not contiguous, so set the stage counter explicitly
  # instead of relying on it advancing by itself.
  force_change_stage(stage)
  # Chain from the most recent result rather than a hand-maintained index.
  all_obj = correction_complete(all_objects[-1])
  all_objects.append(all_obj)
  print("======================================")



Starting Less Ambiguous Corrections
Read Corrected/corrected_1.csv and created a DataFrame
Applied 1 corrections in Stage 1.
STAGE 1 COMPLETE! Utterance Count=178409
Read Corrected/corrected_2.csv and created a DataFrame
Applied 2 corrections in Stage 2.
STAGE 2 COMPLETE! Utterance Count=178409
Read Corrected/corrected_3.csv and created a DataFrame
Applied 1 corrections in Stage 3.
STAGE 3 COMPLETE! Utterance Count=178409
Read Corrected/corrected_4.csv and created a DataFrame
Read To_Delete/toDelete_4.csv and created a DataFrame
Deleted 2 utterances in Stage 4
Applied 12 corrections in Stage 4.
Read Doubt_Corrected/doubtCorrected_4.csv and created a DataFrame
Applied 1 Doubt Corrections in Stage 4.
STAGE 4 COMPLETE! Utterance Count=178407
Read Corrected/corrected_5.csv and created a DataFrame
Read To_Delete/toDelete_5.csv and created a DataFrame
Deleted 24 utterances in Stage 5
Applied 82 corrections in Stage 5.
Read Doubt_Corrected/doubtCorrected_5.csv and created a DataFrame
Applied 

In [122]:
# The last element of the chain is the result of the final less-ambiguous stage.
all_lessAmb = all_objects[-1]

# Save it; the more-ambiguous pass starts from this frame.
write_to_csv('all_less_ambiguous', all_lessAmb, allDirectory)

In [123]:
# Restart the chain of per-stage results; element 0 is the finished
# less-ambiguous frame that the first more-ambiguous stage will consume.
all_objects = [all_lessAmb]
i = 0

# NOTE(review): presumably resets state read by the correction helpers before
# the pass starts — confirm against their definitions.
INITIALIZED = False

In [124]:
# Run every more-ambiguous stage in order. Each stage consumes the result of
# the previous one; all intermediate results are kept in all_objects so the
# final frame can be read back as all_objects[-1].
print("Starting More Ambiguous Corrections")
print("======================================")

for stage in MA_stages:
  # Stage numbers are not contiguous, so set the stage counter explicitly
  # instead of relying on it advancing by itself.
  force_change_stage(stage)
  # Chain from the most recent result rather than a hand-maintained index.
  all_obj = correction_complete(all_objects[-1])
  all_objects.append(all_obj)
  print("======================================")

Starting More Ambiguous Corrections
Read Corrected/corrected_31.csv and created a DataFrame
Read To_Delete/toDelete_31.csv and created a DataFrame
Deleted 96 utterances in Stage 31
Applied 1519 corrections in Stage 31.
Read Doubt_Corrected/doubtCorrected_31.csv and created a DataFrame
Applied 11 Doubt Corrections in Stage 31.
STAGE 31 COMPLETE! Utterance Count=178164
Read Corrected/corrected_48.csv and created a DataFrame
Read To_Delete/toDelete_48.csv and created a DataFrame
Deleted 24 utterances in Stage 48
Applied 1476 corrections in Stage 48.
Read Doubt_Corrected/doubtCorrected_48.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 48.
STAGE 48 COMPLETE! Utterance Count=178141
Read Corrected/corrected_49.csv and created a DataFrame
Read To_Delete/toDelete_49.csv and created a DataFrame
Deleted 3 utterances in Stage 49
Applied 125 corrections in Stage 49.
Read Doubt_Corrected/doubtCorrected_49.csv and created a DataFrame
Applied 2 Doubt Corrections in Stage 49.
STAGE 49

In [125]:
# The last element of the chain is the result of the final more-ambiguous stage.
all_moreAmb = all_objects[-1]

# Save the fully corrected utterances.
write_to_csv('all_more_ambiguous', all_moreAmb, allDirectory)