In [2]:
import LookUpTable as lut
import pandas as pd
import re
import random
import Levenshtein

In [1]:
# Input files, given relative to the notebook's working directory.
# The CSV is expected to carry 'content' and 'word_count' columns (both are read further down).
webscrapedDatasetPath = r'./web_scraped_data_raw.csv'
# Plain-text file that is read line by line into a DataFrame.
SamantarDatasetPath = r'./SamantarDatasetRaw.txt'

In [3]:
# Read the raw text file; each line, trimmed of surrounding whitespace,
# becomes one row of the "OriginalText" column.
with open(SamantarDatasetPath, mode='r', encoding='utf-8') as file:
    lines = list(map(str.strip, file))

df = pd.DataFrame(lines, columns=["OriginalText"])

In [28]:
def clean_telugu_text(text):
    """Reduce ``text`` to Telugu-block characters separated by single spaces.

    Every character outside the Unicode range U+0C00-U+0C7F that is not
    whitespace is dropped, each run of whitespace is collapsed to one space,
    and leading/trailing whitespace is removed.
    """
    # Drop everything that is neither Telugu-block nor whitespace.
    telugu_only = re.sub(r'[^\u0C00-\u0C7F\s]', '', text)
    # Collapse whitespace runs, then trim both ends.
    return re.sub(r'\s+', ' ', telugu_only).strip()

In [29]:
def get_levenstein(src, tar):
    """Return ``Levenshtein.distance(src, tar)``, the edit distance between the two strings."""
    edit_distance = Levenshtein.distance(src, tar)
    return edit_distance

In [38]:
def substitute_consonants(text, error_percentage, verbose=True):
    """Randomly replace characters of ``text`` using ``lut.SimpleCharacterReplacements``.

    At most ``int(len(text) * error_percentage)`` characters are replaced.
    A character is eligible when it

    * is a key of ``lut.SimpleCharacterReplacements``,
    * is not directly preceded by a virama (్), and
    * is not "య" directly followed by the vowel sign "ి".

    Positions are visited in random order, each at most once, so fewer
    substitutions than requested are made when the text does not contain
    enough eligible characters.

    Args:
        text: Input string.
        error_percentage: Fraction of the character count to corrupt
            (e.g. ``0.15``).
        verbose: When true (the default, matching the previous behaviour),
            print a one-line summary of the achieved error rate.

    Returns:
        A tuple ``(corrupted_text, AverageErrorPerWord)``. The second item is
        the sum of Levenshtein distances between the space-separated words of
        ``text`` and the corrupted text, divided by the number of words in
        ``text`` and rounded to two decimals. Empty input yields ``('', 0.0)``
        instead of raising ``ZeroDivisionError``.
    """
    characters = list(text)
    start_length = len(characters)
    max_errors = int(start_length * error_percentage)
    error_count = 0

    # Visit every index once in uniformly random order. This draws from the
    # same distribution as repeatedly sampling an index and rejecting the ones
    # already tried, without the wasted draws.
    candidate_indices = list(range(start_length))
    random.shuffle(candidate_indices)

    for idx in candidate_indices:
        if error_count >= max_errors:
            break

        current_char = characters[idx]

        # Only characters that have an entry in the lookup table can be replaced.
        if current_char not in lut.SimpleCharacterReplacements:
            continue

        # Leave alone a character that directly follows a virama, and "య" carrying "ి".
        follows_virama = idx > 0 and characters[idx - 1] == '్'
        is_ya_with_i = (
            current_char == "య"
            and idx < start_length - 1
            and characters[idx + 1] == "ి"
        )
        if follows_virama or is_ya_with_i:
            continue

        # The table value is presumably a sequence of replacement strings; pick one at random.
        characters[idx] = random.choice(lut.SimpleCharacterReplacements[current_char])
        error_count += 1

    # Items are replaced in place, so the list length never changes here.
    end_length = len(characters)

    # Guard the empty-text case: the previous code divided by zero here.
    error_rate = (error_count / start_length) * 100 if start_length else 0.0
    if verbose:
        print(f"Percentage of errors introduced: {round(error_rate, 2)}% with number of deletions = {end_length - start_length}")

    corrupted_text = ''.join(characters)
    original_words = text.split(" ")
    total_distance = sum(
        get_levenstein(actual, error)
        for actual, error in zip(original_words, corrupted_text.split(" "))
    )
    # str.split(" ") always returns at least one item, so this division is safe.
    AverageErrorPerWord = round(total_distance / len(original_words), 2)

    return corrupted_text, AverageErrorPerWord

In [39]:
def substitute_stacked_diacritics(text, error_percentage, verbose=True):
    """Randomly replace stacked (conjunct) consonants of ``text``.

    A character is eligible when it is a key of ``lut.StackingReplacements``
    and it is the trailing member of a ``consonant + virama (్) + consonant``
    sequence, i.e. the previous character is a virama and the one before that
    is a Telugu consonant. At most ``int(len(text) * error_percentage)``
    characters are replaced; positions are visited in random order, each at
    most once, so the achieved rate can be lower than requested.

    Args:
        text: Input string.
        error_percentage: Fraction of the character count to corrupt
            (e.g. ``0.15``).
        verbose: When true (the default, matching the previous behaviour),
            print a one-line summary of the achieved error rate.

    Returns:
        A tuple ``(corrupted_text, AverageErrorPerWord)``. The second item is
        the sum of Levenshtein distances between the space-separated words of
        ``text`` and the corrupted text, divided by the number of words in
        ``text`` and rounded to two decimals. Empty input yields ``('', 0.0)``
        instead of raising ``ZeroDivisionError``.
    """
    # Single-code-point consonants. The conjunct 'క్ష' that used to be listed
    # here is three code points long, so it could never match one character;
    # its parts 'క' and 'ష' are already present.
    telugu_consonants = frozenset([
        'క', 'ఖ', 'గ', 'ఘ', 'ఙ', 'చ', 'ఛ', 'జ', 'ఝ', 'ఞ',
        'ట', 'ఠ', 'డ', 'ఢ', 'ణ', 'త', 'థ', 'ద', 'ధ', 'న',
        'ప', 'ఫ', 'బ', 'భ', 'మ', 'య', 'ర', 'ల', 'వ',
        'శ', 'ష', 'స', 'హ', 'ళ', 'ఱ',
    ])

    characters = list(text)
    start_length = len(characters)
    max_errors = int(start_length * error_percentage)
    error_count = 0

    # Visit every index once in uniformly random order. This draws from the
    # same distribution as repeatedly sampling an index and rejecting the ones
    # already tried, without the wasted draws.
    candidate_indices = list(range(start_length))
    random.shuffle(candidate_indices)

    for idx in candidate_indices:
        if error_count >= max_errors:
            break

        current_char = characters[idx]

        # Only characters that have an entry in the lookup table can be replaced.
        if current_char not in lut.StackingReplacements:
            continue

        # Substitute only a consonant stacked under another one:
        # consonant (idx - 2) + virama (idx - 1) + current character (idx).
        # idx >= 2 (not > 2) so that a conjunct opening the text is eligible too.
        is_stacked = (
            idx >= 2
            and characters[idx - 1] == '్'
            and characters[idx - 2] in telugu_consonants
        )
        if not is_stacked:
            continue

        # The table value is presumably a sequence of replacement strings; pick one at random.
        characters[idx] = random.choice(lut.StackingReplacements[current_char])
        error_count += 1

    # Items are replaced in place, so the list length never changes here.
    end_length = len(characters)

    # Guard the empty-text case: the previous code divided by zero here.
    error_rate = (error_count / start_length) * 100 if start_length else 0.0
    if verbose:
        print(f"Percentage of errors introduced: {round(error_rate, 2)}% with number of deletions = {end_length - start_length}")

    corrupted_text = ''.join(characters)
    original_words = text.split(" ")
    total_distance = sum(
        get_levenstein(actual, error)
        for actual, error in zip(original_words, corrupted_text.split(" "))
    )
    # str.split(" ") always returns at least one item, so this division is safe.
    AverageErrorPerWord = round(total_distance / len(original_words), 2)

    return corrupted_text, AverageErrorPerWord

In [None]:
# Overwrite the column with its cleaned form (Telugu characters and single spaces only).
df["OriginalText"] = df["OriginalText"].map(clean_telugu_text)

In [34]:
# df[["15%_ErrorInducedText", "AverageErrorPerWord_15%"]] = df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.15))
# df[["25%_ErrorInducedText", "AverageErrorPerWord_25%"]] = df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.25))
# df[["35%_ErrorInducedText", "AverageErrorPerWord_35%"]] = df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.35))
# df[["50%_ErrorInducedText", "AverageErrorPerWord_50%"]] = df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.5))
# df.to_csv('./SamantarDatasetWithDirectConsonantSubstitutions.csv', index=False, encoding='utf-8-sig')

In [None]:
# df[["15%_ErrorInducedText", "AverageErrorPerWord_15%"]] = df["OriginalText"].apply(lambda x: substitute_stacked_diacritics(x, 0.15))
# df[["25%_ErrorInducedText", "AverageErrorPerWord_25%"]] = df["OriginalText"].apply(lambda x: substitute_stacked_diacritics(x, 0.25))
# df[["35%_ErrorInducedText", "AverageErrorPerWord_35%"]] = df["OriginalText"].apply(lambda x: substitute_stacked_diacritics(x, 0.35))
# df[["50%_ErrorInducedText", "AverageErrorPerWord_50%"]] = df["OriginalText"].apply(lambda x: substitute_stacked_diacritics(x, 0.5))
# df.to_csv('./SamantarDatasetWithDiacriticSubstitutions.csv', index=False, encoding='utf-8-sig')

In [None]:
# Model needs to learn from more context and accordingly correct the text
# substitute_stacked_diacritics("శంకరాచార్యులు అద్వైత సిద్ధాంతాన్ని ప్రవేశపెట్టారు", 0.15)

Percentage of errors introduced: 8.16% with number of deletions = 0


'శంకరాచార్వులు అద్యైత సిద్గాంతాన్వి ప్రవేశపెట్టారు'

### Dealing with Web Scraped Dataset

In [16]:
# Load the web-scraped CSV; the 'utf-8-sig' codec skips a leading byte-order mark if present.
scraped_df = pd.read_csv(filepath_or_buffer=webscrapedDatasetPath, encoding='utf-8-sig')

In [17]:
# Flag rows whose word count lies more than 1.5 * IQR below Q1 or above Q3.
word_counts = scraped_df['word_count']

Q1, Q3 = word_counts.quantile(0.25), word_counts.quantile(0.75)
IQR = Q3 - Q1

lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

is_outlier = word_counts.lt(lower_bound) | word_counts.gt(upper_bound)
outliers = scraped_df[is_outlier]

print("Maximum word count in the dataset:", word_counts.max())
print("Min word count in the dataset:", word_counts.min())

print("Number of outliers:", len(outliers))

Maximum word count in the dataset: 500
Min word count in the dataset: 13
Number of outliers: 0


In [24]:
# Every document is kept and cut into consecutive chunks of at most
# max_sentence_length words (so a 500-word document yields 5 chunks).
max_sentence_length = 100 # max number of words in a sentence

split_sentences_list = [
    ' '.join(words[start:start + max_sentence_length])
    for words in (text.split() for text in scraped_df['content'])
    for start in range(0, len(words), max_sentence_length)
]

# Discard blank chunks and chunks shorter than 20 words.
split_sentences_list = [
    sentence
    for sentence in split_sentences_list
    if sentence.strip() and len(sentence.split()) >= 20
]

In [25]:
# Word count of every chunk, computed once and reused for all statistics.
sentence_lengths = [len(sentence.split()) for sentence in split_sentences_list]

print("Number of sentences after splitting:", len(split_sentences_list))
print("Max sentence length after splitting:", max(sentence_lengths))
print("Min sentence length after splitting:", min(sentence_lengths))
print("Average sentence length after splitting:", sum(sentence_lengths) / len(split_sentences_list))

Number of sentences after splitting: 3240
Max sentence length after splitting: 100
Min sentence length after splitting: 20
Average sentence length after splitting: 89.22901234567901


In [42]:
# Clean every chunk first, then build the single-column frame from the cleaned strings.
cleaned_sentences = [clean_telugu_text(sentence) for sentence in split_sentences_list]
scraped_data_df = pd.DataFrame(cleaned_sentences, columns=["OriginalText"])

In [None]:
# scraped_data_df[["15%_ErrorInducedText", "AverageErrorPerWord_15%"]] = scraped_data_df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.15)).tolist()
# scraped_data_df[["25%_ErrorInducedText", "AverageErrorPerWord_25%"]] = scraped_data_df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.25)).tolist()
# scraped_data_df[["35%_ErrorInducedText", "AverageErrorPerWord_35%"]] = scraped_data_df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.35)).tolist()
# scraped_data_df[["50%_ErrorInducedText", "AverageErrorPerWord_50%"]] = scraped_data_df["OriginalText"].apply(lambda x: substitute_consonants(x, 0.5)).tolist()
# scraped_data_df.to_csv('./WebScrapedDatasetWithDirectConsonantSubstitutions.csv', index=False, encoding='utf-8-sig')

In [44]:
# Corrupt the cleaned text at four target rates. Each pass adds two columns:
# the corrupted text and its average per-word edit distance.
error_levels = (("15%", 0.15), ("25%", 0.25), ("35%", 0.35), ("50%", 0.5))
for label, rate in error_levels:
    corrupted = scraped_data_df["OriginalText"].apply(
        lambda x, rate=rate: substitute_stacked_diacritics(x, rate)
    )
    # .tolist() turns the Series of (text, score) tuples into rows that unpack into the two columns.
    scraped_data_df[[f"{label}_ErrorInducedText", f"AverageErrorPerWord_{label}"]] = corrupted.tolist()

scraped_data_df.to_csv('./WebScrapedDatasetWithDiacriticSubstitutions.csv', index=False, encoding='utf-8-sig')

Percentage of errors introduced: 2.77% with number of deletions = 0
Percentage of errors introduced: 4.52% with number of deletions = 0
Percentage of errors introduced: 3.82% with number of deletions = 0
Percentage of errors introduced: 5.21% with number of deletions = 0
Percentage of errors introduced: 3.2% with number of deletions = 0
Percentage of errors introduced: 2.05% with number of deletions = 0
Percentage of errors introduced: 5.19% with number of deletions = 0
Percentage of errors introduced: 2.8% with number of deletions = 0
Percentage of errors introduced: 4.41% with number of deletions = 0
Percentage of errors introduced: 2.96% with number of deletions = 0
Percentage of errors introduced: 2.82% with number of deletions = 0
Percentage of errors introduced: 6.29% with number of deletions = 0
Percentage of errors introduced: 2.96% with number of deletions = 0
Percentage of errors introduced: 2.82% with number of deletions = 0
Percentage of errors introduced: 6.29% with number