In [1]:
stopwords = [], []
import re

def WordTokenizer(data, corpus='poetry', keep_stopwords = True):
        data = re.sub(r'([.,:;\'\\"!?%#@*<>|\+\-\(\)])', r' \1 ', data)
        # data = re.sub(r'')
        data = re.sub(r"”“", r'\"', data)
        data = re.sub(r'…', " ", data)
        data = re.split(r'[ -]',data)
        words = []

        if not keep_stopwords:
            for word in data:
                if word not in stopwords:
                    words.append(word)
            return words

        for i in data:
            if i:
                words.append(i)
        return words


def SentenceTokenizer(data):
    data = data.strip()
    data = re.sub(r'([.!?])', r'\1 ', data)
    data = re.split(r'  ',data)
    if not data[-1]:
    	del(data[-1])
    return data

In [2]:
import re

class Preprocessor():
    def __init__(self):
        self.suffixes = []
        pass

    def compulsory_preprocessing(self, text):
        '''This is a function to preprocess the text and make the necessary changes which are compulsory for any type of Gujarati NLP task'''
        text = re.sub(r'\u200b', '', text)
        text = re.sub(r'\ufeff', "", text)
        text = re.sub(r'…', " ", text)
        text = re.sub(r'  ', ' ', text)
        text = re.sub(r'”“', '', text)
        text = WordTokenizer(text)
        for i in range(len(text)):
            text[i] = text[i].rstrip(':')
        return ' '.join(text)

    def remove_tek(self, text, tek_string):
        '''
        Tek is the Gujarati word for the initial line of the poem. Whenever, one stanza of any poem is sung, the initial line of the poem is sung once again before starting the
        next stanza. This is called as singing a "Tek". Written poems mention the tek string too many a times. This will cause a problem of redundancy. Hence, removing it is
        necessary.
        '''
        if str(type(tek_string))=="<class 'NoneType'>" or not tek_string:
            raise TypeError('tek_string needs to be a valid string')
        if str(type(text))=="<class 'list'>":
            for i in range(len(text)):
                text[i] = text[i].rstrip(tek_string)
        elif str(type(text))=="<class 'str'>":
            text = text.rstrip(tek_string)
        else:
            raise TypeError("Argument 'text' must be either a str or list")
        return text

    def poetic_preprocessing(self, text, remove_tek=False, tek_string=None):
        '''This function is only required when dealing with poetic corpora. Make sure to use this function along with the compulsory preprocessing to have decently accurate results with poetic corpora'''
        text = re.sub(r'।','.',text)
        text = re.sub(' ।।[૧૨૩૪૫૬૭૮૯૦]।।', '.', text)
        if remove_tek:
            text = self.remove_tek(text, tek_string)
        tokens = WordTokenizer(text, corpus='poetry', keep_punctuations=False)

        for i in range(len(tokens)):
            # Rule 1
            if tokens[i].endswith('જી'):
                tokens[i] = tokens[i].strip('જી')
            # Rule 2
            if tokens[i].endswith('ૈ'):
                tokens[i] = tokens[i].strip('ૈ')+'ે'
            # Rule 3
            index = tokens[i].find('ર')
            if index == -1:
                pass
            elif index<len(tokens[i])-1 and tokens[i][index-1]=='િ':
                tokens[i] = re.sub('િર', 'ૃ', tokens[i])

        return ' '.join(tokens)

In [9]:
import re

# You may need to add/remove suffixes/prefixes according to the corpora
suffixes = ['નાં','ના','ની','નો','નું','ને','થી','માં','એ','ીએ','ઓ','ે','તા','તી','વા','મા','વું','વુ','ો','માંથી','શો','ીશ','ીશું','શે',
			'તો','તું','તાં','્યો','યો','યાં','્યું','યું','ોઈશ', 'ોઈશું', '્યા','યા','્યાં','સ્વી','રે','ં','મ્','મ્','ી','કો',
      'ેલ', 'ેલો', 'ેલા', 'ેલું', 'ેલી', 'ણે', 'ણા', 'ણું', 'ણો', 'ણી'
      ]
prefixes = [] #['અ']
class Stemmer():
	def __init__(self):
		self.suffixes = suffixes
		self.prefixes = prefixes

	def add_suffix(self, suffix):
		self.suffixes.append(suffix)

	def add_prefix(self, prefix):
		self.prefixes.append(prefix)

	def delete_suffix(self, suffix):
		try:
			del(self.suffixes[self.suffixes.index(suffix)])
		except IndexError:
			print('{} not present in suffixes'.format(suffix))

	def delete_prefix(self, prefix):
		try:
			del(self.prefixes[self.prefixes.index(prefix)])
		except IndexError:
			print("{} not present in prefixes".format(prefix))


	def stem_word(self, sentence, corpus):
		word_list = sentence.strip('\u200b').split(' ')
		if not word_list[-1]:
			del(word_list[-1])
		return_list = []
		suffix_list = []
		puctuations = ('.',',','!','?','"',"'",'%','#','@','&','…','“', '”', '’', '‘', ':', ';')
		for word in word_list:
			a = word
			removed_suffix = None
			if word.endswith(puctuations):
				a = word[:-1]

			for suffix in suffixes:
				if a.endswith(suffix):
					a = a[:-len(suffix)]
					removed_suffix = suffix
					break
			for prefix in prefixes:
				if a.startswith(prefix):
					a = a[len(prefix):]
					break
			if word.endswith(puctuations):
				a += str(word[-1])
			return_list.append(a)
			suffix_list.append(removed_suffix)
		return_sentence = " ".join(return_list)
		return {
			"stemmed_sentence": return_sentence,
			"removed_suffixes": suffix_list
		}

	def stem(self, text, corpus='prose', remove_tek=False, tek_string=None):
		preprocessor = Preprocessor()
		text = preprocessor.compulsory_preprocessing(text)
		if corpus == 'poetry':
			text = preprocessor.poetic_preprocessing(text, remove_tek=remove_tek, tek_string=tek_string)
		elif corpus == 'prose':
			pass
		else:
			raise ValueError("Unnrecognized argument 'corpus'. Should be either 'prose' or 'poetry'")
		l = SentenceTokenizer(text)
		if len(l)==1:
			sentence = l[0]
			return self.stem_word(sentence, corpus=corpus)
		else:
			a = []
			for sentence in l:
				a.append(self.stem(sentence))
			return a

In [10]:
stmr = Stemmer()
stmr.stem("એમાંથી બોધ તારવવાની મગજમારી કરવાની મારે કશી જરૂર નથી,")

{'stemmed_sentence': 'એમાં બોધ તારવવા મગજમાર કરવા માર કશ જરૂર ન ,',
 'removed_suffixes': ['થી', None, 'ની', 'ી', 'ની', 'ે', 'ી', None, 'થી', None]}

In [11]:
import pandas as pd

with open('/content/drive/MyDrive/Gujarati_Spelling_and_Grammar_Autocorrect/data/15k_sampled_sentences.txt', 'r', encoding='utf-8') as f:
    lines = [line.strip() for line in f if line.strip()]

df = pd.DataFrame({'sentence': lines})
df['n_err'] = 0
df['err_sentence'] = df['sentence']
df['err_types'] = ""

def load_word_list(path):
    with open(path, encoding="utf-8") as word_list:
        return set([word.strip() for word in word_list])

vocab = load_word_list("/content/drive/MyDrive/Gujarati_Spelling_and_Grammar_Autocorrect/hunspell-gu.dic")

In [20]:
suffix_classes = {
    "class1": ['નાં','ના','ની','નો','નું','ને','નુ'],
    "class2": ['માં','એ','ઓ','માંથી'],
    "class3": ['વા','વું','વુ'],
    "class4": ['શો','ીશ','ીશું','શે', 'ીશુ'],
    "class5": ['તો','તું','તાં','તા','તી'],
    "class6": ['્્યો','યો','યાં','્યું','યું','્યા','યા','્યાં'],
    "class7": ['સ્વી','રે','મ્'],
    "class8": ['ણે', 'ણા', 'ણું', 'ણો', 'ણી'],
    # "class8": ['ી','ો','ે','ું'],
    "class9": ['ેલ', 'ેલો', 'ેલા', 'ેલું', 'ેલી''ેલુ'],
 }

aux_suffix_classes = {
    "class9" : ['તો','તું','તાં','તા','તી','શો', 'ોઈશ', 'ોઈશું','શે'], #based on tense (only for aux હ)
    "class10" : ['ે', 'ું', 'ીએ', 'ો']  #based on tense (only for aux છ)
}

aux_stemmed =  ['હ', 'છ']

punctuations = ['.', ',', '!', '?', '"', "'", ':', ';']
matra_classes = {
    "class1": ["ુ", "ૂ"],
    "class2": ["િ", "ી"],
    "class3": ["ે", "ૈ"],
    "class7":["ઉ", "ઊ"],
    "class4": ["ં"],
    "class5": ["ો", "ૌ"],
    "class6": ["ૃ", "ૄ"],
    "class7": ["ં"],
    "class9": ["ં"]
}


def is_anusvara_related_change(s1, s2):
    return ('ં' in s1) != ('ં' in s2) and s1.count('ં') != s1.count('ં')

def gen_punct_err(row):
    row = row.copy()
    tokens = WordTokenizer(row['err_sentence'])
    punct_indices = [i for i, tok in enumerate(tokens) if tok in punctuations]

    if not punct_indices:
        return False, row

    idx = random.choice(punct_indices)
    current = tokens[idx]
    choices = [p for p in punctuations if p != current and p!='!']
    choices.append("")
    tokens[idx] = random.choice(choices)
    row['sentence'] = ' '.join(tokens)
    row['err_types'] = row['err_types']+"ORTH:PUNCT "
    # print(row["sentence"])
    return True, row


def gen_morph_err(row):
    row = row.copy()
    tokens = WordTokenizer(row['err_sentence'])
    if not tokens:
        return False, row

    idx = random.randint(0, len(tokens) - 1)
    token = tokens[idx]
    if token in ('અને', 'તને'):
      return False, row
    # print("b:", token)
    stemmer = Stemmer()
    stem_result = stemmer.stem_word(token, corpus='prose')
    stemmed = stem_result["stemmed_sentence"]
    if stem_result["removed_suffixes"]:
      removed_suffix = stem_result["removed_suffixes"][0]
    else:
      return False, row

    if removed_suffix is None or stemmed == token or stemmed == "":
        return False, row

    if stemmed == aux_stemmed[0] or stemmed == aux_stemmed[1]:

        x = random.randint(0, 1)
        if x:
          idy = random.randint(0, len(aux_suffix_classes['class9'])-1)
          if removed_suffix == aux_suffix_classes['class9'][idy]:
              idy = (idy + 1) % len(aux_suffix_classes['class9'])

          tokens[idx] = aux_stemmed[0] + aux_suffix_classes['class9'][idy]
          # print("a:", tokens[idx])
          if tokens[idx] in vocab:
            row['sentence'] = ' '.join(tokens)
            row['err_types'] = row['err_types']+"MORPH "
            return True, row
          return False, row

        else:
          idy = random.randint(0, len(aux_suffix_classes['class10'])-1)
          if removed_suffix == aux_suffix_classes['class10'][idy]:
              idy = (idy + 1) % len(aux_suffix_classes['class10'])

          tokens[idx] = aux_stemmed[1] + aux_suffix_classes['class10'][idy]
          # print("a:", tokens[idx])
          if tokens[idx] in vocab:
            row['sentence'] = ' '.join(tokens)
            row['err_types'] = row['err_types']+"MORPH "
            return True, row
          return False, row

    class_keys = list(suffix_classes.keys())
    random.shuffle(class_keys)
    for cls in class_keys:
        if removed_suffix in suffix_classes[cls]:
            idy = random.randint(0, len(suffix_classes[cls]) - 1)

            if removed_suffix == suffix_classes[cls][idy]:
                idy = (idy + 1) % len(suffix_classes[cls])

            tokens[idx] = stemmed + suffix_classes[cls][idy]
            # print("a:", tokens[idx])
            row['sentence'] = ' '.join(tokens)

            if is_anusvara_related_change(tokens[idx], token):
              row['err_types'] = row['err_types']+"SPELL:ANUSVARA "
            else:
              row['err_types'] = row['err_types']+"MORPH "
            # print(row["sentence"])
            if tokens[idx] in vocab:
              return True, row
    return False, row

def gen_synt_err(row):
    row = row.copy()
    tokens = WordTokenizer(row['err_sentence'])
    if len(tokens) < 2:
        return False, row

    indices = [i for i in range(len(tokens) - 1) if tokens[i] not in punctuations and tokens[i + 1] not in punctuations]

    if not indices:
        return False, row

    idx = random.choice(indices)
    if tokens[idx] != tokens[idx + 1]:
      tokens[idx], tokens[idx + 1] = tokens[idx + 1], tokens[idx]
    else:
      return False, row
    row['sentence'] = ' '.join(tokens)
    # print(row["sentence"])
    row['err_types'] = row['err_types']+"SYNT:WO "
    return True, row

def gen_spell_err(row):
    row = row.copy()
    tokens = WordTokenizer(row['err_sentence'])
    if not tokens:
        return False, row

    idx = random.randint(0, len(tokens) - 1)
    token = tokens[idx]
    class_keys = list(matra_classes.keys())
    random.shuffle(class_keys)

    for cls in class_keys:
        for matra in matra_classes[cls]:

            if matra not in token:
                continue
            choices = [m for m in matra_classes[cls] if m != matra]
            # print(matra, choices)
            if not choices and cls not in ["class4", "class9", "class7"]:
                return False, row

            replacement = None
            if cls in ["class4", "class9","class7"]:
                choices.append("")
                replacement = random.choice(choices)
                row['err_types'] += "SPELL:ANUSVARA "
            else:
                replacement = random.choice(choices)
                row['err_types'] += "SPELL:MATRA "

            tokens[idx] = token.replace(matra, replacement, 1)
            # print(tokens[idx], replacement, matra)
            row['sentence'] = ' '.join(tokens)
            # print(row['sentence'])
            return True, row

    return False, row

In [21]:
import random
import numpy as np

def generate_errs(df, prob_list=[0, 1, 0, 0], max_err_per_sentence=3):
    funcs = [
        gen_punct_err,
        gen_morph_err,
        gen_synt_err,
        gen_spell_err
    ]

    # if int(sum(prob_list)) != 1:
    #     raise ValueError("Probabilities must sum to 1")

    n_samples = len(df)
    weights = np.array(prob_list)
    weights = weights / weights.sum()
    chosen_func = random.choices(funcs, weights=weights, k=1)[0]


    # valid_rows = df[df['n_err'] < max_err_per_sentence]
    # if valid_rows.empty:
    #   print("Max errors reached for all the sentences.")
    #   return False, None
    # row = valid_rows.sample(1).iloc[0]
    # print(f"Selected row with current error count: {row['n_err']}")


    result = False, None

    i = 0
    while not result[0]:
        i += 1
        if n_samples == i:
            print("unable to generate error")
            break

        valid_rows = df[df['n_err'] == 0]
        # valid_rows = df[df['n_err'] < max_err_per_sentence]
        if valid_rows.empty:
          print("Max errors reached for all the sentences.")
          return False, None
        row = valid_rows.sample(1).iloc[0]
        result = chosen_func(row)
        idx = row.name


    if result[0]:
        df.at[idx, 'n_err'] += 1
        df.at[idx, 'err_sentence'] = result[1]['sentence']
        df.at[idx, 'err_types'] = result[1]['err_types']


    return result

In [22]:
for _ in range(5000):
  x = generate_errs(df, prob_list= [0.1,0.65,0.10,0.15], max_err_per_sentence= 3)#[0.05,0.45,0.20,0.30]

In [23]:
print(df.describe())

              n_err
count  15200.000000
mean       1.315789
std        0.729309
min        0.000000
25%        1.000000
50%        1.000000
75%        2.000000
max        3.000000


In [24]:
df0 = df.query("n_err>0")
len(df0)

14308

In [6]:
# df.to_csv('/content/drive/MyDrive/15k_err_sentences.csv')
# df = pd.read_csv('/content/drive/MyDrive/15k_err_sentences.csv')

In [28]:
data = []
i = 0
for index, row in df.iterrows():
    i += 1
    tokenized_reference = WordTokenizer(row['err_sentence'])
    tokenized_prediction = WordTokenizer(row['sentence'])
    row['reference'] = ' '.join(tokenized_reference)
    row['prediction'] = ' '.join(tokenized_prediction)
    data.append({
        "index": i,
        "input": row['reference'],
        "reference": row['prediction']
    })

# Specify the output JSON file path
json_file_path = '/content/drive/MyDrive/15k_err_sentences.json'

# Write the data to a JSON file
with open(json_file_path, 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=2)

print(f"JSON file created at: {json_file_path}")


JSON file created at: /content/drive/MyDrive/15k_err_sentences.json


In [None]:
import re

input_filepath = '/content/drive/MyDrive/Gujarati_Spelling_and_Grammar_Autocorrect/data/cleaned_eBooks_sentences_1-13.txt'
output_filepath = '/content/10k_err_sentences.txt'

gujarati_pattern = re.compile(r'^[\u0A80-\u0AFF\s.,!?\'"\-:;()]+$')

with open(input_filepath, 'r', encoding='utf-8') as infile, \
     open(output_filepath, 'w', encoding='utf-8') as outfile:
    for line in infile:
        sentence = line.strip()
        if sentence:
            words = sentence.split()
            # Apply length filter
            if 7 < len(words) <= 40:
                # Check if sentence contains mostly Gujarati characters
                if gujarati_pattern.match(sentence):
                    outfile.write(sentence + "\n")

print(f"Filtered sentences saved to {output_filepath}")

Filtered sentences saved to /content/10k_err_sentences.txt


In [None]:
import random

input_filepath = '/content/10k_err_sentences.txt'
output_filepath = '/content/drive/MyDrive/Gujarati_Spelling_and_Grammar_Autocorrect/data/15k_sampled_sentences.txt'

# Set seed for reproducibility (optional)
random.seed(42)

# Read all sentences
with open(input_filepath, 'r', encoding='utf-8') as infile:
    sentences = infile.readlines()

# Make sure you don't request more samples than available
sample_size = min(15200, len(sentences))

# Randomly sample sentences
sampled_sentences = random.sample(sentences, sample_size)

# Write sampled sentences to new file
with open(output_filepath, 'w', encoding='utf-8') as outfile:
    outfile.writelines(sampled_sentences)

print(f"Randomly sampled {sample_size} sentences and saved to {output_filepath}")

Randomly sampled 15200 sentences and saved to /content/drive/MyDrive/Gujarati_Spelling_and_Grammar_Autocorrect/data/15k_sampled_sentences.txt
