In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_file_paths = (
    os.path.join(root_dir, file_name)
    for root_dir, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Reset conda's channel configuration to resolve package-installation conflicts:
# run `conda clean --all`, remove the existing `channels` key, then append
# conda-forge, bioconda and defaults (in that order).
!conda clean --all -y
!conda config --remove-key channels
!conda config --append channels conda-forge --append channels bioconda --append channels defaults

In [None]:
# stable version of transformers has a bug in saving and loading peft models
# so, in this project, we needed to uninstall stable version
# and install a custom fixed version
!echo "y" | pip uninstall transformers

In [None]:
# for some package conflicts, needed to install the latest version from github
!echo "y" | pip uninstall accelerate

In [None]:
# Install the project's dependencies: accelerate from its GitHub repository, transformers
# from a fork branch carrying the PEFT resume-checkpoint fix, plus the PEFT, tokenization,
# dataset and evaluation-metric packages.
# NOTE(review): nothing here is version-pinned, so a fresh run may resolve different versions.
!pip install git+https://github.com/huggingface/accelerate # latest version of accelerate
!pip install -q bitsandbytes datasets loralib
!pip install -q git+https://github.com/llohann-speranca/transformers@fix-resume-checkpoint-for-peftmodel # a custom fixed version of transformers
!pip install peft
!pip install sentencepiece
!pip install datasets tqdm
!pip install evaluate
!pip install jiwer
!pip install rouge_score
!pip install rouge
# Note: the Hugging Face `rouge_score` metric currently does not work correctly for Persian
# text, so the pip `rouge` library is used instead; it works correctly with Persian.

In [None]:
from transformers import EncoderDecoderModel, AutoTokenizer, DataCollatorForSeq2Seq
from peft import get_peft_model, LoraConfig, TaskType
import torch
from torch.utils.data import Dataset, DataLoader

In [None]:
# a part of PerSpellData on Google Drive
# here we are using a trick to download dataset in /kaggle/working directory and further tokenizing them easier
!pip install gdown
!gdown --id 1bNNUa3jizNA_tgpdXko4FuIl6JBZDyYG
!gdown --id 1gwdYyNwJBuY0yxTkzhFIazxf_qYdJol5
!gdown --id 1WoxPp-0gD2-lnndofUvZQ3XVS7ANUEcf
!gdown --id 1Cd0QWA1z-dj79ceWfnmcd2JNBYg3ybMm
!gdown --id 1ciOS1ONmOCZ0HkFB_eZgXhoilBQZ8mrz
!gdown --id 1qU87Y7o7r-ja87-T2Z8Nyv2pJqcHevGF
!gdown --id 1gH3RvmJ2KZxGVLdkm3JJ6PQ6wWnQKMN8
!gdown --id 1OSJn8RkqlhzP3dhQz_nzANYNkI8RcPBt

In [None]:
# Lists of all sentence files to load and tokenize: the real-word sets shipped under
# /kaggle/input followed by the files downloaded into /kaggle/working.
# The two lists are index-aligned: entry i of one is the counterpart of entry i of the other.
_real_word_root = '/kaggle/input/perspelldata2/PerSpellData-main/DataSet/real-word'
_working_root = '/kaggle/working'


def _perspell_paths(prefix):
    """Return the ordered sentence-file paths for ``prefix`` ('wrong' or 'correct')."""
    paths = [
        f'{_real_word_root}/{name}/{prefix}_{name}.txt'
        for name in ('be', 'common', 'gozar', 'hich', 'plural', 'tanvin')
    ]
    # the virastman real-word file carries an extra `_real` suffix
    paths.append(f'{_real_word_root}/virastman/{prefix}_virastman_real.txt')
    paths.extend(
        f'{_working_root}/{prefix}_{name}.txt'
        for name in ('virastman', 'faspell', 'close_words', 'synthetic')
    )
    return paths


wrong_sents_path = _perspell_paths('wrong')
correct_sents_path = _perspell_paths('correct')

In [None]:
# this cell just creates a dataframe of 10 samples to check further if model can overfit on these samples

# res = pd.DataFrame()
# res = pd.concat([res, pd.DataFrame({"wrong": ["سلام حوبی؟"], "correct": ["سلام خوبی؟"]}, index=[0])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["او این کار را نی کند."], "correct": ["او این کار را نمی کند."]}, index=[1])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["من به آنها اعلام کردن"], "correct": ["من به آنها اعلام کردم."]}, index=[2])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["چگون توانستید این کار را انجام بدهید؟"], "correct": ["چگونه توانستید این کار را انجام بدهید؟"]}, index=[3])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["به آنها تگویید تشریف بیاورند."], "correct": ["به آنها بگویید تشریف بیاورند."]}, index=[4])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["این یگی از بهترین آثار هنری است."], "correct": ["این یکی از بهترین آثار هنری است."]}, index=[5])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["نمی ذانم چرا مدل کار نمی کند"], "correct": ["نمی دانم چرا مدل کار نمی کند؟"]}, index=[6])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["به آنه دزباره این مشکل بگویید"], "correct": ["به آنها درباره این مشکل بگویید."]}, index=[7])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["این مسیله بسیار دقت برانگیز است"], "correct": ["این مسئله بسیار رقت برانگیز است."]}, index=[8])])
# res = pd.concat([res, pd.DataFrame({"wrong": ["کوشی موبایل خود را تحویل بذهید"], "correct": ["گوشی موبایل خود را تحویل بدهید."]}, index=[9])])
# res

In [None]:
# Load every wrong/correct file pair and hold all of them in one dataframe (`res`).
# Each pair is placed side by side (wrong first, correct second); the per-pair frames are
# collected in a list and concatenated once, instead of re-copying a growing dataframe on
# every iteration.
# NOTE(review): the files are parsed as CSV with the default ',' separator and no header;
# a sentence containing ',' or '"' would be split or merged — confirm the files are free of them.
if len(wrong_sents_path) != len(correct_sents_path):
    raise ValueError("wrong_sents_path and correct_sents_path must have the same length")

pair_frames = []
for wrong_path, correct_path in zip(wrong_sents_path, correct_sents_path):
    wrong = pd.read_csv(wrong_path, header=None)
    correct = pd.read_csv(correct_path, header=None)
    pair_frames.append(pd.concat([wrong, correct], axis=1, ignore_index=True))

res = pd.concat(pair_frames, axis=0, ignore_index=True)
del pair_frames  # the per-file frames are no longer needed once stacked

res.columns = ['wrong', 'correct']

In [None]:
# defining a function to clear CPU and GPU ram and cache
import gc
import torch

def clear_caches():
    """Run a garbage-collection pass, print its result, and empty PyTorch's CUDA cache.

    The printed number is the value returned by ``gc.collect()``.
    """
    print("freed ram:", gc.collect())
    torch.cuda.empty_cache()

In [None]:
# CPU and GPU RAM are limited, so throughout this project objects are deleted as soon as
# they are no longer needed; here the last per-file frames left over from loading are dropped.
del wrong, correct
clear_caches()

In [None]:
# Continue loading: read the dehkhoda corpus, discard its first column, keep only the
# 'wrong' and 'correct' columns (in that order) and append the rows to `res`.
dehkhoda_df = (
    pd.read_csv('/kaggle/input/perspelldata2/PerSpellData-main/dehkhoda/dehkhoda_corpus.csv')
    .pipe(lambda frame: frame.drop(columns=frame.columns[0]))
    .loc[:, ['wrong', 'correct']]
)
res = pd.concat([res, dehkhoda_df], ignore_index=True)

In [None]:
# dehkhoda_df's rows have been appended to `res`; drop the reference and reclaim memory
del dehkhoda_df
clear_caches()

In [None]:
#currently ignoring real_word confusion matrix as input data

# conf_matrix = pd.read_csv('/kaggle/input/perspelldata2/PerSpellData-main/confusion_matrix/real-word/final_confusion_real.csv')
# conf_matrix.drop(columns=conf_matrix.columns[2:], inplace=True)
# conf_matrix = conf_matrix[['word2', 'word1']]
# conf_matrix.columns = ['wrong', 'correct']
# res = pd.concat([res, conf_matrix], axis=0, ignore_index=True)
# res

In [None]:
# Read the non-word confusion pairs, label the columns 'wrong' and 'correct' (in file
# order) and append the rows to `res`.
# NOTE(review): the file is named correct_error.csv — verify that its first column really is
# the misspelled form, otherwise the two labels are swapped.
non_word_conf = pd.read_csv(
    '/kaggle/input/perspelldata2/PerSpellData-main/confusion_matrix/non-word/correct_error.csv'
).set_axis(['wrong', 'correct'], axis=1)
res = pd.concat([res, non_word_conf], ignore_index=True)

In [None]:
# non_word_conf's rows have been appended to `res`; drop the reference and reclaim memory
del non_word_conf
clear_caches()

In [None]:
# do not truncate long cell contents when displaying dataframes (sentences can be long)
pd.set_option('display.max_colwidth', None)

In [None]:
# we set seeds to make results reproducible
import random
import torch

def set_seed(seed):
    """Seed Python's `random`, NumPy's global RNG and PyTorch with the same ``seed``."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


set_seed(42)

In [None]:
# this is a cell that we used for loading indices of empty strings
# in train and eval dataset to use them for preprocessing the data (deleting the
# records that have empty strings as inputs or targets)
import pickle
def _load_pickle(path):
    """Return the object unpickled from the file at ``path``."""
    # pickle can execute arbitrary code while loading; only use files you produced yourself
    with open(path, "rb") as handle:
        return pickle.load(handle)


train_empty_indices = _load_pickle('/kaggle/input/empty-indices/train_empty_indices.pkl')
dev_empty_indices = _load_pickle('/kaggle/input/empty-indices/dev_empty_indices.pkl')

In [None]:
# split the combined dataset into train (80%) and dev (20%) parts, shuffling the rows
# NOTE(review): no random_state is passed, so reproducibility presumably relies on the global
# NumPy seed set earlier; the precomputed index lists are positional and only stay valid if
# exactly the same split is reproduced — confirm
from sklearn.model_selection import train_test_split
train_df, dev_df = train_test_split(res, test_size=0.2, shuffle=True)
# # dev_df, test_df = train_test_split(dev_and_test, test_size=0.33, shuffle=True)

In [None]:
# Renumber both splits from 0 so that the positional index lists address the intended rows
# when records are deleted.
train_df, dev_df = (frame.reset_index(drop=True) for frame in (train_df, dev_df))

In [None]:
from evaluate import load
# Word Error Rate metric loaded through the `evaluate` library
wer = load('wer')

In [None]:
# The Word Error Rate metric is used to find records whose input and target differ a lot
# (wer >= 0.3); such records are good candidates for deletion to obtain a cleaner dataset.
def _flag_high_wer_rows(df, flagged_indices, metric, threshold=0.3):
    """Append to ``flagged_indices`` the index of each row of ``df`` whose WER is too high.

    Args:
        df: dataframe with 'wrong' and 'correct' columns.
        flagged_indices: list of row indices already marked for deletion; extended in place.
        metric: object whose ``compute(predictions=..., references=...)`` returns the score.
        threshold: rows scoring ``>= threshold`` are flagged.

    Returns:
        The same ``flagged_indices`` list, for convenience.

    Rows that are already flagged are skipped before scoring: they would never be appended
    again anyway, and this avoids scoring records already known to be bad.
    """
    # set mirror of the list so each membership test is O(1) instead of a list scan
    already_flagged = set(flagged_indices)
    for idx, wrong_text, correct_text in zip(df.index, df['wrong'], df['correct']):
        if idx in already_flagged:
            continue
        # the 'wrong' sentence is passed as the reference, exactly as before
        score = metric.compute(predictions=[correct_text], references=[wrong_text])
        if score >= threshold:
            flagged_indices.append(idx)
            already_flagged.add(idx)
    return flagged_indices


_flag_high_wer_rows(train_df, train_empty_indices, wer)
_flag_high_wer_rows(dev_df, dev_empty_indices, wer)

In [None]:
# Persist the indices of the records to delete so the whole dataset does not have to be
# scanned again to find them.
for indices_path, indices_to_save in (
    ('/kaggle/working/train_must_clean_indices', train_empty_indices),
    ('/kaggle/working/dev_must_clean_indices', dev_empty_indices),
):
    with open(indices_path, "wb") as file:
        pickle.dump(indices_to_save, file)

In [None]:
# remove the flagged records (addressed by index label) from both splits
train_df = train_df.drop(index=train_empty_indices)
dev_df = dev_df.drop(index=dev_empty_indices)

In [None]:
from datasets import Dataset
# wrap the cleaned dataframes as `datasets.Dataset` objects
# NOTE(review): rows were dropped from these frames, so their index is presumably carried over
# as an `__index_level_0__` column (the tokenization cell removes a column of that name) — confirm
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)
# # test_df = Dataset.from_pandas(test_df)

In [None]:
# the dataframes have been converted to datasets; release them and reclaim memory
del res, train_df, dev_df
clear_caches()

In [None]:
# maximum length (in tokens) used for the tokenizer and for the model output;
# kept at 256 because of limited RAM
max_length = 256

In [None]:
# load the tokenizer and build an encoder-decoder model with Hugging Face transformers;
# the encoder and the decoder are both initialised from the same pretrained checkpoint
# note: to switch to mT5, use AutoModelForSeq2SeqLM instead of EncoderDecoderModel
model_name = "HooshvareLab/bert-fa-zwnj-base" #"sepidmnorozy/parsbert-finetuned-pos"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(model_name, model_name)

In [None]:
# the code that we use for applying lora on top of our models

# from peft import LoraConfig, get_peft_model
# peft_config = LoraConfig(peft_type="LORA", task_type="SEQ_2_SEQ_LM",\
#                          r=8, lora_alpha=16, target_modules=["q", "v"], lora_dropout=0.01)
# model = get_peft_model(model, peft_config)
# model.print_trainable_parameters()

In [None]:
# use the shared `max_length` setting (256) rather than repeating the literal, so the model's
# length limit cannot drift from the length used for tokenization and padding
model.config.max_length = max_length

In [None]:
# this cell is necessary only for training ParsBERT (it is a config for EncoderDecoder models):
# the decoder start token is the tokenizer's cls token, the pad token id is taken from the
# tokenizer, and the top-level vocab size is copied from the decoder's config
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

In [None]:
# data collator configured to pad to the fixed `max_length` and to return PyTorch tensors
torch_data_collator = DataCollatorForSeq2Seq(tokenizer, padding='max_length',max_length=max_length, return_tensors='pt')

In [None]:
# This cell tokenizes the 'wrong' (input) and 'correct' (target) parts of each split
# separately, removes the columns that are not needed, and saves the results to disk so the
# data does not have to be tokenized again every time.
tokenizing_batch_size = 32


def _tokenize_and_save(dataset, text_column, output_path, as_labels=False):
    """Tokenize one text column of ``dataset`` and save the encoded dataset to ``output_path``.

    Args:
        dataset: dataset holding the 'wrong' and 'correct' text columns.
        text_column: name of the column to tokenize ('wrong' or 'correct').
        output_path: directory passed to ``save_to_disk``.
        as_labels: when True the result is a target set: 'attention_mask' is removed as well
            and 'input_ids' is renamed to 'labels'. Otherwise 'input_ids' and
            'attention_mask' are kept.

    Every sequence is padded/truncated to ``max_length`` tokens.
    NOTE(review): targets are padded with the tokenizer's pad id, not -100, so padding
    positions presumably contribute to the training loss — confirm this is intended.
    """
    def _tokenize(examples):
        return tokenizer(examples[text_column], padding='max_length', max_length=max_length,
                         return_attention_mask=True, truncation=True)

    encoded = dataset.map(_tokenize, batched=True, batch_size=tokenizing_batch_size, num_proc=1)
    encoded = encoded.with_format("torch")

    unwanted = ['wrong', 'correct', '__index_level_0__', 'token_type_ids']
    if as_labels:
        unwanted.append('attention_mask')
    # Only remove columns that are actually present: e.g. '__index_level_0__' is missing when
    # the source dataframe had a default index, and removing an absent column raises.
    encoded = encoded.remove_columns([name for name in unwanted if name in encoded.column_names])

    if as_labels:
        encoded = encoded.rename_columns({'input_ids': 'labels'})
    encoded.save_to_disk(output_path)


# Same order as before: train input, train target, dev input, dev target. The encoded dataset
# is released when the helper returns, and caches are cleared after each step to limit RAM use.
for split_name, split_dataset in (('train', train_dataset), ('dev', dev_dataset)):
    _tokenize_and_save(split_dataset, 'wrong',
                       f'/kaggle/working/encoded_{split_name}_dataset_input.hf')
    clear_caches()
    _tokenize_and_save(split_dataset, 'correct',
                       f'/kaggle/working/encoded_{split_name}_dataset_target.hf', as_labels=True)
    clear_caches()

In [None]:
# the codes from this cell to end of this file, is for loading saved tokenized data on disk when
# we want to train the model later

# from datasets import load_from_disk
# encoded_train_dataset_input = load_from_disk('/kaggle/input/cleaned-encoded-datasets/encoded_train_dataset_input.hf')

In [None]:
# encoded_train_dataset_target = load_from_disk('/kaggle/input/cleaned-encoded-datasets/encoded_train_dataset_target.hf')

In [None]:
# from datasets import concatenate_datasets
# encoded_input_target = concatenate_datasets([encoded_train_dataset_input, encoded_train_dataset_target], axis=1)

In [None]:
# encoded_dev_dataset_input = load_from_disk('/kaggle/input/cleaned-encoded-datasets/encoded_dev_dataset_input.hf')

In [None]:
# encoded_dev_dataset_target = load_from_disk('/kaggle/input/cleaned-encoded-datasets/encoded_dev_dataset_target.hf')