In [None]:
import pandas as pd
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import ast
import torch

In [None]:
!pip install datasets

In [None]:
from datasets import load_dataset

# Load the JFLEG dataset and show the first two validation examples, one per line.
dataset = load_dataset("jfleg")
print(dataset["validation"][0], dataset["validation"][1], sep="\n")

In [None]:
# Write the validation split to CSV, then the train and test splits when the
# dataset provides them.
dataset["validation"].to_csv("text_validation.csv")
for split_name in ("train", "test"):
    if split_name in dataset:
        dataset[split_name].to_csv(f"text_{split_name}.csv")

In [None]:
# Read the exported validation split back into a DataFrame and print its first rows.
df = pd.read_csv("text_validation.csv")
print(df.head())

In [None]:
df

# Corrections column preprocessing

In [None]:
print(type(df['corrections'])) # checking the type of corrections column

In [None]:
print(type(df["corrections"][0])) #checking the type of elements in the corrections column

In [None]:
df['corrections'][0]

In [None]:
def _clean_corrections(raw):
    """Parse a stringified list of corrections and tidy every sentence in it.

    String cells are parsed with ``ast.literal_eval``; each resulting sentence has
    newline characters removed and surrounding single quotes / spaces stripped.
    Cells that are not strings are returned unchanged.
    """
    if not isinstance(raw, str):
        return raw
    cleaned = []
    for sentence in ast.literal_eval(raw):
        cleaned.append(sentence.replace("\n", "").strip("' ").strip())
    return cleaned


df["corrections"] = df["corrections"].apply(_clean_corrections)

In [None]:
df['corrections'][0]

In [None]:
print(type(df['corrections'][0])) # checking type after removing unwanted objects

In [None]:
def split_corrections(text):
    """Break a block of corrections into individual sentences.

    A one-element list is unwrapped to its single item first. If the value is then
    a string, it is split on "." and the stripped, non-empty pieces are returned as
    a list. Anything else (longer lists, non-string values) is returned as is.
    """
    candidate = text[0] if isinstance(text, list) and len(text) == 1 else text
    if not isinstance(candidate, str):
        return candidate
    pieces = (piece.strip() for piece in candidate.split("."))
    return [piece for piece in pieces if piece]

# Run split_corrections over every cell of the corrections column.
df["corrections"] = df["corrections"].map(split_corrections)

In [None]:
df['corrections']

In [None]:
# Parse any cell that is still a string with ast.literal_eval; cells of any other
# type are left unchanged.
# NOTE(review): the cells look like they are already lists at this point, so this
# may be a no-op — confirm.
df["corrections"] = df["corrections"].map(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)

In [None]:
# Build one record per (sentence, correction) pair.
# Rows whose corrections cell is not a list/tuple (for example NaN) are skipped:
# iterating a float would raise, and iterating a string would yield characters.
expanded_data = []
for sentence, corrections_list in zip(df["sentence"], df["corrections"]):
    if not isinstance(corrections_list, (list, tuple)):
        continue
    for corrections in corrections_list:
        expanded_data.append({"sentence": sentence, "corrections": corrections})

In [None]:
# Print the expanded records at positions 4 through 7.
for position in range(4, 8):
    print(expanded_data[position])

In [None]:
print(type(expanded_data))

In [None]:
print(type(expanded_data[0]))

In [None]:
# Turn the list of {"sentence", "corrections"} dictionaries into a DataFrame
# with one row per pair, then print it.
df_expanded = pd.DataFrame(expanded_data) #make a dataframe from the list
print(df_expanded)

In [None]:
df_expanded

In [None]:
df_expanded['sentence'][0]

In [None]:
df_expanded['corrections'][0]

In [None]:
# The corrections cells hold lists of sentences, and the `.str` accessor returns
# NaN for non-string elements, which would wipe the column. Lower-case every
# sentence inside each list instead; non-list cells are left unchanged.
df['corrections'] = df['corrections'].apply(
    lambda sentences: [sentence.lower() for sentence in sentences]
    if isinstance(sentences, list)
    else sentences
)

## Tokenizing the corrections column with the Hugging Face **T5** tokenizer (the text is lower-cased first)

In [None]:
from transformers import T5Tokenizer

# Lower-case the corrections, run them through the T5 tokenizer, then glue the
# pieces back into a plain string ("▁" marks a word start and becomes a space).
tokenizer = T5Tokenizer.from_pretrained("t5-base")
df_expanded['corrections'] = df_expanded['corrections'].astype(str).str.lower()
df_expanded['tokenized_corrections'] = (
    df_expanded['corrections']
    .map(tokenizer.tokenize)
    .map(lambda pieces: "".join(pieces).replace("▁", " ").strip())
)
print(df_expanded.head())

In [None]:
df_expanded['corrections'][0]

In [None]:
df_expanded['tokenized_corrections'][0]

In [None]:
# Split each string on whitespace into a list of words; non-string cells are kept as they are.
df_expanded['tokenized_corrections'] = df_expanded['tokenized_corrections'].map(
    lambda joined: joined.split() if isinstance(joined, str) else joined
)

In [None]:
df_expanded['tokenized_corrections'][0]

## Removing the stopwords from corrections column

In [None]:
# Download NLTK's English stop-word list and drop those words from every
# list of correction tokens.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
df_expanded['filtered_corrected_tokens'] = df_expanded['tokenized_corrections'].map(
    lambda tokens: list(filter(lambda word: word not in stop_words, tokens))
)
df_expanded.head()

## Stemming the correction column

In [None]:
# Reduce every remaining correction token to its Porter stem.
stemmer = PorterStemmer()
df_expanded["stemmed_corrections"] = df_expanded["filtered_corrected_tokens"].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
print(df_expanded.head())

# Sentence column preprocessing

In [None]:
print(type(df['sentence'][0]))

In [None]:
print(type(df['sentence']))

In [None]:
df['sentence'][0]

## Tokenizing the sentence column with the Hugging Face **T5** tokenizer (the text is lower-cased first)

In [None]:
from transformers import T5Tokenizer

# Lower-case the sentences, run them through the T5 tokenizer, then glue the
# pieces back into a plain string ("▁" marks a word start and becomes a space).
tokenizer = T5Tokenizer.from_pretrained("t5-base")
df_expanded['sentence'] = df_expanded['sentence'].astype(str).str.lower()
df_expanded['tokenized_sentence'] = (
    df_expanded['sentence']
    .map(tokenizer.tokenize)
    .map(lambda pieces: "".join(pieces).replace("▁", " ").strip())
)
print(df_expanded.head())

In [None]:
df_expanded['tokenized_sentence'][4]

In [None]:
df_expanded['sentence'][4]

In [None]:
print(df_expanded['sentence'][0])

In [None]:
print(type(df_expanded["tokenized_sentence"][0]))

In [None]:
# Split each string on whitespace into a list of words; non-string cells are kept as they are.
df_expanded['tokenized_sentence'] = df_expanded['tokenized_sentence'].map(
    lambda joined: joined.split() if isinstance(joined, str) else joined
)

In [None]:
df_expanded['tokenized_sentence'][0]

## Removing the stopwords from the sentence column

In [None]:
# Download NLTK's English stop-word list and drop those words from every
# list of sentence tokens.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))
df_expanded['filtered_sentence_tokens'] = df_expanded['tokenized_sentence'].map(
    lambda tokens: list(filter(lambda word: word not in stop_words, tokens))
)
df_expanded.head()

In [None]:
df_expanded['filtered_corrected_tokens'][0]

In [None]:
df_expanded['filtered_sentence_tokens'][0]

In [None]:
df_expanded["sentence"][4]

In [None]:
df_expanded['tokenized_sentence'][4]

## Stemmed the sentence column

In [None]:
# Reduce every remaining sentence token to its Porter stem.
stemmer = PorterStemmer()
df_expanded["stemmed_sentence"] = df_expanded["filtered_sentence_tokens"].map(
    lambda tokens: list(map(stemmer.stem, tokens))
)
print(df_expanded.head())

In [None]:
df_expanded['stemmed_sentence'][4]

In [None]:
# Discard rows that have a missing value in any of the raw or tokenized text columns.
required_columns = ["sentence", "tokenized_sentence", "corrections", "tokenized_corrections"]
df_expanded = df_expanded.dropna(subset=required_columns)

# Final Dataset

In [None]:
# Persist the preprocessed frame (without the index column) and show its first rows.
df_expanded.to_csv("final_df.csv", index=False)
df_expanded.head() # final DataFrame after preprocessing

In [None]:
df_expanded.columns

In [None]:
def _parse_token_list(cell):
    """Turn the CSV text of a token list (e.g. "['a', 'b']") back into a list."""
    return ast.literal_eval(cell) if cell else []


# The saved column is named "corrections"; asking for "correction" makes read_csv
# raise because usecols must match existing columns. The token lists were written
# to CSV as text, so they are parsed back into Python lists while loading.
final_df = pd.read_csv(
    "final_df.csv",
    usecols=["sentence", "tokenized_sentence", "corrections", "tokenized_corrections"],
    converters={
        "tokenized_sentence": _parse_token_list,
        "tokenized_corrections": _parse_token_list,
    },
)

## Further preprocessed the texts for training

In [None]:
# Convert tokenized words to token IDs (already done in preprocessing)
sentence_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in final_df["tokenized_sentence"]]
corrections_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in final_df["tokenized_corrections"]]

In [None]:
# Manually pad sequences with 0
max_len = max(max(len(seq) for seq in sentence_ids), max(len(seq) for seq in corrections_ids))
sentence_ids = [seq + [0] * (max_len - len(seq)) for seq in sentence_ids]
corrections_ids = [seq + [0] * (max_len - len(seq)) for seq in corrections_ids]
print("Example sentence sequence:", sentence_ids[0])
print("Example corrections sequence:", corrections_ids[0])

In [None]:
# converting into PyTorch Tensors
sentence_ids = torch.tensor(sentence_ids, dtype=torch.long)
corrections_ids = torch.tensor(corrections_ids, dtype=torch.long)
print("Sentence IDs shape:", sentence_ids.shape)
print("Corrections IDs shape:", corrections_ids.shape)

## Demo code (Needs more understanding)

In [None]:
from torch.utils.data import Dataset, DataLoader

class SpellCorrectionDataset(Dataset):
    """Dataset that pairs each sentence id sequence with its correction id sequence."""

    def __init__(self, sentence_ids, corrections_ids):
        """Store the two containers; both are later indexed with the same position."""
        self.sentence_ids, self.corrections_ids = sentence_ids, corrections_ids

    def __len__(self):
        """Return the number of examples, taken from the sentence container."""
        return len(self.sentence_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with "sentence_ids" and "corrections_ids"."""
        source = self.sentence_ids[idx]
        target = self.corrections_ids[idx]
        return dict(sentence_ids=source, corrections_ids=target)

# Wrap the id tensors in the dataset class.
# NOTE: this rebinds `dataset`, the name that earlier held the object returned by
# load_dataset("jfleg").
dataset = SpellCorrectionDataset(
    sentence_ids=sentence_ids,
    corrections_ids=corrections_ids,
)

# Shuffled mini-batches; lower batch_size if memory is tight.
batch_size = 16
train_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Load pre-trained model
# "t5-base" is the same checkpoint name the tokenizer is loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")


In [None]:
import torch

# Define loss function
# NOTE: the training loop reads the loss from the model output (outputs.loss), so
# this criterion is not used there; it is kept so existing references still work.
criterion = torch.nn.CrossEntropyLoss(ignore_index=0)  # Ignore padding tokens

# Define optimizer
# transformers.AdamW is deprecated in favour of torch.optim.AdamW. eps and
# weight_decay are set explicitly to the defaults of the transformers version so
# the optimisation behaviour stays the same.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)


In [None]:
# Fine-tune the model on the (sentence -> correction) pairs.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

epochs = 3  # Adjust as needed
pad_token_id = 0  # the id used when the sequences were manually padded with 0

for epoch in range(epochs):
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Batch-local names are used so the full `sentence_ids` / `corrections_ids`
        # tensors are not overwritten by the last batch.
        input_ids = batch["sentence_ids"].to(device)
        labels = batch["corrections_ids"].to(device)

        # Keep the encoder from attending to padding positions.
        attention_mask = (input_ids != pad_token_id).long()
        # The model's built-in loss skips label positions equal to -100, not 0,
        # so padded label positions are re-marked before the forward pass.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")


In [None]:
# Generate corrections for a single batch as a quick sanity check.
model.eval()
pad_token_id = 0  # the id used when the sequences were manually padded with 0

with torch.no_grad():
    for batch in train_loader:
        # A batch-local name keeps the full `sentence_ids` tensor intact.
        input_ids = batch["sentence_ids"].to(device)
        # Mask out padding so it does not influence generation.
        attention_mask = (input_ids != pad_token_id).long()

        # Allow as many new tokens as the padded input length; the default
        # generation length would cut longer corrections short.
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=input_ids.shape[1],
        )
        predictions = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        print("Predictions:", predictions)
        break  # Show one batch
