In [32]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
from torch.utils.data import DataLoader, TensorDataset, random_split
from tqdm import tqdm
import pandas as pd

In [33]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [34]:
# Load data from CSV file
data = pd.read_csv('/content/Grammar Correction.csv')
# data = data.head(500)
# Extract erroneous and corrected sentences
erroneous_sentences = data['Ungrammatical Statement'].tolist()
corrected_sentences = data['Standard English'].tolist()

In [35]:
# Function to pad sequences to the same length
def pad_sequences(sequences, max_length, pad_value=0):
    padded_sequences = []
    for seq in sequences:
        if len(seq) < max_length:
            padded_seq = seq + [pad_value] * (max_length - len(seq))
        else:
            padded_seq = seq[:max_length]
        padded_sequences.append(padded_seq)
    return padded_sequences

In [36]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [37]:
# Tokenize input sentences and prepare input tensors
tokenized_inputs = tokenizer(erroneous_sentences, return_tensors='pt', padding=True, truncation=True)
labels = tokenizer(corrected_sentences, return_tensors='pt', padding=True, truncation=True)

# Convert token IDs to tensors
input_ids = tokenized_inputs['input_ids']
attention_mask = tokenized_inputs['attention_mask']
labels_ids = labels['input_ids']

# Pad sequences to the same length
max_length = max(len(seq) for seq in input_ids)
input_ids = pad_sequences(input_ids.tolist(), max_length)
attention_mask = pad_sequences(attention_mask.tolist(), max_length)
labels_ids = pad_sequences(labels_ids.tolist(), max_length)

# Convert padded sequences to tensors
input_ids = torch.tensor(input_ids).to(device)
attention_mask = torch.tensor(attention_mask).to(device)
labels_ids = torch.tensor(labels_ids).to(device)

In [38]:
# Prepare DataLoader
dataset = TensorDataset(input_ids, attention_mask, labels_ids)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True, drop_last=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

In [39]:
# Fine-tune BERT model
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
loss_fn = torch.nn.CrossEntropyLoss()
model.to(device)
for epoch in range(30):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)  # Move tensors to GPU
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}, Loss: {total_loss:.4f}')


Epoch 1: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]


Epoch 1, Loss: 242.8673


Epoch 2: 100%|██████████| 100/100 [00:13<00:00,  7.39it/s]


Epoch 2, Loss: 77.5569


Epoch 3: 100%|██████████| 100/100 [00:13<00:00,  7.39it/s]


Epoch 3, Loss: 57.1861


Epoch 4: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 4, Loss: 43.2066


Epoch 5: 100%|██████████| 100/100 [00:13<00:00,  7.60it/s]


Epoch 5, Loss: 32.0104


Epoch 6: 100%|██████████| 100/100 [00:13<00:00,  7.35it/s]


Epoch 6, Loss: 23.9734


Epoch 7: 100%|██████████| 100/100 [00:13<00:00,  7.55it/s]


Epoch 7, Loss: 18.1475


Epoch 8: 100%|██████████| 100/100 [00:13<00:00,  7.49it/s]


Epoch 8, Loss: 13.4467


Epoch 9: 100%|██████████| 100/100 [00:13<00:00,  7.47it/s]


Epoch 9, Loss: 9.8241


Epoch 10: 100%|██████████| 100/100 [00:13<00:00,  7.50it/s]


Epoch 10, Loss: 7.2337


Epoch 11: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 11, Loss: 5.4784


Epoch 12: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]


Epoch 12, Loss: 4.2750


Epoch 13: 100%|██████████| 100/100 [00:13<00:00,  7.53it/s]


Epoch 13, Loss: 3.2341


Epoch 14: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]


Epoch 14, Loss: 2.8205


Epoch 15: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 15, Loss: 2.2626


Epoch 16: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


Epoch 16, Loss: 2.3443


Epoch 17: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


Epoch 17, Loss: 1.9071


Epoch 18: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


Epoch 18, Loss: 1.7390


Epoch 19: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


Epoch 19, Loss: 1.5758


Epoch 20: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 20, Loss: 1.6657


Epoch 21: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 21, Loss: 1.4523


Epoch 22: 100%|██████████| 100/100 [00:13<00:00,  7.50it/s]


Epoch 22, Loss: 1.4952


Epoch 23: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 23, Loss: 1.3787


Epoch 24: 100%|██████████| 100/100 [00:13<00:00,  7.47it/s]


Epoch 24, Loss: 1.6149


Epoch 25: 100%|██████████| 100/100 [00:13<00:00,  7.40it/s]


Epoch 25, Loss: 1.5258


Epoch 26: 100%|██████████| 100/100 [00:13<00:00,  7.54it/s]


Epoch 26, Loss: 1.3184


Epoch 27: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 27, Loss: 1.0273


Epoch 28: 100%|██████████| 100/100 [00:13<00:00,  7.51it/s]


Epoch 28, Loss: 0.9966


Epoch 29: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]


Epoch 29, Loss: 0.8788


Epoch 30: 100%|██████████| 100/100 [00:13<00:00,  7.52it/s]

Epoch 30, Loss: 0.9298





In [62]:
model.eval()
total_correct = 0
total_count = 0

with torch.no_grad():
    for batch in tqdm(val_dataloader, desc='Validation'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)

        total_correct += (predictions == labels).sum().item()
        total_count += labels.numel()

accuracy = total_correct / total_count
print(f'Validation Accuracy: {accuracy:.4f}')


Validation: 100%|██████████| 26/26 [00:01<00:00, 20.60it/s]

Validation Accuracy: 0.8785





In [31]:
from google.colab import drive

# Specify the directory where you want to save the model in Google Drive
output_dir = "/content/drive/MyDrive/fine_tuned_bert_model"

# Create the directory if it doesn't exist
import os
if not os.path.exists(output_dir):
    os.makedirs(output_dir)
0
# Save the model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully at:", output_dir)


Model saved successfully at: /content/drive/MyDrive/fine_tuned_bert_model


In [61]:
# Move the test input tensor to the same device as the model
test_sentence = "You am not subscribed"
tokenized_test_sentence = tokenizer(test_sentence, return_tensors='pt', padding=True, truncation=True)
tokenized_test_sentence = {key: value.to(device) for key, value in tokenized_test_sentence.items()}

# Perform inference
outputs = model(**tokenized_test_sentence)
predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
predicted_sentence = tokenizer.decode(predicted_ids, skip_special_tokens=True)

# Capitalize the first letter of the predicted sentence
predicted_sentence = predicted_sentence.capitalize()

print("Corrected Sentence:", predicted_sentence)


Corrected Sentence: You are not subscribed


In [None]:
import streamlit as st
import torch
from transformers import BertTokenizer, BertForMaskedLM

# Load the fine-tuned BERT model and tokenizer
model_path = "./fine_tuned_bert_model"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path)

# Function to correct grammar using the BERT model
def correct_grammar(sentence):
    tokenized_sentence = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    outputs = model(**tokenized_sentence)
    predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
    corrected_sentence = tokenizer.decode(predicted_ids, skip_special_tokens=True)
    return corrected_sentence.capitalize()

# Streamlit app
def main():
    st.title("Grammar Correction App")

    # Input text box for user input
    input_text = st.text_area("Enter a sentence with grammatical errors:", "")

    # Button to trigger grammar correction
    if st.button("Correct Grammar"):
        if input_text.strip() == "":
            st.warning("Please enter a sentence.")
        else:
            corrected_text = correct_grammar(input_text)
            st.success("Corrected Sentence:")
            st.write(corrected_text)

if __name__ == "__main__":
    main()


In [None]:
import torch
from transformers import BertTokenizer, BertForMaskedLM
import streamlit as st

# Load the fine-tuned model and tokenizer from Google Drive
output_dir = "/content/drive/MyDrive/fine_tuned_bert_model"
model = BertForMaskedLM.from_pretrained(output_dir)
tokenizer = BertTokenizer.from_pretrained(output_dir)

# Move model to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define function for inference
def correct_sentence(input_sentence):
    # Tokenize input sentence
    tokenized_input = tokenizer(input_sentence, return_tensors='pt', padding=True, truncation=True)
    tokenized_input = {key: value.to(device) for key, value in tokenized_input.items()}

    # Perform inference
    outputs = model(**tokenized_input)
    predicted_ids = torch.argmax(outputs.logits[0], dim=-1)
    predicted_sentence = tokenizer.decode(predicted_ids, skip_special_tokens=True)
    predicted_sentence = predicted_sentence.capitalize()

    return predicted_sentence

# Streamlit app
st.title("Grammar Correction with BERT")

# Input text area for user to enter a sentence
user_input = st.text_area("Enter a sentence to correct")

# Button to perform correction
if st.button("Correct"):
    if user_input.strip():
        corrected_sentence = correct_sentence(user_input)
        st.write("Corrected Sentence:", corrected_sentence)
    else:
        st.write("Please enter a sentence for correction")
