In [1]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import datetime
from sklearn.metrics import mean_squared_error

# Load the dataset
df = pd.read_csv("C:/Users/siddh/Desktop/BERT/New folder/CELA.csv")

# Extract essay texts and labels
essays = df['Essays'].tolist()
labels = df[['Grammar', 'Lexical', 'Global Organization', 'Local Organization', 'Supporting Ideas', 'Holistic']].values

# Tokenization function
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    return tokens['input_ids'], tokens['attention_mask']

# Prepare data function
def prepare_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        tokens = tokenize_text(text)
        input_ids.append(tokens[0])
        attention_masks.append(tokens[1])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float32)  # Ensure labels are of the correct type

    return input_ids, attention_masks, labels

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(essays, labels, test_size=0.2, random_state=42)

# Define the BERT model for regression on each target
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=6)

optimizer = AdamW(model.parameters(), lr=1e-5)

# Prepare the training data
input_ids, attention_masks, labels = prepare_data(essays, labels)
dataset = TensorDataset(input_ids, attention_masks, labels)
train_dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Prepare the validation data
val_input_ids, val_attention_masks, val_labels = prepare_data(val_texts, val_labels)
val_dataset = TensorDataset(val_input_ids, val_attention_masks, val_labels)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

def evaluate_model(model, dataloader):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch in dataloader:
            inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
            outputs = model(**inputs)
            logits = outputs.logits
            all_preds.extend(logits.cpu().numpy())
            all_labels.extend(batch[2].cpu().numpy())
        mse = mean_squared_error(all_labels, all_preds)
        print(f'Mean Squared Error: {mse}')
    return mse

# Train the model
num_epochs = 10
best_val_loss = float('inf')  # Initialize with a large value for comparison

for epoch in range(num_epochs):
    model.train()
    for batch in train_dataloader:
        optimizer.zero_grad()
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

    # Evaluate on the validation set
    val_loss = evaluate_model(model, val_dataloader)

    # Print or log the learning rate if desired
    print(f'Epoch {epoch + 1}, Mean Squared Error: {val_loss}')

    # Save the model if validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        model_save_path = f"essay_scoring_model_regression_{timestamp}"
        model.save_pretrained(model_save_path)


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Mean Squared Error: 20.953706741333008
Epoch 1, Mean Squared Error: 20.953706741333008
Mean Squared Error: 15.653674125671387
Epoch 2, Mean Squared Error: 15.653674125671387
Mean Squared Error: 11.970582008361816
Epoch 3, Mean Squared Error: 11.970582008361816
Mean Squared Error: 9.387707710266113
Epoch 4, Mean Squared Error: 9.387707710266113
Mean Squared Error: 7.2328033447265625
Epoch 5, Mean Squared Error: 7.2328033447265625
Mean Squared Error: 5.597498416900635
Epoch 6, Mean Squared Error: 5.597498416900635
Mean Squared Error: 4.148698329925537
Epoch 7, Mean Squared Error: 4.148698329925537
Mean Squared Error: 3.0221729278564453
Epoch 8, Mean Squared Error: 3.0221729278564453
Mean Squared Error: 2.181945562362671
Epoch 9, Mean Squared Error: 2.181945562362671
Mean Squared Error: 1.5818978548049927
Epoch 10, Mean Squared Error: 1.5818978548049927


In [4]:
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
import datetime
from sklearn.metrics import mean_squared_error

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_text(text):
    tokens = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=512,
        truncation=True,
        return_token_type_ids=False,
        padding='max_length',
        return_attention_mask=True,
        return_tensors='pt'
    )
    return tokens['input_ids'], tokens['attention_mask']


def prepare_data(texts, labels):
    input_ids = []
    attention_masks = []

    for text in texts:
        tokens = tokenize_text(text)
        input_ids.append(tokens[0])
        attention_masks.append(tokens[1])

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels, dtype=torch.float32)  # Ensure labels are of the correct type

    return input_ids, attention_masks, labels

# Load the saved model
model_path = "C:\\Users\\siddh\\Desktop\\BERT\\New folder\\essay_scoring_model_regression_20240228_123826"  # Replace <timestamp> with the actual timestamp
model = BertForSequenceClassification.from_pretrained(model_path)

# Tokenize the input essays
input_essays = ["With the development of our city, human right is talked by everyone of us more and more often, and for the city, two kids allowed to be born, is another stone throw into the lake, and that means the number of people will get a new chance to up to a amazing point, by the way, can we ask us a simple question: what are we going to sacrifice again to tell what's right or what's wrong with us. we only see high speed grow up of our city, but we never see the cut down tree, the populated watered, we can not put up a banner on a lonely tree surrounded by stumps, the go up number of people will lead to many problem, which many person think is the outstanding of human right. It's not what we say but what we that matters, we can't bring the whole world into mad then thinking what we could do to save it."]

# Prepare input tensors
input_ids, attention_masks, _ = prepare_data(input_essays, [])

# Make predictions
model.eval()
with torch.no_grad():
    inputs = {'input_ids': input_ids, 'attention_mask': attention_masks}
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = logits.cpu().numpy()

# Post-process the predictions if necessary
# For example, if you want to get the final predicted scores for each aspect:
predicted_scores = predictions.squeeze()  # Squeeze removes the extra dimension added by batch processing

# Now predicted_scores contains the predicted scores for each aspect
print(predicted_scores)


[3.495993  4.8120184 4.601542  4.7985306 4.8196936 3.4043941]
