In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import AdamW
import torch.nn as nn
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import re
import string

In [2]:
drRatings = pd.read_excel('/kaggle/input/hw1-input/OBGYN_new_train_80000.xlsx') 

In [3]:
# Shuffle the data so that they are in random sequence
drRatings = drRatings.sample(frac=1)
drRatings['highKnowledge'] = (drRatings['knowledge']>4).astype(int)

temp = pd.get_dummies(drRatings['state'],dtype=int)
drRatings = pd.concat([drRatings,temp],axis=1)
del temp

In [4]:
testingdata = pd.read_excel('/kaggle/input/hw1-data/OBGYN_new_test_withoutAnswer_20000_2024.xlsx')

In [5]:
temp = pd.get_dummies(testingdata['state'],dtype=int)
testingdata = pd.concat([testingdata,temp],axis=1)
del temp
states=drRatings['state'].unique().tolist()
for state in states:
  if state not in testingdata.columns.tolist():
    testingdata[state]=[0]*testingdata.shape[0]

In [6]:
def clean_text(text):
    # Modify text cleaning steps as needed
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    return text

def text_preprocessing(text):
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    nopunc = clean_text(text)
    tokenized_text = tokenizer.tokenize(nopunc)
    combined_text = ' '.join(tokenized_text)
    return combined_text

In [7]:
# Apply text preprocessing to both train and validation data
drRatings['text'] = drRatings['review'].apply(str).apply(lambda x: text_preprocessing(x))
testingdata['text'] = testingdata['review'].apply(str).apply(lambda x: text_preprocessing(x))

In [8]:
# Split the data into training, validation, and testing sets
training_data, temp_data = train_test_split(drRatings, test_size=0.15, random_state=42)
validation_data, test_data = train_test_split(temp_data, test_size=0.4, random_state=42)

In [9]:
device = torch.device("cuda")

In [10]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [11]:
def tokenize_data(data):
    input_ids = []
    attention_masks = []

    for review in data['review']:
        encoded_dict = tokenizer.encode_plus(
                            review,
                            add_special_tokens=True,
                            max_length=256,
                            pad_to_max_length=True,
                            return_attention_mask=True,
                            return_tensors='pt'
                       )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    return input_ids, attention_masks

In [12]:
# Padding and conversion to tensors
training_inputs, training_masks = tokenize_data(training_data)
validation_inputs, validation_masks = tokenize_data(validation_data)
test_inputs, test_masks = tokenize_data(test_data)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [13]:
# Convert labels to tensors
training_labels = torch.tensor(training_data['highKnowledge'].values)
validation_labels = torch.tensor(validation_data['highKnowledge'].values)
test_labels = torch.tensor(test_data['highKnowledge'].values)

In [14]:
# Convert lists to tensors and move them to GPU
training_inputs = torch.cat(training_inputs, dim=0).to(device)
training_masks = torch.cat(training_masks, dim=0).to(device)

validation_inputs = torch.cat(validation_inputs, dim=0).to(device)
validation_masks = torch.cat(validation_masks, dim=0).to(device)

test_inputs = torch.cat(test_inputs, dim=0).to(device)
test_masks = torch.cat(test_masks, dim=0).to(device)

In [15]:
# Create DataLoader for efficient batching
batch_size = 32
training_dataset = TensorDataset(training_inputs, training_masks, training_labels)
training_sampler = RandomSampler(training_dataset)
training_dataloader = DataLoader(training_dataset, sampler=training_sampler, batch_size=batch_size)

validation_dataset = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_dataset)
validation_dataloader = DataLoader(validation_dataset, sampler=validation_sampler, batch_size=batch_size)

test_dataset = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=batch_size)

In [16]:
# Fine-tune BERT model
optimizer = AdamW(model.parameters(), lr=2e-5)

In [17]:
# Training loop
model.train()
for epoch in range(3):
    for batch in training_dataloader:
        optimizer.zero_grad()
        input_ids, attention_mask, labels = batch
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss = loss.mean()
        loss.backward()
        optimizer.step()

In [18]:
# Evaluation
def accuracy(preds, labels):
    predicted_labels = preds.argmax(axis=1)
    correct = (predicted_labels == labels).sum().item()
    total = len(labels) 
    return correct / total


# Evaluation for validation set
true_labels_val = []
predicted_labels_val = []

model.eval()
eval_loss, eval_accuracy_val = 0, 0
nb_eval_steps_val, nb_eval_examples = 0, 0

for batch in validation_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    
    tmp_eval_accuracy = accuracy(logits, label_ids)
    eval_accuracy_val += tmp_eval_accuracy
    nb_eval_steps_val += 1
    
    # Collect true labels and predicted labels for validation set
    true_labels_val.extend(label_ids)
    predicted_labels_val.extend(np.argmax(logits, axis=1))

print("Validation Accuracy: {}".format(eval_accuracy_val / nb_eval_steps_val))

Validation Accuracy: 0.9438888888888889


In [19]:
# Test evaluation

# Evaluation
true_labels = []
predicted_labels = []

model.eval()
eval_loss, eval_accuracy = 0, 0
nb_eval_steps, nb_eval_examples = 0, 0

for batch in test_dataloader:
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.to(device)
    attention_mask = attention_mask.to(device)
    labels = labels.to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    logits = logits.detach().cpu().numpy()
    label_ids = labels.to('cpu').numpy()
    
    tmp_eval_accuracy = accuracy(logits, label_ids)
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1
    
    # Collect true labels and predicted labels
    true_labels.extend(label_ids)
    predicted_labels.extend(np.argmax(logits, axis=1))

print("Test Accuracy: {}".format(eval_accuracy / nb_eval_steps))

Test Accuracy: 0.9402083333333333


In [20]:
# Tokenize and encode new reviews
new_inputs, new_masks = tokenize_data(testingdata)

In [21]:
# Convert each element in the list to a PyTorch tensor
new_inputs = [torch.tensor(input_data).clone().detach().to(device) for input_data in new_inputs]
new_masks = [torch.tensor(mask_data).clone().detach().to(device) for mask_data in new_masks]

  new_inputs = [torch.tensor(input_data).clone().detach().to(device) for input_data in new_inputs]
  new_masks = [torch.tensor(mask_data).clone().detach().to(device) for mask_data in new_masks]


In [22]:
# Concatenate the tensors in new_inputs and new_masks along the first dimension
new_inputs = torch.cat(new_inputs, dim=0)
new_masks = torch.cat(new_masks, dim=0)

In [23]:
# Create DataLoader
new_data = TensorDataset(new_inputs, new_masks)
new_sampler = SequentialSampler(new_data)
new_dataloader = DataLoader(new_data, sampler=new_sampler, batch_size=batch_size)

In [24]:
# Predict
predictions = []
model.eval()
for batch in new_dataloader:
    input_ids, attention_mask = batch
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    
    logits = outputs.logits
    predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())

In [25]:
#Convert predictions to DataFrame
review_ids = testingdata['reviewID']
prediction_df = pd.DataFrame({'reviewID': review_ids, 'prediction': predictions})

In [26]:
# Save DataFrame to a CSV file
prediction_df.to_csv('32648157_kickstart.csv', index=False, encoding='utf-8')