In [1]:
! pip install transformers datasets pandas evaluate



In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
data_path = "/content/drive/MyDrive/NLPproject/mcq_data_with_custom_ner_tags_cleaned.csv"
text_column_name = "prompt"
label_column_name = "best_model"

model_name = "distilbert-base-uncased"
test_size = 0.2
num_labels = 3

In [4]:
import pandas as pd
df = pd.read_csv(data_path)
df.head()

Unnamed: 0,prompt,biomistral,meditron,medalpaca,correct_answer,best_model,highest_probability,processed_prompt,tokens
0,Question: An outbreak of diphtheria has occu...,"{'A': 0.04511607810854912, 'B': 0.594287753105...","{'A': 0.16622786223888397, 'B': 0.194340050220...","{'A': 0.253202885389328, 'B': 0.32007843255996...",B,biomistral,0.594288,Question: An outbreak of \s_disease_disorder...,"['question', ':', 'an', 'outbreak', 'of', '\\s..."
1,Question: A 35-year-old woman who was recentl...,"{'A': 0.20254969596862793, 'B': 0.020370958372...","{'A': 0.173625186085701, 'B': 0.19369290769100...","{'A': 0.2113712579011917, 'B': 0.1725161820650...",C,biomistral,0.505247,Question: A \s_age 35-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '35', '-', '..."
2,Question: A 43-year-old woman was admitted to...,"{'A': 0.2921173572540283, 'B': 0.2701644003391...","{'A': 0.26627469062805176, 'B': 0.204159900546...","{'A': 0.18825779855251312, 'B': 0.325280308723...",C,meditron,0.279053,Question: A \s_age 43-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '43', '-', '..."
3,Question: A 50-year-old man presents with hea...,"{'A': 0.22638443112373352, 'B': 0.016919802874...","{'A': 0.1531776636838913, 'B': 0.1708820462226...","{'A': 0.21118728816509247, 'B': 0.198392108082...",A,biomistral,0.226384,Question: A \s_age 50-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '50', '-', '..."
4,Question: 2 hours after being admitted to the ...,"{'A': 0.11145520955324173, 'B': 0.427251458168...","{'A': 0.14759910106658936, 'B': 0.186582759022...","{'A': 0.3045741617679596, 'B': 0.2335251122713...",B,biomistral,0.427251,Question: \s_time 2 hours after \e_time being ...,"['question', ':', '\\s_time', '2', 'hours', 'a..."


In [5]:
# i want to preprocess the data, remove rows that their column "prompt" is more than 512 tokens according to the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
df["prompt_length"] = df[text_column_name].apply(lambda x: len(tokenizer(x)["input_ids"]))
df = df[df["prompt_length"] <= 512]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [6]:
df["prompt_length"].head()

Unnamed: 0,prompt_length
0,142
1,241
2,322
3,303
4,112


In [7]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
le.fit(df[label_column_name].tolist())
df['label'] = le.transform(df[label_column_name].tolist())
df.head()

Unnamed: 0,prompt,biomistral,meditron,medalpaca,correct_answer,best_model,highest_probability,processed_prompt,tokens,prompt_length,label
0,Question: An outbreak of diphtheria has occu...,"{'A': 0.04511607810854912, 'B': 0.594287753105...","{'A': 0.16622786223888397, 'B': 0.194340050220...","{'A': 0.253202885389328, 'B': 0.32007843255996...",B,biomistral,0.594288,Question: An outbreak of \s_disease_disorder...,"['question', ':', 'an', 'outbreak', 'of', '\\s...",142,0
1,Question: A 35-year-old woman who was recentl...,"{'A': 0.20254969596862793, 'B': 0.020370958372...","{'A': 0.173625186085701, 'B': 0.19369290769100...","{'A': 0.2113712579011917, 'B': 0.1725161820650...",C,biomistral,0.505247,Question: A \s_age 35-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '35', '-', '...",241,0
2,Question: A 43-year-old woman was admitted to...,"{'A': 0.2921173572540283, 'B': 0.2701644003391...","{'A': 0.26627469062805176, 'B': 0.204159900546...","{'A': 0.18825779855251312, 'B': 0.325280308723...",C,meditron,0.279053,Question: A \s_age 43-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '43', '-', '...",322,2
3,Question: A 50-year-old man presents with hea...,"{'A': 0.22638443112373352, 'B': 0.016919802874...","{'A': 0.1531776636838913, 'B': 0.1708820462226...","{'A': 0.21118728816509247, 'B': 0.198392108082...",A,biomistral,0.226384,Question: A \s_age 50-year-old \e_age \s_sex ...,"['question', ':', 'a', '\\s_age', '50', '-', '...",303,0
4,Question: 2 hours after being admitted to the ...,"{'A': 0.11145520955324173, 'B': 0.427251458168...","{'A': 0.14759910106658936, 'B': 0.186582759022...","{'A': 0.3045741617679596, 'B': 0.2335251122713...",B,biomistral,0.427251,Question: \s_time 2 hours after \e_time being ...,"['question', ':', '\\s_time', '2', 'hours', 'a...",112,0


In [8]:
from sklearn.model_selection import train_test_split
df_train,df_test = train_test_split(df,test_size=test_size)

In [9]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(df_train)
test_dataset = Dataset.from_pandas(df_test)

In [10]:
# from transformers import AutoTokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_name)

def preprocess_function(examples):
    return tokenizer(examples["prompt"], truncation=True)

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_test = test_dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/8027 [00:00<?, ? examples/s]

Map:   0%|          | 0/2007 [00:00<?, ? examples/s]

In [11]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
from transformers import DataCollatorWithPadding
from transformers import TrainingArguments, Trainer
import evaluate
import numpy as np
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-4,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy = "epoch",
    logging_strategy="epoch"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics

)
import torch

# Determine the available device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the appropriate device
model = model.to(device)





Using device: cuda


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0609,1.051999,0.467364
2,1.0583,1.045507,0.467364
3,1.0536,1.043473,0.467364
4,1.0547,1.044178,0.467364
5,1.0532,1.044984,0.467364


TrainOutput(global_step=1255, training_loss=1.0561356867452067, metrics={'train_runtime': 1388.0717, 'train_samples_per_second': 28.914, 'train_steps_per_second': 0.904, 'total_flos': 3638555722866276.0, 'train_loss': 1.0561356867452067, 'epoch': 5.0})

In [None]:
trainer.save_model('spam_model')

In [14]:
from sklearn.metrics import classification_report
preds = trainer.predict(tokenized_train)
preds = np.argmax(preds[:3][0],axis=1)
GT = df_train['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.47      1.00      0.64      3750
           1       0.00      0.00      0.00      2516
           2       0.00      0.00      0.00      1761

    accuracy                           0.47      8027
   macro avg       0.16      0.33      0.21      8027
weighted avg       0.22      0.47      0.30      8027



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
preds = trainer.predict(tokenized_test)
preds = np.argmax(preds[:3][0],axis=1) #preds[:3][1]
GT = df_test['label'].tolist()
print(classification_report(GT,preds))

              precision    recall  f1-score   support

           0       0.47      1.00      0.64       938
           1       0.00      0.00      0.00       669
           2       0.00      0.00      0.00       400

    accuracy                           0.47      2007
   macro avg       0.16      0.33      0.21      2007
weighted avg       0.22      0.47      0.30      2007



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
import torch

# Determine the available device
device = torch.device("mps" if torch.backends.mps.is_available() else "cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Move the model to the appropriate device
model = model.to(device)

def predict_class(prompt):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True)

    # Move inputs to the same device as the model
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Set the model to evaluation mode
    model.eval()

    # Get the prediction
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    # Convert the predicted class back to its original label
    original_label = le.inverse_transform([predicted_class])[0]

    return original_label

# Example usage
prompt = "Omri is 26 years old lives in israel likes animals, food and science. what is suffering from? A: happiness B: smartness C: goodness"
try:
    predicted_class = predict_class(prompt)
    print(f"The predicted class for '{prompt}' is: {predicted_class}")
except RuntimeError as e:
    print(f"An error occurred: {e}")
    print("If you're using an Apple Silicon Mac, make sure you have the latest version of PyTorch installed with MPS support.")
    print("You can install it using: pip install --upgrade torch")

Using device: mps
The predicted class for 'Omri is 26 years old lives in israel likes animals, food and science. what is suffering from? A: happiness B: smartness C: goodness' is: biomistral
