<div style="font-size: 18pt;">- Predicting course ratings, RMP dataset</div><br/><br/>

In [4]:
import torch

In [5]:
# Execute this cell only in Google Colab
!pip install transformers
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [6]:
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import Trainer
from datasets import load_metric
from sklearn.model_selection import train_test_split

In [7]:
# Execute this cell only in Google Colab
from google.colab import files
import io

In [8]:
class RMPDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with their labels.

    ``encodings`` is a mapping from field name to a per-example indexable
    sequence; ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [9]:
# Choose the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    # Report how many GPUs are visible and the name of the first one.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [11]:
# Execute this cell only in Google Colab: reads the RMP sample from a CSV
# located in the current working directory.
rmp = pd.read_csv("rmp_small.csv")

In [None]:
# Alternative to the Colab cell: read a file of the same name from the data/ subfolder.
rmp = pd.read_csv("data/rmp_small.csv") #for local drive

In [15]:
# Preview the loaded DataFrame.
rmp

Unnamed: 0,review,score
0,"he can be a mean man sometimes, but just get o...",2
1,"He's fun, but he speeds through lectures and l...",2
2,Dr. Foster is an amazing teacher. One of my fa...,4
3,She is one of the greatest teacher I have ever...,4
4,Professor Bird was very easy and relaxed. If y...,3
...,...,...
97540,Charismatic and enthusiastic but that's it . H...,0
97541,"Great books - Dickens, Eliot, etc. A little d...",3
97542,Awesome professor. Doesnt stop explaining unti...,4
97543,"He has a really high standard, but grades fair...",4


In [12]:
# Pull the review texts and their scores out of the frame as plain Python lists.
sentences = rmp['review'].tolist()
labels = rmp['score'].tolist()

In [13]:
# Remove reviews that are empty or missing (NaN/None) from both lists,
# keeping `sentences` and `labels` aligned index-for-index.
selnan = [idx for idx, text in enumerate(sentences) if text == '' or pd.isnull(text)]

# Membership tests run once per row, so look indices up in a set rather than
# scanning the list each time (the list scan is quadratic in the worst case).
drop_idx = set(selnan)
sentences = [text for idx, text in enumerate(sentences) if idx not in drop_idx]
labels = [label for idx, label in enumerate(labels) if idx not in drop_idx]

# Both lengths are displayed so a mismatch is visible immediately.
(len(sentences), len(labels))

(97545, 97545)

In [14]:
# Hold out 5% of the (review, label) pairs as the test set; the fixed seed
# makes the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    labels,
    test_size=0.05,
    random_state=42,
    shuffle=True,
)

In [15]:
# Name of the pretrained checkpoint; the tokenizer and the classification model
# in the following cells are both loaded from it.
checkpoint = "bert-base-uncased"

In [16]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Truncate over-long reviews but do NOT pad here. With padding=True every
# example would be padded to the longest sequence in the whole split, which
# wastes memory and compute. The DataCollatorWithPadding handed to the Trainer
# pads each batch to its own longest sequence instead.
train_encodings = tokenizer(X_train, truncation=True)
#val_encodings = tokenizer(X_val, truncation=True)
test_encodings = tokenizer(X_test, truncation=True)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [17]:
# Wrap each tokenized split together with its labels in an RMPDataset.
train_dataset = RMPDataset(encodings=train_encodings, labels=y_train)
#val_dataset = RMPDataset(val_encodings, y_val)
test_dataset = RMPDataset(encodings=test_encodings, labels=y_test)

In [18]:
# Collator that pads the examples of a batch using the tokenizer's padding rules.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Training-only configuration: no evaluation passes and no checkpoints are
# written while training runs.
training_args = TrainingArguments(
    output_dir="RMP_5_labels",
    num_train_epochs=3,
    per_device_train_batch_size=32,  # batch size per device during training
    evaluation_strategy="no",
    save_strategy="no",
)

# Echo the resolved arguments (including library defaults) as the cell output.
training_args

TrainingArguments(
_n_gpu=1,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=IntervalStrategy.NO,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_min_num_params=0,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=HubStrategy.EVERY_SAVE,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=

In [19]:
# Load the pretrained checkpoint with a 5-class sequence-classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=5,
)

# No eval_dataset is supplied: evaluation is done after training via trainer.predict.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
#    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# Accuracy metric object, loaded once and reused by compute_metrics.
# NOTE(review): `load_metric` is deprecated in newer `datasets` releases in
# favour of the separate `evaluate` package - confirm against the installed version.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Compute accuracy for a (logits, labels) pair.

    The predicted class of each row is the argmax of its logits over the
    last axis; the result is whatever ``metric.compute`` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted_classes)


Downloading builder script:   0%|          | 0.00/1.65k [00:00<?, ?B/s]

In [21]:
trainer.train()

***** Running training *****
  Num examples = 92667
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 8688


Step,Training Loss
500,1.0576
1000,0.9911
1500,0.9786
2000,0.9658
2500,0.9633
3000,0.9386
3500,0.8746
4000,0.8816
4500,0.8698
5000,0.8677




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=8688, training_loss=0.8631861495269157, metrics={'train_runtime': 7317.5967, 'train_samples_per_second': 37.991, 'train_steps_per_second': 1.187, 'total_flos': 2.100122009433743e+16, 'train_loss': 0.8631861495269157, 'epoch': 3.0})

In [22]:
# Run the fine-tuned model on the held-out test set.
predictions = trainer.predict(test_dataset)
print(predictions.predictions.shape, predictions.label_ids.shape)

# The two bare slice expressions that used to sit here were removed: only the
# last expression of a cell is displayed, so they had no effect.

# Test-set accuracy; as the final expression this is the cell's displayed output.
compute_metrics((predictions.predictions, predictions.label_ids))

***** Running Prediction *****
  Num examples = 4878
  Batch size = 8


(4878, 5) (4878,)


{'accuracy': 0.5694956949569495}

In [23]:
# Predicted class per test example: index of the largest logit in each row.
# One vectorized argmax replaces building a pandas Series per row.
y_pred = np.argmax(predictions.predictions, axis=-1).tolist()

# Confusion matrix (rows: actual, columns: predicted), then scaled by the
# grand total so that all cells sum to 1.
cfm = pd.crosstab(np.array(y_test), np.array(y_pred), rownames=['Actual'], colnames=['Predicted'])
cfm = cfm / cfm.sum().sum()

In [24]:
# Show the confusion matrix; each cell is a fraction of all test examples.
cfm

Predicted,0,1,2,3,4
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.115416,0.027675,0.00697,0.00205,0.008815
1,0.035875,0.034235,0.026035,0.00779,0.006765
2,0.011275,0.021115,0.03895,0.036285,0.017015
3,0.00205,0.00574,0.0287,0.102091,0.110291
4,0.000205,0.00164,0.00861,0.065601,0.278803


In [25]:
# Echo the Trainer object; this displays only its default repr.
trainer

<transformers.trainer.Trainer at 0x7f458f4f60d0>

In [None]:
# Rebind `checkpoint` to a different pretrained model name. The tokenizer and
# model created in earlier cells are not affected unless those cells are re-run.
checkpoint = "distilbert-base-uncased"