In [1]:
!pip install datasets transformers huggingface_hub

Collecting datasets
  Obtaining dependency information for datasets from https://files.pythonhosted.org/packages/ed/a5/33cf000137545a08b0a3a6ea76c8ccbd87917f78bb5d737f9f56f3b11ef6/datasets-3.1.0-py3-none-any.whl.metadata
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Obtaining dependency information for pyarrow>=15.0.0 from https://files.pythonhosted.org/packages/d1/db/42ac644453cfdfc60fe002b46d647fe7a6dfad753ef7b28e99b4c936ad5d/pyarrow-17.0.0-cp38-cp38-win_amd64.whl.metadata
  Downloading pyarrow-17.0.0-cp38-cp38-win_amd64.whl.metadata (3.4 kB)
Collecting requests>=2.32.2 (from datasets)
  Obtaining dependency information for requests>=2.32.2 from https://files.pythonhosted.org/packages/f9/9b/335f9764261e915ed497fcdeb11df5dfd6f7bf257d4a6a2a686d80da4d54/requests-2.32.3-py3-none-any.whl.metadata
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting tqdm>=4.66.3 (from datasets)
  Obtaining dependency infor

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

train_data = pd.read_csv("train.csv",on_bad_lines='skip')
test_data = pd.read_csv("test.csv", on_bad_lines='skip')

train_data['prompt'] = train_data['prompt'].str.replace('_comma_', ',')
test_data['prompt'] = test_data['prompt'].str.replace('_comma_', ',')

label_mapping = {label: i for i, label in enumerate(train_data['context'].unique())}
train_data['label'] = train_data['context'].map(label_mapping)
test_data['label'] = test_data['context'].map(label_mapping)

train_dataset = Dataset.from_pandas(train_data[['prompt', 'label']][:20000])
test_dataset = Dataset.from_pandas(test_data[['prompt', 'label']])

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_function(examples):
    return tokenizer(examples["prompt"], padding="max_length", truncation=True)

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True)

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=len(label_mapping))

training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    load_best_model_at_end=True,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
)

trainer.train()

model.save_pretrained("./Sentiment_Model")
tokenizer.save_pretrained("./Sentiment_Model")

metrics = trainer.evaluate()
print(metrics)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5701 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Step,Training Loss,Validation Loss
500,2.2516,2.496484
1000,1.8948,1.979252
1500,1.1622,1.77302
2000,0.9991,1.758624
2500,0.8069,1.797523
3000,0.548,1.871207
3500,0.4538,1.921195
4000,0.35,2.025475
4500,0.2194,2.126483
5000,0.2443,2.189822


{'eval_loss': 1.7586244344711304, 'eval_runtime': 79.4524, 'eval_samples_per_second': 71.754, 'eval_steps_per_second': 8.974, 'epoch': 3.0}


In [12]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd

train_data = pd.read_csv("Datasets/train.csv",on_bad_lines='skip')
test_data = pd.read_csv("Datasets/test.csv", on_bad_lines='skip')

train_data['prompt'] = train_data['prompt'].str.replace('_comma_', ',')
test_data['prompt'] = test_data['prompt'].str.replace('_comma_', ',')

label_mapping = {label: i for i, label in enumerate(train_data['context'].unique())}

model_name_or_path = "Sentiment_Model"  
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
model = AutoModelForSequenceClassification.from_pretrained(model_name_or_path)

input_text = ["I had a argument with my boss, it was his fault.",
              "I received concert tickets for Christmas!"]

inputs = tokenizer(input_text, padding=True, truncation=True, return_tensors="pt")

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)

reverse_label_mapping = {v: k for k, v in label_mapping.items()}
predicted_labels = [reverse_label_mapping[label_id] for label_id in predictions.tolist()]

print(predicted_labels)

['angry', 'joyful']
