In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


In [None]:
# Unpack the dataset archive into the working directory. The -o flag overwrites
# existing files without prompting, so this cell can be re-run safely.
!unzip -o archive.zip

Archive:  archive.zip
  inflating: test.csv                
  inflating: testdata.manual.2009.06.14.csv  
  inflating: train.csv               
  inflating: training.1600000.processed.noemoticon.csv  


### Dataset


In [None]:
# Read both splits with the same decoding settings, then preview the training frame.
train_df, test_df = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in ('train.csv', 'test.csv')
)
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [None]:
# Report (rows, columns) for each split, one line per split.
print(
    "\nData shape:",
    f"Train data: {train_df.shape}",
    f"Test data: {test_df.shape}",
    sep="\n",
)


Data shape:
Train data: (27481, 10)
Test data: (4815, 9)


In [None]:
# Number of missing values in each column of the training frame.
print(train_df.isna().sum())

textID              0
text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64


### Preprocessing

In [None]:
# Discard every row that has a missing value in any column, then show how many
# examples remain per sentiment class.
train_df = train_df.dropna(axis=0, how='any')
print(train_df.loc[:, 'sentiment'].value_counts())

sentiment
neutral     11117
positive     8582
negative     7781
Name: count, dtype: int64


In [None]:
# Integer class ids for the three sentiment classes.
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}

# Train on the full tweet (`text`), not on `selected_text`.
# `selected_text` is a sub-span of `text` (see train_df.head() above) and appears
# to be the annotated sentiment-bearing phrase, so using it as model input leaks
# label information and overstates how well the model handles raw tweets.
# NOTE(review): test.csv has one column fewer than train.csv — presumably it has
# no `selected_text`, which would make a model trained on that column unusable
# on the test split. Confirm against test_df.columns.
#
# `.assign` returns a new frame, so re-running this cell is idempotent and no
# column is written into a possibly-shared frame.
train_df = train_df.assign(
    sentiment_label=train_df['sentiment'].map(sentiment_mapping),
    input_text=train_df['text'],
)

# `.map` silently produces NaN for any value missing from the mapping; fail
# loudly here instead of passing NaN labels to the trainer.
assert train_df['sentiment_label'].notna().all(), \
    "train_df['sentiment'] contains values missing from sentiment_mapping"

In [None]:
# Hold out 20% of the rows for validation. The split is stratified on the class
# id so both parts keep the same class proportions, and seeded so it is
# reproducible.
(
    train_texts,
    val_texts,
    train_labels,
    val_labels,
) = train_test_split(
    train_df['input_text'].to_list(),
    train_df['sentiment_label'].to_list(),
    stratify=train_df['sentiment_label'],
    test_size=0.2,
    random_state=42,
)

### Training


In [None]:
# Checkpoint identifier used for both the tokenizer and the model, so the
# vocabulary matches the pretrained weights.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=3 matches the three classes in sentiment_mapping. The
# classification-head weights are newly initialized (see the warning in this
# cell's output), so the model must be fine-tuned before its predictions mean
# anything.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Sequences longer than 128 tokens are truncated and shorter ones are padded
    up to 128, so every encoded example has the same length.

    Args:
        examples: Mapping with a ``"text"`` entry holding the raw strings.

    Returns:
        The encoding produced by the module-level ``tokenizer`` for those texts.
    """
    raw_texts = examples["text"]
    return tokenizer(
        raw_texts,
        max_length=128,
        truncation=True,
        padding="max_length",
    )

In [None]:
# Wrap the raw text/label lists in Hugging Face datasets.
train_dataset, val_dataset = (
    Dataset.from_dict({"text": split_texts, "label": split_labels})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (val_texts, val_labels),
    )
)

# Tokenize each split in batches (training split first, then validation).
tokenized_train, tokenized_val = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset)
)

# Training configuration: 3 epochs, batch size 16 per device, with evaluation
# and checkpointing once per epoch so the best checkpoint can be restored when
# training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to=[], # empty list: no reporting integrations, which disables wandb logging
)

Map:   0%|          | 0/21984 [00:00<?, ? examples/s]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]



In [None]:
# Bind the model, its training configuration and the two tokenized splits.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_val,
    train_dataset=tokenized_train,
)

# Fine-tune; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.2488,0.380532
2,0.2378,0.329501
3,0.2111,0.464378


TrainOutput(global_step=4122, training_loss=0.27147764950441194, metrics={'train_runtime': 777.0, 'train_samples_per_second': 84.88, 'train_steps_per_second': 5.305, 'total_flos': 2184161419763712.0, 'train_loss': 0.27147764950441194, 'epoch': 3.0})

In [None]:
# Score the validation split, then pick the highest-scoring class for each row
# (the class scores lie along the last axis of the prediction array).
predictions = trainer.predict(tokenized_val)
preds = np.argmax(predictions.predictions, axis=-1)

In [None]:
# Class id -> sentiment name (0: negative, 1: neutral, 2: positive), the inverse
# of the encoding applied to the training labels.
label_names = dict(enumerate(('negative', 'neutral', 'positive')))

# Translate both the ground truth and the model output into readable names.
true_labels = [label_names[class_id] for class_id in val_labels]
predicted_labels = [label_names[class_id] for class_id in preds]

### Results


In [None]:
# Per-class precision, recall and F1 on the validation split.
print(classification_report(y_true=true_labels, y_pred=predicted_labels))

              precision    recall  f1-score   support

    negative       0.91      0.85      0.88      1556
     neutral       0.87      0.89      0.88      2223
    positive       0.89      0.92      0.90      1717

    accuracy                           0.89      5496
   macro avg       0.89      0.89      0.89      5496
weighted avg       0.89      0.89      0.89      5496

