In [None]:
# Install Kaggle API
!pip install kaggle

# Upload kaggle.json
from google.colab import files
files.upload()  # Upload kaggle.json

# Setup Kaggle API credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download Sentiment140 dataset
!kaggle datasets download -d kazanova/sentiment140
!unzip sentiment140.zip

# Save to sentiment140_clean.csv
import pandas as pd
df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='ISO-8859-1', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df.to_csv("sentiment140_clean.csv", index=False)

In [None]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding
)
from datasets import Dataset
import pandas as pd
from pathlib import Path

# Number of rows drawn from the cleaned CSV for fine-tuning.
SUBSET_SIZE = 50_000

# Directory used both as the Trainer output dir and as the final save location
# for the fine-tuned model and tokenizer.
MODEL_DIR = Path("distilbert_finetuned")

def fine_tune_model():
    """Fine-tune DistilBERT for binary sentiment classification.

    Steps:
      1. Load ``distilbert-base-uncased`` (2 labels) and its tokenizer.
      2. Read ``sentiment140_clean.csv`` from the working directory, sample up
         to ``SUBSET_SIZE`` rows and map ``target`` (0 -> 0, 4 -> 1) to ``label``.
      3. Tokenize ``text`` (truncated to 128 tokens) and split 80/20 into
         train/test with a fixed seed.
      4. Train for one epoch, evaluate on the held-out split, and save the
         model and tokenizer to ``MODEL_DIR``.

    Returns:
        None. Progress, evaluation metrics and the save location are printed.
    """
    model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

    df = pd.read_csv("sentiment140_clean.csv")
    # DataFrame.sample raises when n exceeds the number of rows (sampling
    # without replacement), so cap the request at the frame size.
    df = df.sample(n=min(SUBSET_SIZE, len(df)), random_state=42)
    df['label'] = df['target'].map({0: 0, 4: 1})
    # map() yields NaN for any target outside {0, 4}, which turns the column
    # into floats; drop those rows and restore an integer dtype so the model
    # treats the task as single-label classification.
    labelled = (
        df[['text', 'label']]
        .dropna()
        .assign(label=lambda frame: frame['label'].astype('int64'))
    )
    # preserve_index=False keeps the shuffled pandas index from being stored
    # as an extra "__index_level_0__" column in the dataset.
    dataset = Dataset.from_pandas(labelled, preserve_index=False)
    # No padding here: DataCollatorWithPadding pads each training batch to its
    # own longest sequence, which is cheaper than padding 1000-row map chunks.
    encoded_dataset = dataset.map(
        lambda batch: tokenizer(batch['text'], truncation=True, max_length=128),
        batched=True,
    )
    encoded_dataset.set_format('torch', columns=['input_ids', 'attention_mask', 'label'])
    # Seed the split so the train/test partition is reproducible across runs.
    dataset_split = encoded_dataset.train_test_split(test_size=0.2, seed=42)

    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Save/evaluation scheduling is not set here, so the library defaults apply
    # during training; evaluation is run explicitly once training finishes.
    training_args = TrainingArguments(
        output_dir=str(MODEL_DIR),
        num_train_epochs=1,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        logging_dir='./logs',
        logging_steps=100,
        report_to="none",
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_split["train"],
        eval_dataset=dataset_split["test"],
        tokenizer=tokenizer,
        data_collator=data_collator
    )

    trainer.train()

    # Use the held-out split that was passed to the Trainer; without this call
    # it would never be evaluated.
    eval_metrics = trainer.evaluate()
    print("Evaluation metrics:", eval_metrics)

    model.save_pretrained(MODEL_DIR)
    tokenizer.save_pretrained(MODEL_DIR)
    print("✅ Model and tokenizer saved in:", MODEL_DIR)

# Entry point: start fine-tuning only when this code is executed directly
# (i.e. __name__ is "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    fine_tune_model()
