In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
    pipeline
)
from datasets import Dataset
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from huggingface_hub import notebook_login

# Step 1: Load and Verify Dataset
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')
print("Dataset Preview:")
print(df.head())  # Verify dataset structure

# Step 2: Data Preprocessing
df['label'] = df['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
df = df[['review', 'label']]  # Keep only necessary columns

# Split data into train (80%), validation (10%), test (10%)
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Step 3: Model & Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_function(examples):
    return tokenizer(
        examples['review'],
        padding="max_length",    # Pad to max_length
        truncation=True,
        max_length=256,          # Enforce max sequence length
        return_tensors="pt"
    )

# Convert pandas DataFrames to Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df).map(tokenize_function, batched=True)
val_dataset = Dataset.from_pandas(val_df).map(tokenize_function, batched=True)
test_dataset = Dataset.from_pandas(test_df).map(tokenize_function, batched=True)

# Step 4: Model Initialization
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2
)

# Training Configuration
training_args = TrainingArguments(
    output_dir='./results',
    eval_strategy="epoch",        # Evaluate every epoch
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=2,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    report_to="none"             # Disable external logging services
)

# Metrics Calculation
def compute_metrics(p):
    predictions, labels = p
    preds = predictions.argmax(axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary'
    )
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the Model
print("\nStarting Training...")
trainer.train()

# Step 5: Save and Upload Model
model_output_dir = './imdb-sentiment-distilbert'
tokenizer_output_dir = './imdb-sentiment-distilbert'

# Save Locally
model.save_pretrained(model_output_dir)
tokenizer.save_pretrained(tokenizer_output_dir)
print("\nModel saved locally at:", model_output_dir)

# Upload to Hugging Face Hub
notebook_login()  # Follow the prompt to enter your token

model.push_to_hub("your-username/imdb-sentiment-distilbert")
tokenizer.push_to_hub("your-username/imdb-sentiment-distilbert")
print("\nModel uploaded to Hugging Face Hub!")

# Final Evaluation
test_results = trainer.evaluate(test_dataset)
print("\nTest Set Performance:")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Dataset Preview:
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive


Map:   0%|          | 0/40000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Starting Training...




Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2235,0.212462,0.913,0.926507,0.897241,0.911639
2,0.1345,0.226808,0.9206,0.919124,0.922431,0.920774





Model saved locally at: ./imdb-sentiment-distilbert


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

HfHubHTTPError: 401 Client Error: Unauthorized for url: https://huggingface.co/api/repos/create (Request ID: Root=1-67aba1fd-28b9144308da780770aab535;7bab31e1-1a47-4752-ad53-a5d40e9837f3)

Invalid username or password.

In [11]:
from huggingface_hub import notebook_login

notebook_login()  # Enter your token when prompted

your_username = "Rasmuzeri"  # Define your username HERE (replace with your actual username)
repo_name = "imdb-sentiment-distilbert"

model.push_to_hub(f"{your_username}/{repo_name}")
tokenizer.push_to_hub(f"{your_username}/{repo_name}")

print("\nModel uploaded to Hugging Face Hub!")

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]


Model uploaded to Hugging Face Hub!
