In [1]:
!pip install -q transformers datasets

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m481.3/491.2 kB[0m [31m48.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/183.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/143.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import numpy as np
import pandas as pd
import torch
import os
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments,
    AutoConfig
)
from datasets import Dataset
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [3]:
#config path of kaggle.json
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
#api to get dataset from kaggle
!kaggle datasets download -d kazanova/sentiment140

Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other


In [5]:
#extracting zip file
!unzip /content/sentiment140.zip

Archive:  /content/sentiment140.zip
  inflating: training.1600000.processed.noemoticon.csv  


In [6]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cuda


In [7]:
df = pd.read_csv('/content/training.1600000.processed.noemoticon.csv',
                encoding='latin-1',
                names=['target', 'id', 'date', 'flag', 'user', 'text'])

In [9]:
print("Sample data:")
print(df.head())

# Check the sentiment distribution
print("Sentiment distribution:")
print(df['target'].value_counts())

Sample data:
   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  
Sentiment distribution:
target
0    800000
4    800000
Name: count, dtype: int64


In [10]:
# For simplicity, we'll use a subset of the data for faster training
# You can increase this if you have more computational resources
SAMPLE_SIZE = 100000  # Adjust based on your computational resources
print(f"Using {SAMPLE_SIZE} samples for training")

Using 100000 samples for training


In [11]:
# Balance the dataset by sampling equally from positive and negative classes
df_positive = df[df['target'] == 4].sample(SAMPLE_SIZE // 2, random_state=42)
df_negative = df[df['target'] == 0].sample(SAMPLE_SIZE // 2, random_state=42)
df_sampled = pd.concat([df_positive, df_negative]).reset_index(drop=True)

In [12]:
# Remap the labels to 0 and 1 (negative and positive)
df_sampled['target'] = df_sampled['target'].map({0: 0, 4: 1})

# Prepare the input format for the model
df_sampled['input'] = 'TEXT1: ' + df_sampled['text']

In [14]:
# Split the dataset into train, validation, and test sets
train_df, temp_df = train_test_split(df_sampled, test_size=0.3, random_state=42, stratify=df_sampled['target'])
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42, stratify=temp_df['target'])

In [16]:
print(f"Train set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")
print(f"Test set size: {len(test_df)}")

Train set size: 70000
Validation set size: 15000
Test set size: 15000


In [17]:
# Initialize the tokenizer
model_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [35]:
# Define the tokenization function - IMPORTANT: Rename 'target' to 'labels'
def tokenize(examples):
    tokenized = tokenizer(examples['input'], padding='max_length', truncation=True, max_length=128)
    # Very important: rename 'target' to 'labels' as that's what the model expects
    tokenized['labels'] = examples['target']
    return tokenized

In [36]:
# Create Dataset objects
train_ds = Dataset.from_pandas(train_df)
val_ds = Dataset.from_pandas(val_df)
test_ds = Dataset.from_pandas(test_df)

In [37]:
# Apply tokenization
tokenized_train_ds = train_ds.map(tokenize, batched=True)
tokenized_val_ds = val_ds.map(tokenize, batched=True)
tokenized_test_ds = test_ds.map(tokenize, batched=True)

Map:   0%|          | 0/70000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [38]:
# Print column names to see what's available
print("Available columns in tokenized_train_ds:", tokenized_train_ds.column_names)

Available columns in tokenized_train_ds: ['target', 'id', 'date', 'flag', 'user', 'text', 'input', '__index_level_0__', 'input_ids', 'attention_mask', 'labels']


In [39]:
# First, identify which columns exist and should be removed
columns_to_remove = []
for col in ['id', 'date', 'flag', 'user', 'text', 'input', 'target', '__index_level_0__']:
    if col in tokenized_train_ds.column_names:
        columns_to_remove.append(col)

print(f"Removing these columns: {columns_to_remove}")

Removing these columns: ['id', 'date', 'flag', 'user', 'text', 'input', 'target', '__index_level_0__']


In [40]:
# Now remove only the columns that exist
tokenized_train_ds = tokenized_train_ds.remove_columns(columns_to_remove)
tokenized_val_ds = tokenized_val_ds.remove_columns(columns_to_remove)
tokenized_test_ds = tokenized_test_ds.remove_columns(columns_to_remove)

In [41]:
# Set format for PyTorch
tokenized_train_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_val_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
tokenized_test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [42]:
# Verify the dataset structure - should contain 'labels'
print("Training dataset features:", tokenized_train_ds.features)
print("Sample from training dataset:", tokenized_train_ds[0])

Training dataset features: {'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None), 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None), 'labels': Value(dtype='int64', id=None)}
Sample from training dataset: {'input_ids': tensor([  101,  3793,  2487,  1024,  1030, 19863,  6499,  2497, 24096,  1045,
         1005,  1049,  2025,  2183,  2000,  2022,  2583,  2000,  2191,  2009,
         2000,  1996,  2283,  9317,  1045,  2031,  1037, 24385, 10439,  2102,
         1012,  7977,  1012,   102,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,   

In [43]:
# Set up training parameters
batch_size = 32  # Reduced batch size for Colab
epochs = 3
learning_rate = 1e-5
num_labels = 2  # Positive and Negative

In [44]:
# Define metrics for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Calculate accuracy
    accuracy = np.mean(predictions == labels)

    # Calculate F1 score (weighted average)
    from sklearn.metrics import f1_score
    f1 = f1_score(labels, predictions, average='weighted')

    return {
        'accuracy': accuracy,
        'f1': f1
    }

In [45]:
# Create output directory for saving checkpoints
import os
os.makedirs('./results', exist_ok=True)

# Mount Google Drive to save model (optional but recommended)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    output_dir = '/content/drive/MyDrive/sentiment140_model'
    os.makedirs(output_dir, exist_ok=True)
    using_drive = True
    print("Google Drive mounted successfully.")
except:
    output_dir = './sentiment140_model'
    os.makedirs(output_dir, exist_ok=True)
    using_drive = False
    print("Failed to mount Google Drive. Model will be saved locally.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [46]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=learning_rate,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    fp16=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size * 2,
    num_train_epochs=epochs,
    weight_decay=0.05,
    report_to='none',
    logging_steps=100,
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    # Add gradient checkpointing to save memory
    gradient_checkpointing=True,
    # Add gradient accumulation to handle smaller batch sizes
    gradient_accumulation_steps=2,
)



In [47]:
# Configure and initialize the model
config = AutoConfig.from_pretrained(model_name)
config.num_labels = num_labels
config.hidden_dropout_prob = 0.2
config.attention_probs_dropout_prob = 0.2

model = DistilBertForSequenceClassification.from_pretrained(model_name, config=config)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [48]:
# Initialize the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [49]:
# Display model configuration
print("Model configuration:")
print(model.config)

Model configuration:
DistilBertConfig {
  "_attn_implementation_autoset": true,
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "attention_probs_dropout_prob": 0.2,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "hidden_dropout_prob": 0.2,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.50.3",
  "vocab_size": 30522
}



In [50]:
# Option to reduce dataset size further if Colab runs out of memory
if device.type == 'cuda' and torch.cuda.get_device_properties(0).total_memory < 15e9:  # Less than 15GB VRAM
    print("Running on a limited GPU. Reducing sample size to 50,000")
    SAMPLE_SIZE = 50000

In [51]:
# Train the model
print("Starting training...")
trainer.train()

Starting training...


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.405,0.397856,0.825533,0.825337
2,0.3431,0.380541,0.833667,0.833666


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.405,0.397856,0.825533,0.825337
2,0.3431,0.380541,0.833667,0.833666
3,0.3285,0.383894,0.832667,0.832657


TrainOutput(global_step=3282, training_loss=0.3798116466608228, metrics={'train_runtime': 771.4235, 'train_samples_per_second': 272.224, 'train_steps_per_second': 4.254, 'total_flos': 6954538429440000.0, 'train_loss': 0.3798116466608228, 'epoch': 3.0})

In [52]:
# Evaluate on the test set
print("Evaluating on test set...")
test_results = trainer.evaluate(tokenized_test_ds)

Evaluating on test set...


In [53]:
print("Test set results:")
print(f"Loss: {test_results['eval_loss']:.4f}")
print(f"Accuracy: {test_results['eval_accuracy']:.4f}")
print(f"F1 Score: {test_results['eval_f1']:.4f}")

Test set results:
Loss: 0.3816
Accuracy: 0.8346
F1 Score: 0.8346


In [54]:
# Generate detailed classification report
print("\nDetailed Classification Report:")
predictions = trainer.predict(tokenized_test_ds)
preds = np.argmax(predictions.predictions, axis=-1)
print(classification_report(tokenized_test_ds['labels'], preds))


Detailed Classification Report:


              precision    recall  f1-score   support

           0       0.84      0.83      0.83      7500
           1       0.83      0.84      0.84      7500

    accuracy                           0.83     15000
   macro avg       0.83      0.83      0.83     15000
weighted avg       0.83      0.83      0.83     15000



In [55]:
# Save the model
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")

Model saved to /content/drive/MyDrive/sentiment140_model


In [56]:
# Function to classify new text
def classify_text(text):
    input_text = f"TEXT1: {text}"
    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding=True)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=1)
    return probabilities[0].cpu().tolist()

In [57]:
# Define class names
def get_class_names():
    return {
        0: "negative",
        1: "positive"
    }

# Command line interface for testing
print("\nEnter text to classify (or 'quit' to exit):")
class_names = get_class_names()


Enter text to classify (or 'quit' to exit):


In [58]:
# Try to create interactive widgets if possible
try:
    from IPython.display import display
    import ipywidgets as widgets

    def classify_and_display(text):
        if not text.strip():
            return "Please enter some text to classify."

        probabilities = classify_text(text)
        result = "Classification results:\n"

        for i, prob in enumerate(probabilities):
            result += f"{class_names[i]}: {prob:.2%}\n"

        predicted_class = np.argmax(probabilities)
        result += f"\nTop prediction: {class_names[predicted_class]} ({probabilities[predicted_class]:.2%})"
        return result

    # Create widgets for interactive demo
    text_input = widgets.Textarea(
        value='',
        placeholder='Enter text to classify sentiment',
        description='Text:',
        disabled=False,
        layout=widgets.Layout(width='80%', height='100px')
    )

    button = widgets.Button(
        description='Classify',
        disabled=False,
        button_style='primary',
        tooltip='Classify the text',
        icon='check'
    )

    output = widgets.Output()

    def on_button_clicked(b):
        with output:
            output.clear_output()
            print(classify_and_display(text_input.value))

    button.on_click(on_button_clicked)

    # Display the interactive demo
    print("Interactive Sentiment Classifier:")
    display(text_input)
    display(button)
    display(output)

    print("Use the interactive widgets above to classify text, or use the input prompt below:")

except:
    print("Interactive widgets not available. Using command line interface.")

Interactive Sentiment Classifier:


Textarea(value='', description='Text:', layout=Layout(height='100px', width='80%'), placeholder='Enter text to…

Button(button_style='primary', description='Classify', icon='check', style=ButtonStyle(), tooltip='Classify th…

Output()

Use the interactive widgets above to classify text, or use the input prompt below:


In [59]:
# Command line interface as fallback
while True:
    user_input = input("> ")

    if user_input.lower() == 'quit':
        print("Goodbye!")
        break

    probabilities = classify_text(user_input)

    print("Classification results:")

    for i, prob in enumerate(probabilities):
        print(f"{class_names[i]}: {prob:.2%}")

    predicted_class = np.argmax(probabilities)
    print(f"\nTop prediction: {class_names[predicted_class]} ({probabilities[predicted_class]:.2%})")

> im smart
Classification results:
negative: 2.96%
positive: 97.04%

Top prediction: positive (97.04%)
> donald trump
Classification results:
negative: 6.01%
positive: 93.99%

Top prediction: positive (93.99%)
> quit
Goodbye!
