In [1]:
!pip install transformers

Defaulting to user installation because normal site-packages is not writeable


In [3]:
!pip install transformers==4.28.1 torch datasets evaluate accelerate


Defaulting to user installation because normal site-packages is not writeable
Collecting transformers==4.28.1
  Using cached transformers-4.28.1-py3-none-any.whl.metadata (109 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.1)
  Using cached tokenizers-0.13.3.tar.gz (314 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Using cached transformers-4.28.1-py3-none-any.whl (7.0 MB)
Building wheels for collected packages: tokenizers
  Building wheel for tokenizers (pyproject.toml): started
  Building wheel for tokenizers (pyproject.toml): finished with status 'error'
Failed to build tokenizers


  error: subprocess-exited-with-error
  
  Building wheel for tokenizers (pyproject.toml) did not run successfully.
  exit code: 1
  
  [62 lines of output]
  !!
  
          ********************************************************************************
          Please consider removing the following classifiers in favor of a SPDX license expression:
  
          License :: OSI Approved :: Apache Software License
  
          See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details.
          ********************************************************************************
  
  !!
    self._finalize_license_expression()
  running bdist_wheel
  running build
  running build_py
  creating build\lib.win-amd64-cpython-313\tokenizers
  copying py_src\tokenizers\__init__.py -> build\lib.win-amd64-cpython-313\tokenizers
  creating build\lib.win-amd64-cpython-313\tokenizers\models
  copying py_src\tokenizers\models\__init__.py -> build\lib.win-amd64-cpyth

In [4]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from datasets import load_dataset
from sklearn.metrics import classification_report, accuracy_score
import warnings

In [5]:
warnings.filterwarnings("ignore")

In [6]:
def load_tweet_data():
    print("Loading tweet_eval dataset...")
    dataset = load_dataset('tweet_eval', 'sentiment')
    return dataset['train'], dataset['validation'], dataset['test']


In [7]:
def load_model():
    print("Loading model and tokenizer...")
    # Using a model specifically fine-tuned for Twitter sentiment
    model_name = "cardiffnlp/twitter-roberta-base-sentiment"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    return tokenizer, model

In [8]:
def create_pipeline(tokenizer, model):
    print("Creating sentiment analysis pipeline...")
    sentiment_analyzer = pipeline(
        "text-classification",
        model=model,
        tokenizer=tokenizer,
        top_k=1
    )
    return sentiment_analyzer

In [9]:
def evaluate_model(pipeline, test_data, label_names):
    print("\nEvaluating model on test data (first 100 samples)...")
    test_samples = test_data.select(range(100))
    
    true_labels = test_samples['label']
    pred_labels = []
    
    for text in test_samples['text']:
        result = pipeline(text)[0][0]
        pred_labels.append(int(result['label'].split('_')[-1]))
    
    print("\nClassification Report:")
    print(classification_report(
        true_labels, 
        pred_labels, 
        target_names=label_names,
        zero_division=0
    ))
    print(f"Accuracy: {accuracy_score(true_labels, pred_labels):.2f}")

In [10]:
def main():
    # Load data and model
    train_data, val_data, test_data = load_tweet_data()
    tokenizer, model = load_model()
    sentiment_pipeline = create_pipeline(tokenizer, model)
    
    # Define label names
    label_names = ['Negative', 'Neutral', 'Positive']
    
    # Evaluate
    evaluate_model(sentiment_pipeline, test_data, label_names)
    
    # Example usage with better formatting
    print("\nExample predictions:")
    sample_tweets = [
        "I love this new feature! It's amazing!",
        "This is the worst experience I've ever had.",
        "The weather is okay today, nothing special.",
        "The service was terrible and the staff was rude!",
        "Just had the best meal of my life at this restaurant!"
    ]
    
    for tweet in sample_tweets:
        result = sentiment_pipeline(tweet)[0][0]
        sentiment = label_names[int(result['label'].split('_')[-1])]
        print(f"\nTweet: {tweet}")
        print(f"Sentiment: {sentiment} (Confidence: {result['score']:.2f})")

if __name__ == "__main__":
    main()

Loading tweet_eval dataset...
Loading model and tokenizer...


Device set to use cpu


Creating sentiment analysis pipeline...

Evaluating model on test data (first 100 samples)...

Classification Report:
              precision    recall  f1-score   support

    Negative       0.58      0.66      0.61        29
     Neutral       0.77      0.67      0.72        55
    Positive       0.74      0.88      0.80        16

    accuracy                           0.70       100
   macro avg       0.69      0.73      0.71       100
weighted avg       0.71      0.70      0.70       100

Accuracy: 0.70

Example predictions:

Tweet: I love this new feature! It's amazing!
Sentiment: Positive (Confidence: 0.99)

Tweet: This is the worst experience I've ever had.
Sentiment: Negative (Confidence: 0.98)

Tweet: The weather is okay today, nothing special.
Sentiment: Positive (Confidence: 0.92)

Tweet: The service was terrible and the staff was rude!
Sentiment: Negative (Confidence: 0.98)

Tweet: Just had the best meal of my life at this restaurant!
Sentiment: Positive (Confidence: 0.99)

In [12]:

import torch
import transformers
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)
from datasets import load_dataset
from evaluate import load
import numpy as np

# Verify environment
print(f"PyTorch version: {torch.__version__}")
print(f"Transformers version: {transformers.__version__}")

# 2. Load dataset
print("\nLoading dataset...")
try:
    dataset = load_dataset('tweet_eval', 'sentiment')
    train_data = dataset['train'].select(range(1000))  # Small subset for testing
    val_data = dataset['validation'].select(range(200))
    print(f"Loaded {len(train_data)} training and {len(val_data)} validation samples")
except Exception as e:
    print(f"Error loading dataset: {e}")
    raise

# 3. Model setup
print("\nLoading model...")
model_name = "distilbert-base-uncased"
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=3,
        id2label={0: "Negative", 1: "Neutral", 2: "Positive"},
        label2id={"Negative": 0, "Neutral": 1, "Positive": 2}
    )
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    raise

# 4. Tokenization
print("\nTokenizing data...")
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128
    )

try:
    tokenized_train = train_data.map(tokenize_function, batched=True)
    tokenized_val = val_data.map(tokenize_function, batched=True)
    print("Tokenization complete!")
except Exception as e:
    print(f"Error tokenizing data: {e}")
    raise

# 5. Training configuration with version compatibility
print("\nSetting up training...")
try:
    # Try newest parameter names first (v4.x+)
    args = TrainingArguments(
        output_dir="./sentiment_results",
        num_train_epochs=10,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        eval_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
        optim="adamw_torch",
        learning_rate=2e-5,
        weight_decay=0.01
    )
    print("Using v4.x+ parameter names")
except TypeError:
    # Fallback to older parameter names (pre-v4)
    args = TrainingArguments(
        output_dir="./sentiment_results",
        num_train_epochs=1,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        evaluation_strategy="steps",
        eval_steps=50,
        save_strategy="steps",
        save_steps=50,
        logging_steps=10,
        load_best_model_at_end=True,
        report_to="none",
        optim="adamw_torch",
        learning_rate=2e-5,
        weight_decay=0.01
    )
    print("Using pre-v4 parameter names")

# 6. Metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = load("accuracy")
    return accuracy.compute(predictions=predictions, references=labels)

# 7. Initialize Trainer
try:
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
    )
    print("Trainer initialized successfully!")
except Exception as e:
    print(f"Error initializing trainer: {e}")
    raise

# 8. Train and save
print("\nStarting training...")
try:
    trainer.train()
    trainer.save_model("./final_sentiment_model")
    print("Training complete! Model saved.")
except Exception as e:
    print(f"Error during training: {e}")
    raise

# 9. Test prediction
print("\nRunning test prediction...")
sample_texts = [
    "This product works great!",
    "I'm very disappointed with this service",
    "The item was okay, nothing special"
]

for text in sample_texts:
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    predicted_class = torch.argmax(outputs.logits).item()
    print(f"\nText: {text}")
    print(f"Predicted sentiment: {model.config.id2label[predicted_class]}")

PyTorch version: 2.7.1+cpu
Transformers version: 4.54.1

Loading dataset...
Loaded 1000 training and 200 validation samples

Loading model...


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully!

Tokenizing data...
Tokenization complete!

Setting up training...
Using v4.x+ parameter names
Trainer initialized successfully!

Starting training...


Step,Training Loss,Validation Loss,Accuracy
50,1.0469,1.016289,0.43
100,0.8774,0.890565,0.585
150,0.704,0.802343,0.6
200,0.6281,0.787816,0.625
250,0.4656,0.755515,0.655
300,0.4143,0.903981,0.615
350,0.3422,0.959772,0.61
400,0.151,0.927144,0.64
450,0.1928,1.065688,0.64
500,0.2346,1.224716,0.66


Training complete! Model saved.

Running test prediction...

Text: This product works great!
Predicted sentiment: Positive

Text: I'm very disappointed with this service
Predicted sentiment: Negative

Text: The item was okay, nothing special
Predicted sentiment: Positive


In [15]:
# In your Jupyter Notebook - after training is complete
from transformers import DistilBertForSequenceClassification, AutoTokenizer
import os

# 1. Save the model and tokenizer
model.save_pretrained("./final_sentiment_model")
tokenizer.save_pretrained("./final_sentiment_model")

# 2. Verify the files (Windows compatible version)
print("Saved files:")
for file in os.listdir("./final_sentiment_model"):
    print(f"- {file}")

# Alternative verification (shows full paths)
from pathlib import Path
model_dir = Path("./final_sentiment_model")
print("\nFull paths:")
for file in model_dir.glob('*'):
    print(file)

Saved files:
- config.json
- model.safetensors
- special_tokens_map.json
- tokenizer.json
- tokenizer_config.json
- training_args.bin
- vocab.txt

Full paths:
final_sentiment_model\config.json
final_sentiment_model\model.safetensors
final_sentiment_model\special_tokens_map.json
final_sentiment_model\tokenizer.json
final_sentiment_model\tokenizer_config.json
final_sentiment_model\training_args.bin
final_sentiment_model\vocab.txt
