<a href="https://colab.research.google.com/github/ParavaVarshitaReddy/Advanced-Sentiment-Analysis-with-Transformer-Models/blob/main/Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate transformers datasets accelerate scikit-learn sentencepiece
!pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nvjitlink-cu12
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py311-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    XLMRobertaTokenizer,
    XLMRobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import evaluate

def clean_text(text):
    """Return ``text`` without surrounding whitespace, or ``""`` if it is not a string.

    Args:
        text: A raw cell value; may be a string, ``None``, NaN, or any other object.

    Returns:
        The stripped string for ``str`` input, otherwise the empty string.
    """
    # A str is never "missing", so the type check alone covers None/NaN/pd.NA.
    # Testing the type first also avoids evaluating the truth value of
    # ``pd.isna(<list or array>)``, which raises ValueError for multi-element input.
    if not isinstance(text, str):
        return ""
    return text.strip()

def load_data(train_size=10000, test_size=10000):
    """Build train/test ``Dataset`` splits from ``train.csv`` and ``test.csv``.

    Both files are read without a header row using the column names
    ``polarity``, ``title`` and ``text``. Title and text are passed through
    ``clean_text``; rows where both end up empty are dropped; each split is then
    sampled with ``random_state=42``.

    Args:
        train_size: Upper bound on the number of sampled training rows.
        test_size: Upper bound on the number of sampled test rows.

    Returns:
        DatasetDict with ``'train'`` and ``'test'`` entries, each created from
        the ``combined_text`` and ``labels`` columns.
    """
    csv_columns = ['polarity', 'title', 'text']

    def strip_and_filter(frame):
        """Clean the two text columns in place and keep rows that have any text."""
        for column in ('title', 'text'):
            frame[column] = frame[column].apply(clean_text)
        title_present = frame['title'].str.len() > 0
        text_present = frame['text'].str.len() > 0
        return frame[title_present | text_present]

    def join_fields(row):
        """Join title and text with ``</s>``; fall back to whichever is non-empty."""
        title, body = row['title'], row['text']
        if title and body:
            return f"{title} </s> {body}"
        return title if title else body

    # Read both CSV files up front.
    train_df, test_df = (
        pd.read_csv(path, header=None, names=csv_columns)
        for path in ('train.csv', 'test.csv')
    )

    # Clean the text columns and discard rows with no text at all.
    train_df = strip_and_filter(train_df)
    test_df = strip_and_filter(test_df)

    # Draw the fixed-seed subsets (never more rows than are available).
    train_df = train_df.sample(n=min(train_size, len(train_df)), random_state=42)
    test_df = test_df.sample(n=min(test_size, len(test_df)), random_state=42)

    print(f"Training samples: {len(train_df)}")
    print(f"Testing samples: {len(test_df)}")

    # Report how the polarity values are distributed in each subset.
    print("\nClass distribution in training set:")
    print(train_df['polarity'].value_counts())
    print("\nClass distribution in test set:")
    print(test_df['polarity'].value_counts())

    for frame in (train_df, test_df):
        frame['combined_text'] = frame.apply(join_fields, axis=1)
        # Shift polarity 1,2 down to 0,1.
        frame['labels'] = frame['polarity'] - 1

    model_columns = ['combined_text', 'labels']
    splits = {
        'train': Dataset.from_pandas(train_df[model_columns]),
        'test': Dataset.from_pandas(test_df[model_columns]),
    }
    return DatasetDict(splits)

def preprocess_function(examples, tokenizer, max_length=256):
    """Tokenize a batch of ``combined_text`` values and carry the labels along.

    Args:
        examples: Mapping with ``'combined_text'`` and ``'labels'`` entries.
        tokenizer: Callable invoked with the list of texts plus ``truncation``,
            ``max_length`` and ``padding`` keyword arguments.
        max_length: Length every sequence is truncated/padded to.

    Returns:
        Whatever ``tokenizer`` returned, with a ``'labels'`` entry added.
    """
    # Coerce every entry to str before it reaches the tokenizer.
    batch_texts = list(map(str, examples['combined_text']))

    encoded = tokenizer(
        batch_texts,
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )

    # Attach the labels to the tokenizer output.
    encoded['labels'] = examples['labels']
    return encoded

def compute_metrics(eval_pred):
    """Turn an ``(outputs, labels)`` pair into accuracy, precision, recall and F1.

    Args:
        eval_pred: Two-item iterable; the first item is arg-maxed over axis 1 to
            get predicted classes, the second holds the reference labels.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``
        (the last three computed with ``average='binary'``).
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)

    accuracy = accuracy_score(references, predicted)

    # The fourth returned value (support) is not reported.
    prf = precision_recall_fscore_support(
        references,
        predicted,
        average='binary',
    )
    precision, recall, f1, _ = prf

    return dict(accuracy=accuracy, precision=precision, recall=recall, f1=f1)

def main():
    """Fine-tune ``xlm-roberta-base`` as a two-label classifier and save it.

    Seeds torch and NumPy, loads tokenizer and model, builds the splits with
    ``load_data``, tokenizes them, trains with ``Trainer`` (evaluation and
    checkpointing every epoch, early-stopping patience of 2), prints the
    evaluation metrics and writes model and tokenizer to ``./final_model``.
    """
    # Fixed seeds for torch and NumPy.
    np.random.seed(42)
    torch.manual_seed(42)

    print("Loading tokenizer and model...")
    checkpoint = 'xlm-roberta-base'
    tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint)
    model = XLMRobertaForSequenceClassification.from_pretrained(
        checkpoint,
        num_labels=2,
    )

    print("Loading and preparing datasets...")
    raw_splits = load_data(train_size=10000, test_size=10000)

    def tokenize_batch(batch):
        """Tokenize one batch with this run's tokenizer."""
        return preprocess_function(batch, tokenizer)

    # The original columns are dropped so only the tokenizer output remains.
    encoded_splits = raw_splits.map(
        tokenize_batch,
        batched=True,
        remove_columns=raw_splits['train'].column_names,
        desc="Tokenizing datasets",
    )

    hyperparameters = dict(
        output_dir="./results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_dir='./logs',
        logging_steps=50,
        report_to="tensorboard",
        warmup_steps=100,
        max_grad_norm=1.0,
    )
    training_args = TrainingArguments(**hyperparameters)

    early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=encoded_splits["train"],
        eval_dataset=encoded_splits["test"],
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    print("Starting training...")
    trainer.train()

    print("Evaluating model...")
    metrics = trainer.evaluate()
    print("\nEvaluation Results:")
    for metric_name in metrics:
        print(f"{metric_name}: {metrics[metric_name]:.4f}")

    # Persist the model together with its tokenizer.
    print("\nSaving model...")
    export_dir = "./final_model"
    trainer.save_model(export_dir)
    tokenizer.save_pretrained(export_dir)


# Run the pipeline only when executed as the main module.
if __name__ == "__main__":
    main()

# New Section

## DistilBERT (distilbert-base-uncased)


In [None]:
# Remove the listed nvidia-*-cu12 wheels, then install torch/torchvision/torchaudio
# from the cu121 wheel index. (The bare `!pip install` lines were dropped: with no
# requirement given they only print "You must give at least one requirement".)
!pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-cupti-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 nvidia-cufft-cu12 nvidia-curand-cu12 nvidia-cusolver-cu12 nvidia-cusparse-cu12 nvidia-nvjitlink-cu12
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

[31mERROR: You must give at least one requirement to install (see "pip help install")[0m[31m
[0mFound existing installation: nvidia-cublas-cu12 12.5.3.2
Uninstalling nvidia-cublas-cu12-12.5.3.2:
  Successfully uninstalled nvidia-cublas-cu12-12.5.3.2
Found existing installation: nvidia-cuda-cupti-cu12 12.5.82
Uninstalling nvidia-cuda-cupti-cu12-12.5.82:
  Successfully uninstalled nvidia-cuda-cupti-cu12-12.5.82
Found existing installation: nvidia-cuda-nvrtc-cu12 12.5.82
Uninstalling nvidia-cuda-nvrtc-cu12-12.5.82:
  Successfully uninstalled nvidia-cuda-nvrtc-cu12-12.5.82
Found existing installation: nvidia-cuda-runtime-cu12 12.5.82
Uninstalling nvidia-cuda-runtime-cu12-12.5.82:
  Successfully uninstalled nvidia-cuda-runtime-cu12-12.5.82
Found existing installation: nvidia-cudnn-cu12 9.3.0.75
Uninstalling nvidia-cudnn-cu12-9.3.0.75:
  Successfully uninstalled nvidia-cudnn-cu12-9.3.0.75
Found existing installation: nvidia-cufft-cu12 11.2.3.61
Uninstalling nvidia-cufft-cu12-11.2.3.61:
 

In [None]:
!pip install bitsandbytes

Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Downloading bitsandbytes-0.45.3-py3-none-manylinux_2_24_x86_64.whl (76.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.1/76.1 MB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.45.3


In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    DistilBertTokenizer,
    DistilBertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import evaluate

def clean_text(text):
    """Convert ``text`` to ``str``, trim surrounding whitespace and lower-case it."""
    as_string = str(text)
    trimmed = as_string.strip()
    return trimmed.lower()

def load_data(train_size=10000, test_size=10000):
    """Load a fixed-seed subset of ``train.csv`` / ``test.csv`` as a ``DatasetDict``.

    Both files are read without a header row using the column names
    ``polarity``, ``title`` and ``text``. Each frame is sampled first and only
    the sampled rows are cleaned and combined. Because no rows are dropped and
    the seed is fixed, this selects the same rows as cleaning the whole file
    and sampling afterwards, without processing rows that are then discarded.

    Args:
        train_size: Upper bound on the number of sampled training rows.
        test_size: Upper bound on the number of sampled test rows.

    Returns:
        DatasetDict with ``'train'`` and ``'test'`` splits built from the
        ``combined_text`` and ``labels`` columns.
    """
    column_names = ['polarity', 'title', 'text']

    def join_fields(title, body):
        """Join title and body with `` [SEP] ``; fall back to the non-empty one."""
        if title and body:
            return f"{title} [SEP] {body}"
        return title if title else body

    def prepare_split(frame, size):
        """Sample up to ``size`` rows and derive ``combined_text`` and ``labels``."""
        subset = frame.sample(n=min(size, len(frame)), random_state=42)
        # Missing values become "" before cleaning so they are not turned into "nan".
        subset = subset.assign(
            title=subset['title'].fillna("").apply(clean_text),
            text=subset['text'].fillna("").apply(clean_text),
        )
        subset['combined_text'] = [
            join_fields(title, body)
            for title, body in zip(subset['title'], subset['text'])
        ]
        # Convert polarity (1,2) -> (0,1) for binary classification.
        subset['labels'] = subset['polarity'] - 1
        return subset

    train_df = prepare_split(
        pd.read_csv('train.csv', header=None, names=column_names), train_size
    )
    test_df = prepare_split(
        pd.read_csv('test.csv', header=None, names=column_names), test_size
    )

    # Show a few prepared rows so the derived columns can be checked by eye.
    print("Train DataFrame Sample:")
    print(train_df.head())
    print("\nTest DataFrame Sample:")
    print(test_df.head())

    # Only the model inputs are handed to the Dataset objects.
    train_dataset = Dataset.from_pandas(train_df[['combined_text', 'labels']])
    test_dataset = Dataset.from_pandas(test_df[['combined_text', 'labels']])

    return DatasetDict({'train': train_dataset, 'test': test_dataset})


def preprocess_function(examples, tokenizer, max_length=128):
    """Tokenize the batch's ``combined_text`` entries and attach its labels.

    Args:
        examples: Mapping with ``'combined_text'`` and ``'labels'`` entries.
        tokenizer: Callable applied to the list of texts.
        max_length: Length used for both truncation and ``max_length`` padding.

    Returns:
        The tokenizer output with an added ``'labels'`` entry.
    """
    batch_texts = []
    for entry in examples['combined_text']:
        batch_texts.append(str(entry))

    tokenizer_options = {
        'truncation': True,
        'max_length': max_length,
        'padding': 'max_length',
    }
    encoded = tokenizer(batch_texts, **tokenizer_options)
    encoded['labels'] = examples['labels']
    return encoded

def compute_metrics(eval_pred):
    """Score arg-maxed model outputs against the reference labels.

    Args:
        eval_pred: ``(outputs, labels)`` pair; outputs are arg-maxed over axis 1.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``
        (precision/recall/F1 use ``average='binary'``).
    """
    outputs, references = eval_pred
    predicted = np.argmax(outputs, axis=1)

    results = {'accuracy': accuracy_score(references, predicted)}
    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predicted, average='binary'
    )
    results['precision'] = precision
    results['recall'] = recall
    results['f1'] = f1
    return results

def main():
    """Fine-tune ``distilbert-base-uncased`` as a two-label classifier and save it.

    Seeds torch and NumPy, loads tokenizer and model, builds the splits with
    ``load_data``, tokenizes them, trains with ``Trainer`` (fp16, evaluation and
    checkpointing every epoch, early-stopping patience of 2), prints the
    evaluation metrics and saves model and tokenizer.

    Checkpoints, logs and the final model go to ``distilbert_*`` directories.
    The previous generic ``./results``, ``./logs`` and ``./final_model`` paths
    are also used by the XLM-R run in this notebook, so running both
    overwrote one model with the other.
    """
    # Set random seeds
    torch.manual_seed(42)
    np.random.seed(42)

    # Load tokenizer and model
    model_name = 'distilbert-base-uncased'
    tokenizer = DistilBertTokenizer.from_pretrained(model_name)
    model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load dataset
    datasets = load_data(train_size=10000, test_size=10000)

    # Tokenize dataset; the original columns are dropped afterwards.
    tokenized_datasets = datasets.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=datasets['train'].column_names
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./distilbert_results",  # model-specific, see docstring
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_dir='./distilbert_logs',
        logging_steps=50,
        report_to="tensorboard",
        warmup_steps=100,
        max_grad_norm=1.0,
        fp16=True  # Enable mixed-precision training
    )

    # NOTE(review): the "test" split is used for per-epoch evaluation and, via
    # load_best_model_at_end, for picking the checkpoint, so the metrics printed
    # below are not from data unseen during model selection.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")

    # Save the model and its tokenizer side by side.
    trainer.save_model("./distilbert_final_model")
    tokenizer.save_pretrained("./distilbert_final_model")

if __name__ == "__main__":
    main()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train DataFrame Sample:
         polarity                                   title  \
2079998         1                          expensive junk   
1443106         1                          toast too dark   
3463669         2   excellent imagery...dumbed down story   
2914699         1  are we pretending everyone is married?   
1603231         1                     not worth your time   

                                                      text  \
2079998  this product consists of a piece of thin flexi...   
1443106  even on the lowest setting, the toast is too d...   
3463669  i enjoyed this disc. the video is stunning. i ...   
2914699  the authors pretend that parents neither die n...   
1603231  might as well just use a knife, this product h...   

                                             combined_text  labels  
2079998  expensive junk [SEP] this product consists of ...       0  
1443106  toast too dark [SEP] even on the lowest settin...       0  
3463669  excellent imagery...

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2365,0.213508,0.9302,0.925511,0.936555,0.931
2,0.1647,0.234279,0.931,0.950936,0.909706,0.929864
3,0.0742,0.282108,0.9321,0.933081,0.931782,0.932431



Evaluation Results:
eval_loss: 0.2135
eval_accuracy: 0.9302
eval_precision: 0.9255
eval_recall: 0.9366
eval_f1: 0.9310
eval_runtime: 11.1320
eval_samples_per_second: 898.3130
eval_steps_per_second: 56.1450
epoch: 3.0000


## BERT


In [None]:
!pip install transformers
!pip install datasets
!pip install evaluate
!pip install scikit-learn
!pip install torch
!pip install pandas numpy
!pip install tensorboard

Collecting datasets
  Downloading datasets-3.4.1-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.4.1-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    BertTokenizer,
    BertForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    IntervalStrategy
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import evaluate

def clean_text(text):
    """Return ``str(text)`` with surrounding whitespace removed, in lower case."""
    # Same strip-then-lower order, written with the unbound str methods.
    return str.lower(str.strip(str(text)))

def load_data(train_size=10000, test_size=10000):
    """Load a fixed-seed subset of ``train.csv`` / ``test.csv`` as a ``DatasetDict``.

    Both files are read without a header row using the column names
    ``polarity``, ``title`` and ``text``. Sampling happens before cleaning, so
    only the sampled rows are processed. Because no rows are removed and
    ``random_state`` is fixed, the selected rows are the same as when the whole
    file was cleaned first.

    Args:
        train_size: Upper bound on the number of sampled training rows.
        test_size: Upper bound on the number of sampled test rows.

    Returns:
        DatasetDict with ``'train'`` and ``'test'`` splits built from the
        ``combined_text`` and ``labels`` columns.
    """
    def build_split(csv_path, size):
        """Read one CSV, sample it, and add ``combined_text`` and ``labels``."""
        frame = pd.read_csv(csv_path, header=None, names=['polarity', 'title', 'text'])
        frame = frame.sample(n=min(size, len(frame)), random_state=42)

        # fillna("") first so missing values are not stringified by clean_text.
        titles = frame['title'].fillna("").apply(clean_text)
        bodies = frame['text'].fillna("").apply(clean_text)

        # Title and body are joined with " [SEP] " only when both are non-empty;
        # otherwise the non-empty one (or "") is used on its own.
        combined = [
            f"{title} [SEP] {body}" if title and body
            else title if title
            else body
            for title, body in zip(titles, bodies)
        ]

        return frame.assign(
            title=titles,
            text=bodies,
            combined_text=combined,
            # Convert polarity (1,2) -> (0,1) for binary classification.
            labels=frame['polarity'] - 1,
        )

    train_df = build_split('train.csv', train_size)
    test_df = build_split('test.csv', test_size)

    # Debugging aid: show a few prepared rows from each split.
    print("Train DataFrame Sample:")
    print(train_df.head())
    print("\nTest DataFrame Sample:")
    print(test_df.head())

    # Only the model inputs are handed to the Dataset objects.
    train_dataset = Dataset.from_pandas(train_df[['combined_text', 'labels']])
    test_dataset = Dataset.from_pandas(test_df[['combined_text', 'labels']])

    return DatasetDict({'train': train_dataset, 'test': test_dataset})


def preprocess_function(examples, tokenizer, max_length=128):
    """Run ``tokenizer`` over the batch's texts and add the batch's labels.

    Args:
        examples: Mapping with ``'combined_text'`` and ``'labels'`` entries.
        tokenizer: Callable applied to the list of stringified texts.
        max_length: Truncation and padding length.

    Returns:
        The tokenizer output, extended with a ``'labels'`` entry.
    """
    raw_entries = examples['combined_text']
    encoded = tokenizer(
        [str(entry) for entry in raw_entries],
        truncation=True,
        max_length=max_length,
        padding='max_length',
    )
    encoded['labels'] = examples['labels']
    return encoded

def compute_metrics(eval_pred):
    """Compute accuracy plus binary precision, recall and F1.

    Args:
        eval_pred: ``(outputs, labels)`` pair; the predicted class is the
            arg-max of ``outputs`` over axis 1.

    Returns:
        dict keyed by ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    outputs, references = eval_pred
    predicted = np.argmax(outputs, axis=1)

    accuracy = accuracy_score(references, predicted)
    precision, recall, f1, _ = precision_recall_fscore_support(
        references, predicted, average='binary'
    )

    metric_names = ('accuracy', 'precision', 'recall', 'f1')
    metric_values = (accuracy, precision, recall, f1)
    return dict(zip(metric_names, metric_values))

def main():
    """Fine-tune ``bert-base-uncased`` as a two-label classifier, save it, and demo it.

    Seeds torch and NumPy, loads tokenizer and model, builds the splits with
    ``load_data``, tokenizes them, trains with ``Trainer`` (fp16, evaluation and
    checkpointing every epoch, early-stopping patience of 2), prints the
    evaluation metrics, saves model and tokenizer to ``./bert_final_model`` and
    finally classifies one example sentence.
    """
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Load tokenizer and model
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load dataset
    datasets = load_data(train_size=10000, test_size=10000)

    # Tokenize dataset; the original columns are dropped afterwards.
    tokenized_datasets = datasets.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=datasets['train'].column_names
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./bert_results",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=4,
        weight_decay=0.01,
        # `eval_strategy` is the current keyword (the earlier cells already use
        # it); `evaluation_strategy` is the deprecated spelling.
        eval_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_dir='./bert_logs',
        logging_steps=50,
        report_to="tensorboard",
        warmup_steps=100,
        max_grad_norm=1.0,
        fp16=True  # Enable mixed-precision training
    )

    # NOTE(review): the "test" split is used for per-epoch evaluation and, via
    # load_best_model_at_end, for picking the checkpoint, so the metrics printed
    # below are not from data unseen during model selection.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")

    # Save the model
    trainer.save_model("./bert_final_model")
    tokenizer.save_pretrained("./bert_final_model")

    # Example of how to use the model for prediction
    print("\nPrediction Example:")
    def predict_sentiment(text, model, tokenizer):
        """Return "Positive" if the model predicts class 1 for ``text``, else "Negative"."""
        # After training the model may live on the GPU while the tokenizer
        # returns CPU tensors; feeding those directly raised
        # "Expected all tensors to be on the same device".
        device = model.device

        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()
            return "Positive" if predicted_class == 1 else "Negative"

    example_text = "This product is amazing! I love it."
    prediction = predict_sentiment(example_text, model, tokenizer)
    print(f"Text: '{example_text}'")
    print(f"Predicted sentiment: {prediction}")

if __name__ == "__main__":
    main()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train DataFrame Sample:
         polarity                                   title  \
2079998         1                          expensive junk   
1443106         1                          toast too dark   
3463669         2   excellent imagery...dumbed down story   
2914699         1  are we pretending everyone is married?   
1603231         1                     not worth your time   

                                                      text  \
2079998  this product consists of a piece of thin flexi...   
1443106  even on the lowest setting, the toast is too d...   
3463669  i enjoyed this disc. the video is stunning. i ...   
2914699  the authors pretend that parents neither die n...   
1603231  might as well just use a knife, this product h...   

                                             combined_text  labels  
2079998  expensive junk [SEP] this product consists of ...       0  
1443106  toast too dark [SEP] even on the lowest settin...       0  
3463669  excellent imagery...

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2122,0.234931,0.9344,0.947767,0.920247,0.933804
2,0.145,0.199047,0.9381,0.951095,0.924423,0.937569
3,0.0757,0.268441,0.9395,0.933542,0.947096,0.940271
4,0.025,0.301691,0.9387,0.933608,0.945306,0.939421



Evaluation Results:
eval_loss: 0.1990
eval_accuracy: 0.9381
eval_precision: 0.9511
eval_recall: 0.9244
eval_f1: 0.9376
eval_runtime: 19.4449
eval_samples_per_second: 514.2750
eval_steps_per_second: 32.1420
epoch: 4.0000

Prediction Example:


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

## RoBERTa


In [None]:
!pip install datasets==2.18.0
!pip install evaluate==0.4.1
!pip install scikit-learn==1.3.2
!pip install torch==2.2.1
!pip install pandas==2.1.4
!pip install numpy==1.26.4
!pip install tensorboard==2.15.2

Collecting torch==2.2.1
  Using cached torch-2.2.1-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.2.1)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.2.1)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.2.1)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.2.1)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-

In [None]:
import pandas as pd
import numpy as np
from datasets import Dataset, DatasetDict
from transformers import (
    RobertaTokenizer,
    RobertaForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    IntervalStrategy
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import torch
import evaluate

def clean_text(text):
    """Stringify ``text``, strip surrounding whitespace, and lower-case the result."""
    # NOTE(review): this cell fine-tunes FacebookAI/roberta-base, which looks like a
    # case-sensitive checkpoint; confirm that discarding case here is intended.
    stripped = str(text).strip()
    return stripped.lower()

def load_data(train_size=10000, test_size=10000):
    """Load a fixed-seed subset of the Drive-hosted CSVs as a ``DatasetDict``.

    ``train.csv`` and ``test.csv`` under ``/content/drive/MyDrive/datasets/``
    are read without a header row using the column names ``polarity``,
    ``title`` and ``text``. Each frame is sampled before it is cleaned, so only
    the sampled rows are processed. Because no rows are removed and
    ``random_state`` is fixed, the selected rows are the same as when the whole
    file was cleaned first.

    Args:
        train_size: Upper bound on the number of sampled training rows.
        test_size: Upper bound on the number of sampled test rows.

    Returns:
        DatasetDict with ``'train'`` and ``'test'`` splits built from the
        ``combined_text`` and ``labels`` columns.
    """
    def merge_fields(title, body):
        """Join title and body with a single space; fall back to the non-empty one."""
        # Note: RoBERTa doesn't use [SEP] token like BERT
        if title and body:
            return f"{title} {body}"
        return title if title else body

    def prepare_split(frame, size):
        """Sample up to ``size`` rows and derive ``combined_text`` and ``labels``."""
        subset = frame.sample(n=min(size, len(frame)), random_state=42)
        # fillna("") first so missing values are not stringified by clean_text.
        subset = subset.assign(
            title=subset['title'].fillna("").apply(clean_text),
            text=subset['text'].fillna("").apply(clean_text),
        )
        subset['combined_text'] = [
            merge_fields(title, body)
            for title, body in zip(subset['title'], subset['text'])
        ]
        # Convert polarity (1,2) -> (0,1) for binary classification.
        subset['labels'] = subset['polarity'] - 1
        return subset

    # Load datasets
    train_df = pd.read_csv('/content/drive/MyDrive/datasets/train.csv', header=None, names=['polarity', 'title', 'text'])
    test_df = pd.read_csv('/content/drive/MyDrive/datasets/test.csv', header=None, names=['polarity', 'title', 'text'])

    train_df = prepare_split(train_df, train_size)
    test_df = prepare_split(test_df, test_size)

    # Debugging aid: show a few prepared rows from each split.
    print("Train DataFrame Sample:")
    print(train_df.head())
    print("\nTest DataFrame Sample:")
    print(test_df.head())

    # Only the model inputs are handed to the Dataset objects.
    train_dataset = Dataset.from_pandas(train_df[['combined_text', 'labels']])
    test_dataset = Dataset.from_pandas(test_df[['combined_text', 'labels']])

    return DatasetDict({'train': train_dataset, 'test': test_dataset})


def preprocess_function(examples, tokenizer, max_length=128):
    """Encode the batch's ``combined_text`` values and pass the labels through.

    Args:
        examples: Mapping with ``'combined_text'`` and ``'labels'`` entries.
        tokenizer: Callable applied to the list of stringified texts.
        max_length: Truncation and padding length.

    Returns:
        The tokenizer output with a ``'labels'`` entry set to ``examples['labels']``.
    """
    labels = examples['labels']
    string_inputs = list(map(str, examples['combined_text']))

    encoded = tokenizer(
        string_inputs,
        padding='max_length',
        max_length=max_length,
        truncation=True,
    )
    encoded['labels'] = labels
    return encoded

def compute_metrics(eval_pred):
    """Report accuracy and binary precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: ``(outputs, labels)`` pair; outputs are arg-maxed over axis 1.

    Returns:
        dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    outputs, references = eval_pred
    predicted = np.argmax(outputs, axis=1)

    accuracy = accuracy_score(references, predicted)
    # Support (the fourth value) is not reported.
    precision, recall, f1, _ = precision_recall_fscore_support(
        references,
        predicted,
        average='binary',
    )

    report = {}
    report['accuracy'] = accuracy
    report['precision'] = precision
    report['recall'] = recall
    report['f1'] = f1
    return report

def main():
    """Fine-tune ``FacebookAI/roberta-base`` as a two-label classifier, save it, and demo it.

    Seeds torch and NumPy, loads tokenizer and model, builds the splits with
    ``load_data``, tokenizes them, trains with ``Trainer`` (fp16, evaluation and
    checkpointing every epoch, early-stopping patience of 2), prints the
    evaluation metrics, saves model and tokenizer to ``./roberta_final_model``
    and finally classifies one example sentence.
    """
    # Set random seeds for reproducibility
    torch.manual_seed(42)
    np.random.seed(42)

    # Load tokenizer and model
    model_name = 'FacebookAI/roberta-base'  # Using RoBERTa base model
    tokenizer = RobertaTokenizer.from_pretrained(model_name)
    model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

    # Load dataset
    datasets = load_data(train_size=10000, test_size=10000)

    # Tokenize dataset; the original columns are dropped afterwards.
    tokenized_datasets = datasets.map(
        lambda x: preprocess_function(x, tokenizer),
        batched=True,
        remove_columns=datasets['train'].column_names
    )

    # Training arguments
    training_args = TrainingArguments(
        output_dir="./roberta_results",
        learning_rate=1e-5,  # Slightly lower learning rate for RoBERTa
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=4,
        weight_decay=0.01,
        # `eval_strategy` is the current keyword (the earlier cells already use
        # it); `evaluation_strategy` is the deprecated spelling.
        eval_strategy=IntervalStrategy.EPOCH,
        save_strategy=IntervalStrategy.EPOCH,
        load_best_model_at_end=True,
        push_to_hub=False,
        logging_dir='./roberta_logs',
        logging_steps=50,
        report_to="tensorboard",
        warmup_steps=100,
        max_grad_norm=1.0,
        fp16=True  # Enable mixed-precision training
    )

    # NOTE(review): the "test" split is used for per-epoch evaluation and, via
    # load_best_model_at_end, for picking the checkpoint, so the metrics printed
    # below are not from data unseen during model selection.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["test"],
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    print("\nEvaluation Results:")
    for key, value in eval_results.items():
        print(f"{key}: {value:.4f}")

    # Save the model
    trainer.save_model("./roberta_final_model")
    tokenizer.save_pretrained("./roberta_final_model")

    # Example of how to use the model for prediction
    print("\nPrediction Example:")
    def predict_sentiment(text, model, tokenizer):
        """Return "Positive" if the model predicts class 1 for ``text``, else "Negative"."""
        # Get the device that the model is on
        device = model.device

        # Tokenize the input text
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

        # Move input tensors to the same device as the model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            predicted_class = torch.argmax(logits, dim=1).item()
            return "Positive" if predicted_class == 1 else "Negative"

    example_text = "This product is amazing! I love it."
    prediction = predict_sentiment(example_text, model, tokenizer)
    print(f"Text: '{example_text}'")
    print(f"Predicted sentiment: {prediction}")

if __name__ == "__main__":
    main()

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train DataFrame Sample:
         polarity                                   title  \
2079998         1                          expensive junk   
1443106         1                          toast too dark   
3463669         2   excellent imagery...dumbed down story   
2914699         1  are we pretending everyone is married?   
1603231         1                     not worth your time   

                                                      text  \
2079998  this product consists of a piece of thin flexi...   
1443106  even on the lowest setting, the toast is too d...   
3463669  i enjoyed this disc. the video is stunning. i ...   
2914699  the authors pretend that parents neither die n...   
1603231  might as well just use a knife, this product h...   

                                             combined_text  labels  
2079998  expensive junk this product consists of a piec...       0  
1443106  toast too dark even on the lowest setting, the...       0  
3463669  excellent imagery...

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.2246,0.228295,0.9476,0.938986,0.958035,0.948415
2,0.1374,0.212405,0.9522,0.962586,0.941527,0.95194
3,0.0949,0.23685,0.9519,0.949042,0.955648,0.952334
4,0.0583,0.242556,0.953,0.950217,0.956643,0.953419



Evaluation Results:
eval_loss: 0.2124
eval_accuracy: 0.9522
eval_precision: 0.9626
eval_recall: 0.9415
eval_f1: 0.9519
eval_runtime: 19.4258
eval_samples_per_second: 514.7800
eval_steps_per_second: 32.1740
epoch: 4.0000

Prediction Example:
Text: 'This product is amazing! I love it.'
Predicted sentiment: Positive
