In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
import torch

# Use GPU 0 when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"Using device: {torch.cuda.get_device_name(device)}")
else:
    print("CUDA not available, using CPU")

# Exploratory Data Analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# NOTE(review): the Kaggle CSVs appear to ship without a header row (the cells
# below overwrite the column names). Without `header=None` pandas would consume
# the first tweet of each file as the header and that row would be lost.
train_df = pd.read_csv(
    "/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv",
    header=None,
    names=['id', 'game', 'sentiment', 'content'],
)

test_df = pd.read_csv(
    "/kaggle/input/twitter-entity-sentiment-analysis/twitter_validation.csv",
    header=None,
    names=['id', 'game', 'sentiment', 'content'],
)

## Cleaning the Training Set

In [None]:
# Preview the first rows of the training frame as loaded from the CSV.
train_df.head()

In [None]:
# Give the four columns meaningful names.
train_df = train_df.set_axis(['id', 'game', 'sentiment', 'content'], axis=1)

# Re-select the four named columns; every column is kept, this only yields a fresh frame.
train_df = train_df.loc[:, ['id', 'game', 'sentiment', 'content']]

In [None]:
# Only the label and the tweet text are needed downstream; 'id' and 'game' are discarded.
train_df = train_df[[col for col in train_df.columns if col not in ('id', 'game')]]

train_df.head()

In [None]:
# Number of training rows before NaN rows and duplicates are removed.
print(len(train_df))

In [None]:
# Per-column count of missing values and total count of fully duplicated rows.
nan_values, duplicate_values = train_df.isna().sum(), train_df.duplicated().sum()

nan_values

In [None]:
# Remove rows that have a missing value in any column.
train_df = train_df.dropna()

In [None]:
# Display the duplicate-row count that was computed before the NaN rows were dropped.
duplicate_values

In [None]:
# Remove rows that are exact duplicates across all remaining columns (first occurrence is kept).
train_df = train_df.drop_duplicates()

In [None]:
# Show the distinct sentiment labels present in the training data.
unique_sentiments = train_df['sentiment'].unique()
unique_sentiments

In [None]:
# Fold the 'Irrelevant' label into 'Neutral'.
train_df = train_df.assign(
    sentiment=train_df['sentiment'].replace({'Irrelevant': 'Neutral'})
)

# Count rows per sentiment label and display the result.
sentiment_distribution = train_df['sentiment'].value_counts()
sentiment_distribution

In [None]:
# Recompute the quality metrics up front, then report them in order.
nan_check = train_df.isna().sum()
duplicate_check = train_df.duplicated().sum()

# Missing values per column
print("NaN values after cleaning:")
print(nan_check)

# Fully duplicated rows (merging labels above can create new duplicates)
print("\nDuplicate rows after cleaning:")
print(duplicate_check)

# Rows per sentiment label
print("\nSentiment distribution after cleaning:")
print(train_df['sentiment'].value_counts())

In [None]:
# Remove duplicate rows again: rows that differed only by 'Irrelevant' vs 'Neutral'
# became identical once the two labels were merged.
train_df = train_df.drop_duplicates()

In [None]:
# Re-count duplicated rows after the second de-duplication pass.
duplicate_check = train_df.duplicated().sum()
print("\nDuplicate rows after cleaning:")
print(duplicate_check)

In [None]:
# Preview the first ten rows of the cleaned training frame.
train_df.head(10)

## Cleaning the Test Set

In [None]:
# Preview the first rows of the test frame as loaded from the CSV.
test_df.head()

In [None]:
# Give the four columns meaningful names.
test_df = test_df.set_axis(['id', 'game', 'sentiment', 'content'], axis=1)

# Re-select the four named columns; every column is kept, this only yields a fresh frame.
test_df = test_df.loc[:, ['id', 'game', 'sentiment', 'content']]

In [None]:
# Only the label and the tweet text are needed downstream; 'id' and 'game' are discarded.
test_df = test_df[[col for col in test_df.columns if col not in ('id', 'game')]]

test_df.head()

In [None]:
# Per-column count of missing values and total count of fully duplicated rows in the test frame.
nan_values, duplicate_values = test_df.isna().sum(), test_df.duplicated().sum()

nan_values, duplicate_values

In [None]:
# Remove test rows that have a missing value in any column.
test_df = test_df.dropna()

In [None]:
# Remove test rows that are exact duplicates across all remaining columns (first occurrence is kept).
test_df = test_df.drop_duplicates()

In [None]:
# Show the distinct sentiment labels present in the test data.
unique_sentiments = test_df['sentiment'].unique()
unique_sentiments

In [None]:
# Fold the 'Irrelevant' label into 'Neutral', mirroring the training-set cleaning.
test_df = test_df.assign(
    sentiment=test_df['sentiment'].replace({'Irrelevant': 'Neutral'})
)

# Count rows per sentiment label and display the result.
sentiment_distribution = test_df['sentiment'].value_counts()
sentiment_distribution

In [None]:
# Recompute the quality metrics up front, then report them in order.
nan_check = test_df.isna().sum()
duplicate_check = test_df.duplicated().sum()

# Missing values per column
print("NaN values after cleaning:")
print(nan_check)

# Fully duplicated rows
print("\nDuplicate rows after cleaning:")
print(duplicate_check)

# Rows per sentiment label
print("\nSentiment distribution after cleaning:")
print(test_df['sentiment'].value_counts())

In [None]:
# Preview the first ten rows of the cleaned test frame.
test_df.head(10)

# Data Preprocessing

In [None]:
# Row counts of the cleaned training and test frames, as a (train, test) tuple.
len(train_df), len(test_df)

In [None]:
# Train / Val Set
from sklearn.model_selection import train_test_split

# Features are the tweet text, targets are the sentiment labels.
X, y = train_df['content'], train_df['sentiment']

# Hold out 10% for validation; stratify so both splits keep the label proportions,
# and fix the seed so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

print(f"Training Set: {len(X_train)} examples")
print(f"Validation Set: {len(X_val)} examples")

In [None]:
# Test set: text and labels come straight from the cleaned test frame.
X_test, y_test = test_df['content'], test_df['sentiment']

In [None]:
# Report how many examples the test split contains.
print(f"Test Set: {len(X_test)} examples")

In [None]:
# Hand-written example reviews used to spot-check the model's predictions
# before and after fine-tuning.
reference_reviews = ["@GameDevStudio The new patch completely ruined the gameplay. Frame drops everywhere - unplayable!",
                    "Not sure if I should spend my money on this game, looks fun but a bit skeptical. Any thoughts?",
                    "Big thanks to @ConsoleBrand for their amazing customer support! My console is back to working perfectly.",
                    "Had to cancel my order because the delivery was delayed for the third time. Frustrating.",
                    "Absolutely loved the new RPG! Stunning graphics, deep story, and engaging gameplay."]

# Testing the Model Before Fine-tuning

In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
from sklearn.metrics import classification_report
from torch.utils.data import DataLoader, TensorDataset
import pandas as pd

# Load the pretrained DistilBERT tokenizer and a 3-class sequence-classification
# model built on the same checkpoint, then move the model to the selected device.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=3,
)
model.to(device)

max_length = 64


def tokenize_data(texts, tokenizer, max_length=64):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialised into a list before
            being handed to the tokenizer.
        tokenizer: Callable tokenizer that accepts the list of texts plus the
            padding/truncation keyword options used below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch (requested as "pt" tensors).
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = list(texts)
    return tokenizer(batch, **encode_options)

In [None]:
# Tokenize the validation and test texts with the shared max_length.
X_val_tokenized, X_test_tokenized = (
    tokenize_data(split_texts, tokenizer, max_length=max_length)
    for split_texts in (X_val, X_test)
)

In [None]:
# Sentiment name -> integer class id
label_mapping = dict(Positive=0, Negative=1, Neutral=2)

# Map the string labels to class ids (NumPy arrays) ...
y_val_encoded = y_val.map(label_mapping).to_numpy()
y_test_encoded = y_test.map(label_mapping).to_numpy()

# ... and wrap them as int64 tensors for the DataLoaders.
y_val_tensor = torch.tensor(y_val_encoded, dtype=torch.long)
y_test_tensor = torch.tensor(y_test_encoded, dtype=torch.long)

In [None]:
# Sanity check: shape and first five encoded labels of the validation split
print(f"Validation Labels Shape: {y_val_tensor.shape}")
print(f"Validation Labels (First 5): {y_val_tensor[:5]}")

# Same sanity check for the test split
print(f"Test Labels Shape: {y_test_tensor.shape}")
print(f"Test Labels (First 5): {y_test_tensor[:5]}")

In [None]:
# Batches of 8 are used for the baseline evaluation; the loaders are not shuffled.
batch_size = 8

# Validation: (input_ids, attention_mask, label) triples
val_dataset = TensorDataset(
    *(X_val_tokenized[field] for field in ('input_ids', 'attention_mask')),
    y_val_tensor,
)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Test: same layout as the validation dataset
test_dataset = TensorDataset(
    *(X_test_tokenized[field] for field in ('input_ids', 'attention_mask')),
    y_test_tensor,
)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Verify DataLoader: pull a single batch and print the shape of each tensor.
# input_ids, attention_mask and labels stay bound after the loop; the next
# cell prints their first elements.
for batch in val_loader:
    input_ids, attention_mask, labels = batch
    print("Validation Input IDs:", input_ids.shape)
    print("Validation Attention Mask:", attention_mask.shape)
    print("Validation Labels:", labels.shape)
    break

In [None]:
# Inspect the first example of the batch unpacked by the DataLoader check above
# (relies on input_ids / attention_mask / labels left over from that loop).
print("First input IDs:", input_ids[0])
print("First attention mask:", attention_mask[0])
print("First label:", labels[0])

In [None]:
# Evaluation helper shared by the validation and test sets
def evaluate_model(loader, target_names):
    """Run the global ``model`` over ``loader`` and summarise its predictions.

    Args:
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.
        target_names: Display names for the classes, in class-id order.

    Returns:
        pandas.DataFrame: the classification report, one row per class or
        aggregate and one column per metric.
    """
    model.eval()
    predicted, expected = [], []

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # zero_division=0 reports 0 for classes that were never predicted.
    report = classification_report(
        expected,
        predicted,
        target_names=target_names,
        output_dict=True,
        zero_division=0,
    )
    return pd.DataFrame(report).T

In [None]:
# Class names in class-id order (0, 1, 2)
sentiment_names = ['Positive', 'Negative', 'Neutral']

# Baseline performance on the validation set
val_report = evaluate_model(val_loader, target_names=sentiment_names)
print("Validation Performance:")
print(val_report.round(2))

# Baseline performance on the test set
test_report = evaluate_model(test_loader, target_names=sentiment_names)
print("Test Performance:")
print(test_report.round(2))

In [None]:
# Class id -> sentiment name. Stored under its own name so the string -> id
# `label_mapping` used by the encoding cells is not overwritten (re-running an
# encoding cell after this one would otherwise map every label to NaN).
id_to_label = {0: 'Positive', 1: 'Negative', 2: 'Neutral'}


def classify_reviews(reviews):
    """Predict a sentiment name for each review with the global ``model``.

    Args:
        reviews: A string or list of strings accepted by the global ``tokenizer``.

    Returns:
        list[str]: one of 'Positive', 'Negative' or 'Neutral' per review.
    """
    # Make sure dropout is disabled even if the model was last left in train mode.
    model.eval()

    inputs = tokenizer(
        reviews,
        padding='max_length',
        truncation=True,
        max_length=64,
        return_tensors="pt"
    ).to(device)

    with torch.no_grad():
        logits = model(**inputs).logits
        predictions = torch.argmax(logits, dim=-1).cpu().numpy()

    # int() turns the NumPy integer into a plain dict key.
    return [id_to_label[int(pred)] for pred in predictions]

In [None]:
# Classify the hand-written reference reviews with the current model.
predicted_sentiments = classify_reviews(reference_reviews)

# Display each review next to its predicted sentiment
for review, sentiment in zip(reference_reviews, predicted_sentiments):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}\n")

# Fine-tuning the DistilBERT Model

In [None]:
import gc
import torch

# Drop the notebook's references to the baseline tokenizer and model
del tokenizer
del model

# Collect garbage first so tensors kept alive only by reference cycles are
# actually freed ...
gc.collect()

# ... and only then hand the now-unused cached GPU memory back to the driver.
torch.cuda.empty_cache()

In [None]:
import torch
# AdamW comes from PyTorch: the copy that used to live in `transformers` was
# deprecated and later removed, so importing it from there fails on current
# releases. NOTE(review): torch's default weight_decay (0.01) differs from the
# old transformers default (0.0); pass weight_decay=0.0 to reproduce old runs.
from torch.optim import AdamW
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, get_scheduler
from sklearn.metrics import classification_report
from sklearn.utils.class_weight import compute_class_weight
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
import pandas as pd

# Start again from the pretrained checkpoint: fresh tokenizer and a 3-class
# classification model, moved to the selected device for fine-tuning.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)
model.to(device)

# Tokenization helper
def tokenize_data(texts, tokenizer, max_length=64):
    """Tokenize a collection of texts into fixed-length PyTorch tensors.

    Args:
        texts: Iterable of strings; it is materialised into a list before
            being handed to the tokenizer.
        tokenizer: Callable tokenizer that accepts the list of texts plus the
            padding/truncation keyword options used below.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        Whatever ``tokenizer`` returns for the batch (requested as "pt" tensors).
    """
    encode_options = {
        "padding": 'max_length',
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    batch = list(texts)
    return tokenizer(batch, **encode_options)

# Tokenize all three splits with the helper's default max_length.
X_train_tokenized, X_val_tokenized, X_test_tokenized = (
    tokenize_data(split_texts, tokenizer)
    for split_texts in (X_train, X_val, X_test)
)

In [None]:
# Sentiment name -> integer class id
label_mapping = dict(Positive=0, Negative=1, Neutral=2)

# Encode each split's string labels as an int64 tensor of class ids.
y_train_encoded, y_val_encoded, y_test_encoded = (
    torch.tensor(split_labels.map(label_mapping).to_numpy(), dtype=torch.long)
    for split_labels in (y_train, y_val, y_test)
)

In [None]:
# Each dataset yields (input_ids, attention_mask, label) triples.
train_dataset = TensorDataset(
    *(X_train_tokenized[field] for field in ('input_ids', 'attention_mask')),
    y_train_encoded,
)
val_dataset = TensorDataset(
    *(X_val_tokenized[field] for field in ('input_ids', 'attention_mask')),
    y_val_encoded,
)
test_dataset = TensorDataset(
    *(X_test_tokenized[field] for field in ('input_ids', 'attention_mask')),
    y_test_encoded,
)

# Only the training loader shuffles; evaluation loaders keep the dataset order.
batch_size = 16
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

In [None]:
# Class weights for imbalanced datasets: 'balanced' weights each class inversely
# to its frequency in the training labels.
# `classes` is passed as a NumPy array because scikit-learn's parameter
# validation rejects a plain list for this argument.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.array([0, 1, 2]),
    y=y_train.map(label_mapping).to_numpy()
)
# Move the weights to the model's device so CrossEntropyLoss can use them there.
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

In [None]:
# Optimizer over all model parameters
optimizer = AdamW(model.parameters(), lr=5e-5)

# Class-weighted cross-entropy loss
loss_fn = CrossEntropyLoss(weight=class_weights)

# One scheduler step per batch over 3 epochs; the first 10% of steps are warm-up.
steps_per_epoch = len(train_loader)
num_training_steps = steps_per_epoch * 3  # 3 epochs
warm_up_steps = int(0.1 * num_training_steps)

# Linear schedule with warm-up
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=warm_up_steps,
    num_training_steps=num_training_steps,
)

In [None]:
# Training loop: fine-tunes `model` for `epochs` passes over `train_loader` and
# records the mean batch loss and the running accuracy of each epoch.
train_loss_history = []
train_accuracy_history = []
epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    model.train()
    loop = tqdm(train_loader, leave=True)

    # Per-epoch accumulators
    epoch_loss = 0
    correct = 0
    total = 0

    for batch in loop:
        input_ids, attention_mask, labels = [x.to(device) for x in batch]

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        loss = loss_fn(logits, labels)

        # Backward pass: update the weights, advance the learning-rate schedule
        # by one step, then clear the gradients for the next batch.
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        # Accuracy is measured on the same forward pass that produced the loss.
        epoch_loss += loss.item()
        predictions = torch.argmax(logits, dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)

        # Show the latest batch loss and the running epoch accuracy on the progress bar.
        loop.set_postfix(loss=loss.item(), accuracy=correct / total)

    # Mean loss per batch and overall accuracy for this epoch
    train_loss_history.append(epoch_loss / len(train_loader))
    train_accuracy_history.append(correct / total)

In [None]:
# Side-by-side panels: training loss (left) and training accuracy (right)
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Loss per epoch
loss_ax.plot(train_loss_history, label='Training Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')
loss_ax.set_title('Training Loss')
loss_ax.legend()

# Accuracy per epoch
accuracy_ax.plot(train_accuracy_history, label='Training Accuracy')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.set_title('Training Accuracy')
accuracy_ax.legend()

fig.tight_layout()
plt.show()

In [None]:
# Evaluation helper for the fine-tuned model
def evaluate_model(model, loader):
    """Evaluate ``model`` on ``loader`` and return a text classification report.

    Args:
        model: Sequence-classification model whose output exposes ``.logits``.
        loader: Iterable of ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        str: the classification report for the Positive/Negative/Neutral classes.
    """
    model.eval()
    predicted = []
    expected = []

    # No gradients are needed for inference.
    with torch.no_grad():
        for batch in loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # The predicted class is the index of the largest logit.
            predicted.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            expected.extend(labels.cpu().numpy())

    class_names = ['Positive', 'Negative', 'Neutral']
    return classification_report(
        expected,
        predicted,
        target_names=class_names,
        zero_division=0,
    )

# Evaluate the fine-tuned model on the validation split, then on the test split,
# printing the text classification report for each.
print("Validation Results:")
print(evaluate_model(model, val_loader))

print("\nTest Results:")
print(evaluate_model(model, test_loader))

In [None]:
# Spot-check the fine-tuned model on hand-written reviews
reference_reviews = ["@GameDevStudio The new patch completely ruined the gameplay. Frame drops everywhere - unplayable!",
                    "Not sure if I should spend my money on this game, looks fun but a bit skeptical. Any thoughts?",
                    "Big thanks to @ConsoleBrand for their amazing customer support! My console is back to working perfectly.",
                    "Had to cancel my order because the delivery was delayed for the third time. Frustrating.",
                    "Absolutely loved the new RPG! Stunning graphics, deep story, and engaging gameplay."]

# Tokenize and move every input tensor to the model's device.
inputs = tokenize_data(reference_reviews, tokenizer)
inputs = {key: val.to(device) for key, val in inputs.items()}

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1).cpu().numpy()

# Map predictions to sentiments through an explicit id -> name lookup rather
# than indexing the dict's keys by position, which is only correct while the
# insertion order of `label_mapping` happens to match the class ids.
id_to_label = {class_id: name for name, class_id in label_mapping.items()}
predicted_sentiments = [id_to_label[int(pred)] for pred in predictions]
for review, sentiment in zip(reference_reviews, predicted_sentiments):
    print(f"Review: {review}")
    print(f"Predicted Sentiment: {sentiment}")

# Save Model

In [None]:
from transformers import DistilBertTokenizer

# Directory (relative to the working directory) that receives the saved artifacts
save_directory = 'distilbert_finetuned_3_epochs'

# Persist, in order: the model, the tokenizer, and the model configuration
# (the configuration is saved explicitly even though it usually accompanies the model).
for artifact in (model, tokenizer, model.config):
    artifact.save_pretrained(save_directory)

print(f"Model, tokenizer, and configuration saved to {save_directory}")

In [None]:
from huggingface_hub import login

# Log in to Hugging Face
# This will prompt you for your Hugging Face token, which you can find at: https://huggingface.co/settings/tokens
# The token is entered interactively, so it is not hard-coded in the notebook source.
login()

In [None]:
from huggingface_hub import create_repo, upload_folder

# Target repository on the Hugging Face Hub
username = "Savoxism"
repo_name = "distilbert_sentiment_analysis_final"
repo_id = f"{username}/{repo_name}"

# Create the repository (no error if it already exists)
create_repo(repo_id, exist_ok=True)

# Upload from the directory the model was saved to, instead of a hard-coded
# absolute Kaggle path that only matches when the working directory is
# /kaggle/working.
model_path = save_directory
upload_folder(
    repo_id=repo_id,
    folder_path=model_path,
    commit_message="Upload final fine-tuned DistilBERT for sentiment analysis",
)
print(f"Model uploaded to: https://huggingface.co/{repo_id}")

# Testing Model

In [1]:
import torch

In [2]:
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Hugging Face Hub repo id (a local directory path also works) of the fine-tuned model
fine_tuned_model_dir = 'Savoxism/distilbert_sentiment_analysis_final'

# Pick the GPU when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the fine-tuned weights and the matching tokenizer
model = DistilBertForSequenceClassification.from_pretrained(fine_tuned_model_dir)
tokenizer = DistilBertTokenizer.from_pretrained(fine_tuned_model_dir)

# Move the model to the device and switch to evaluation mode
model.to(device)
model.eval()

config.json:   0%|          | 0.00/720 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.25k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [3]:
import torch

def classify_review(review, model, tokenizer, device, max_length=64):
    """Predict the sentiment of one review.

    Args:
        review: Text passed to ``tokenizer``.
        model: Callable returning an object with a ``.logits`` tensor.
        tokenizer: Callable producing the model inputs; its result must
            support ``.to(device)`` and ``**`` unpacking.
        device: Device the tokenized inputs are moved to.
        max_length: Length the input is padded or truncated to.

    Returns:
        str: 'Positive', 'Negative' or 'Neutral'.
    """
    # Class id -> sentiment name; has to match the encoding used for fine-tuning.
    id_to_label = {0: 'Positive', 1: 'Negative', 2: 'Neutral'}

    # Tokenize the review and move the tensors to the target device.
    encoded = tokenizer(
        review,
        padding='max_length',
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    encoded = encoded.to(device)

    # Inference only, so gradient tracking is disabled.
    with torch.no_grad():
        scores = model(**encoded).logits
        predicted_id = torch.argmax(scores, dim=-1).item()

    return id_to_label[predicted_id]

In [6]:
# Read one review from the user and classify it with the fine-tuned model.
review = input("Enter a review: ")
predicted_sentiment = classify_review(review, model, tokenizer, device)

# Echo the review and the prediction on consecutive lines.
print(
    f"\nReview: {review}",
    f"Predicted Sentiment: {predicted_sentiment}",
    sep="\n",
)

Enter a review:  The new patch fixed a lot of bugs, but it introduced new ones that are just as annoying



Review: The new patch fixed a lot of bugs, but it introduced new ones that are just as annoying
Predicted Sentiment: Negative
