## Importing All The Required Libraries

In [None]:
import pandas as pd
import re
from transformers import AutoTokenizer
from datasets import Dataset
import numpy as np
from sklearn.metrics import accuracy_score
import torch
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
    DataCollatorWithPadding
)
print("IMPORT SUCCESSFUL !")

## Load And Preprocess DataSet


In [None]:
# Load the raw CSV. Only the 'Title', 'Description' and 'Sentiment' columns
# are read below.
df = pd.read_csv("/content/dataset1.csv")
df.rename(columns={'Sentiment': 'label'}, inplace=True)

# Join title and description with a space so the last word of the title is not
# fused with the first word of the description. Missing values become empty
# strings first: NaN + str is NaN, which would discard the whole row's text
# and later be stringified to the literal "nan".
df['text'] = (
    df['Title'].fillna('').astype(str)
    + ' '
    + df['Description'].fillna('').astype(str)
).str.strip()
df.drop(['Title', 'Description'], axis=1, inplace=True)

# Class ids used for training:
# 0 -- Negative
# 1 -- Neutral
# 2 -- Positive
label_mapping = {"negative": 0, "neutral": 1, "positive": 2}

# Normalise case/whitespace before mapping so values such as "Positive" or
# " neutral" are accepted instead of silently turning into NaN.
raw_labels = df["label"]
df["label"] = raw_labels.astype(str).str.strip().str.lower().map(label_mapping)

# Fail early with a readable message; otherwise astype(int) below would raise
# an opaque NaN-to-integer conversion error.
unmapped = df["label"].isna()
if unmapped.any():
    bad_values = sorted(raw_labels[unmapped].astype(str).unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a sentiment value outside "
        f"{sorted(label_mapping)}; examples: {bad_values[:5]}"
    )

df["text"] = df["text"].astype(str)
df["label"] = df["label"].astype(int)

## Converting DataSet Into Hugging Face DataSet

In [None]:
# Convert the two model-relevant columns of the DataFrame into a Hugging Face
# Dataset. preserve_index=False keeps the pandas index out of the result, so a
# non-default index (e.g. after rows were filtered) cannot show up as an extra
# "__index_level_0__" column.
dataset = Dataset.from_pandas(df[["text", "label"]], preserve_index=False)


## Load Tokenizer And Pretrained Model, Then Fine-Tune


In [None]:
# Checkpoint name shared by the tokenizer here and the model loaded later.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

# Tokenization
def tokenize_function(example):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.
    Returns whatever the tokenizer call returns for that input.
    """
    encoded = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    return encoded

# Tokenize the whole dataset in batches.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split["train"], split["test"]

# Collator that pads each batch with the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
from transformers import Trainer, AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments
from datasets import load_dataset, DatasetDict
import torch

# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the pretrained checkpoint with a 3-way classification head, then move
# it to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)
model = model.to(device)

# Convert labels to int if needed
def cast_labels(example):
    """Coerce the "label" entry of one example to a built-in int.

    The example is modified in place and the same object is returned, which
    is the shape expected from a per-row mapping function.
    """
    label_as_int = int(example["label"])
    example["label"] = label_as_int
    return example

# train_dataset and eval_dataset were split off from tokenized_datasets before
# this point, so casting only tokenized_datasets would never reach the data
# handed to the Trainer. Apply the cast to the splits as well.
tokenized_datasets = tokenized_datasets.map(cast_labels)
train_dataset = train_dataset.map(cast_labels)
eval_dataset = eval_dataset.map(cast_labels)

# Training configuration.
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate and checkpoint once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    # Only request fp16 mixed precision when a CUDA device is present: the
    # script explicitly supports a CPU fallback, and transformers rejects
    # fp16 on CPU-only setups.
    fp16=torch.cuda.is_available(),
    # With a per-device batch of 32, accumulating over 2 steps gives an
    # effective batch of 64 per device.
    gradient_accumulation_steps=2,
    logging_dir="./logs",
    logging_steps=50,
    report_to="none",
    # Keep at most two checkpoints on disk.
    save_total_limit=2,
)

def _epoch_accuracy(eval_pred):
    """Return {"accuracy": value} for the (logits, labels) pair from the Trainer."""
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    return {"accuracy": float(accuracy_score(labels, predicted))}


# Initialize Trainer.
# data_collator: the collator built alongside the tokenizer was never passed
#   in; wiring it up keeps batching correct even if the tokenization step
#   stops padding to a fixed length.
# compute_metrics: without it, the per-epoch evaluation and trainer.evaluate()
#   report the loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    compute_metrics=_epoch_accuracy,
)

# Train the model
trainer.train()

## Evaluate Model Performance

In [None]:
from evaluate import load

# Accuracy metric object used by compute_metrics.
metric = load("accuracy")

# Run one evaluation pass on the trainer's own evaluation dataset and show
# the resulting metrics.
eval_result = trainer.evaluate()
print("Evaluation result:", eval_result)

# Function to compute accuracy
def compute_metrics(eval_pred):
    """Score a (logits, labels) pair with the module-level accuracy metric.

    The predicted class of each row is the index of its largest logit along
    the last axis. Returns whatever ``metric.compute`` returns.
    """
    raw_scores, gold = eval_pred
    predicted_ids = torch.tensor(raw_scores).argmax(dim=-1).numpy()
    return metric.compute(predictions=predicted_ids, references=gold)

# Trainer-side metrics on the held-out split.
results = trainer.evaluate(eval_dataset)
print(f"Evaluation Results: {results}")

# Raw model outputs and reference labels for the same split.
predictions = trainer.predict(eval_dataset)

# Accuracy computed from those raw outputs.
scores_and_labels = (predictions.predictions, predictions.label_ids)
accuracy = compute_metrics(scores_and_labels)
print(f"Test Accuracy: {accuracy['accuracy']:.4f}")

In [None]:
# Write the model and its tokenizer into the same directory so both can be
# reloaded from one path.
save_dir = "./sentiment_model"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Load model & tokenizer from the directory written by the save_pretrained
# calls above. The previous path ("/content/sentiment2_model") is not written
# anywhere in this script, so the reload would not find the model just saved.
model_path = "./sentiment_model"
model = AutoModelForSequenceClassification.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move to GPU if available
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

import torch

def predict_sentiment(texts):
    """Predict a sentiment class id for each input text.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: A single string or an iterable of strings.

    Returns:
        A NumPy array with one class id per text (argmax over the logits).
        Under the label mapping used in this file: 0 = negative,
        1 = neutral, 2 = positive. Empty input yields an empty array.
    """
    if isinstance(texts, str):
        texts = [texts]
    else:
        texts = list(texts)

    # Nothing to classify: return early instead of calling the tokenizer and
    # model with an empty batch.
    if not texts:
        return np.empty(0, dtype=np.int64)

    # torch.no_grad() does not switch off dropout; force eval mode so results
    # are deterministic even if `model` is the instance that was just trained.
    model.eval()

    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(device)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Move to CPU and convert to NumPy for readability.
    return torch.argmax(logits, dim=-1).cpu().numpy()

# Class ids printed below:
# 0 -- Negative
# 1 -- Neutral
# 2 -- Positive

# Example prediction on a single sentence.
texts = [
    "100 peoplle died in a bomb blast.",
]
predictions = predict_sentiment(texts=texts)
print(predictions)