**Bert**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the data
file_path = '/kaggle/input/bbc-data-new/balanced_news_samples_max.csv'
data = pd.read_csv(file_path)

# Encode the sentiment labels
label_map = {'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}
data['Sentiment'] = data['Sentiment'].map(label_map)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['Title'], data['Sentiment'], test_size=0.2, random_state=42)

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create datasets
train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = NewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
)

# Define evaluation metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model}
eval_result = trainer.evaluate()
print(f"Accuracy for BERT: {eval_result['eval_accuracy']}")
##API KEY Wandbai 9c087fc721babdf8740dfa1e1b561057a4d9a03e

2024-08-13 14:20:55.660318: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 14:20:55.660373: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 14:20:55.661890: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Currently logged in as: [33mali408mehmood[0m ([33

Epoch,Training Loss,Validation Loss,Accuracy
1,0.7413,0.716019,0.682399
2,0.5338,0.758032,0.705998
3,0.3712,0.843619,0.697148
4,0.2824,1.057486,0.694199
5,0.1867,1.154317,0.691249


Accuracy for BERT: 0.6823992133726647


**DistilBert**

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the data
file_path = '/kaggle/input/bbc-data-new/balanced_news_samples_max.csv'  
data = pd.read_csv(file_path)

# Combine the 'Title' and 'Description' columns to create the 'Text' column
data['Text'] = data['Title'] + " " + data['Description']

# Drop rows where 'Sentiment' or 'Text' has NaN values
data = data.dropna(subset=['Sentiment', 'Text'])

# Encode the sentiment labels
label_map = {'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}
data['Sentiment'] = data['Sentiment'].map(label_map)

# After mapping, check if there are any NaN values (due to unmapped labels)
if data['Sentiment'].isna().any():
    print("There are NaN values in the 'Sentiment' column after mapping. They will be dropped.")
    data = data.dropna(subset=['Sentiment'])

# Ensure all labels are integers
data['Sentiment'] = data['Sentiment'].astype(int)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['Text'], data['Sentiment'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create the datasets
train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = NewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Load the pre-trained model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
)

# Define evaluation metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-distilbert")
tokenizer.save_pretrained("fine-tuned-distilbert")

# Print the evaluation results
print(f"Evaluation results: {eval_result}")


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.5594,0.581324,0.751721
2,0.409,0.621881,0.755162
3,0.2186,0.711199,0.757129
4,0.1967,0.819853,0.761554
5,0.1656,0.914101,0.762537


Evaluation results: {'eval_loss': 0.581323504447937, 'eval_accuracy': 0.7517207472959685, 'eval_runtime': 6.7489, 'eval_samples_per_second': 301.383, 'eval_steps_per_second': 18.966, 'epoch': 5.0}


**Roberta-large**

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the data
file_path = '/kaggle/input/bbc-data-new/balanced_news_samples_max.csv'  
data = pd.read_csv(file_path)

# Combine the 'Title' and 'Description' columns to create the 'Text' column
data['Text'] = data['Title'] + " " + data['Description']

# Drop rows where 'Sentiment' or 'Text' has NaN values
data = data.dropna(subset=['Sentiment', 'Text'])

# Encode the sentiment labels
label_map = {'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}
data['Sentiment'] = data['Sentiment'].map(label_map)

# Ensure all labels are integers
data['Sentiment'] = data['Sentiment'].astype(int)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['Text'], data['Sentiment'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create the datasets
train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = NewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Load the pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-large', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
    fp16=True,  # Use mixed precision for faster training
)

# Define evaluation metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-roberta-large")
tokenizer.save_pretrained("fine-tuned-roberta-large")

# Print the evaluation results
print(f"Evaluation results: {eval_result}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4168,0.50244,0.793019
2,0.4266,0.673724,0.791052
3,0.3035,0.887554,0.80826
4,0.1224,1.080547,0.807768
5,0.0288,1.215567,0.80826


Evaluation results: {'eval_loss': 0.5024403929710388, 'eval_accuracy': 0.7930186823992134, 'eval_runtime': 31.5184, 'eval_samples_per_second': 64.534, 'eval_steps_per_second': 8.091, 'epoch': 5.0}


In [13]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load the fine-tuned model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("fine-tuned-roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("fine-tuned-roberta-large")

# Set the model in evaluation mode
model.eval()

# Define the function to predict sentiment
def predict_sentiment(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    label_map = {0: 'POSITIVE', 1: 'NEGATIVE', 2: 'NEUTRAL'}
    return label_map[predicted_class]

# Define a function to extract important tokens
def predict_important_tokens(text):
    inputs = tokenizer(text, return_tensors='pt')
    
    # Get embeddings from the model's input layer
    embeddings = model.roberta.embeddings(inputs['input_ids'])
    embeddings.retain_grad()
    
    # Forward pass through the model
    outputs = model(inputs_embeds=embeddings)
    logits = outputs.logits

    # Calculate gradients for importance scores
    logits[:, logits.argmax(dim=-1)].backward()
    gradients = embeddings.grad
    token_importance = gradients.abs().sum(dim=2).squeeze()

    # Get the tokens and their importance scores
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())
    importance_scores = token_importance.detach().cpu().numpy()

    # Pair tokens with their importance scores and sort by score
    token_importance_pairs = [(token, score) for token, score in zip(tokens, importance_scores)]
    token_importance_pairs = sorted(token_importance_pairs, key=lambda x: x[1], reverse=True)

    # Clean the tokens to remove special characters and select only alphabetic tokens
    important_tokens = [clean_token(pair[0]) for pair in token_importance_pairs[:5]]
    return [token for token in important_tokens if token.isalpha()]

# Function to clean tokens
def clean_token(token):
    # Remove special characters, e.g., "Ġ" that indicates a space in RoBERTa tokenization
    return token.replace('Ġ', '').replace('<s>', '').replace('</s>', '')

# Test with a few news statements
news_statements = [
    "The economy is showing signs of recovery after the pandemic.",
    "A massive earthquake has caused significant damage in the city.",
    "The new technology product received mixed reviews from the public.",
    "The government announced a new policy that is expected to benefit small businesses."
]

for statement in news_statements:
    sentiment = predict_sentiment(statement)
    important_tokens = predict_important_tokens(statement)
    print(f"News: {statement}\nPredicted Sentiment: {sentiment}\nMost Important Tokens: {important_tokens}\n")

News: The economy is showing signs of recovery after the pandemic.
Predicted Sentiment: POSITIVE
Most Important Tokens: ['pand', 'emic', 'economy']

News: A massive earthquake has caused significant damage in the city.
Predicted Sentiment: NEGATIVE
Most Important Tokens: ['city', 'earthquake', 'significant']

News: The new technology product received mixed reviews from the public.
Predicted Sentiment: NEUTRAL
Most Important Tokens: ['mixed', 'reviews', 'technology', 'product', 'new']

News: The government announced a new policy that is expected to benefit small businesses.
Predicted Sentiment: POSITIVE
Most Important Tokens: ['government', 'businesses', 'expected', 'policy']



In [3]:
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Load the fine-tuned model and tokenizer
model = RobertaForSequenceClassification.from_pretrained("fine-tuned-roberta-large")
tokenizer = RobertaTokenizer.from_pretrained("fine-tuned-roberta-large")

# Define the function to predict sentiment
def predict_sentiment(text):
    inputs = tokenizer(text, padding=True, truncation=True, max_length=128, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    label_map = {0: 'POSITIVE', 1: 'NEGATIVE', 2: 'NEUTRAL'}
    return label_map[predicted_class]

# Test with a few news statements
news_statements = [
    "The economy is showing signs of recovery after the pandemic.",
    "A massive earthquake has caused significant damage in the city.",
    "The new technology product received mixed reviews from the public.",
    "The government announced a new policy that is expected to benefit small businesses."
]

for statement in news_statements:
    sentiment = predict_sentiment(statement)
    print(f"News: {statement}\nPredicted Sentiment: {sentiment}\n")


News: The economy is showing signs of recovery after the pandemic.
Predicted Sentiment: POSITIVE

News: A massive earthquake has caused significant damage in the city.
Predicted Sentiment: NEGATIVE

News: The new technology product received mixed reviews from the public.
Predicted Sentiment: NEGATIVE

News: The government announced a new policy that is expected to benefit small businesses.
Predicted Sentiment: POSITIVE



In [8]:
from transformers import TFRobertaForSequenceClassification

# Load the TensorFlow model (already converted from PyTorch)
tf_model = TFRobertaForSequenceClassification.from_pretrained('fine-tuned-roberta-large', from_pt=True)

# Save the model in TensorFlow's SavedModel format
tf_model.save_pretrained("fine-tuned-roberta-large-tf")


All PyTorch model weights were used when initializing TFRobertaForSequenceClassification.

All the weights of TFRobertaForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaForSequenceClassification for predictions without further training.


In [11]:
import os

# Function to split a large file into smaller parts
def split_file(file_path, chunk_size):
    with open(file_path, 'rb') as f:
        chunk = f.read(chunk_size)
        i = 0
        while chunk:
            with open(f'{file_path}.part{i}', 'wb') as chunk_file:
                chunk_file.write(chunk)
            i += 1
            chunk = f.read(chunk_size)

# Compress with maximum compression and split into 500MB chunks
shutil.make_archive('fine-tuned-roberta-large-tf', 'zip', 'fine-tuned-roberta-large-tf')
split_file('fine-tuned-roberta-large-tf.zip', 500 * 1024 * 1024)  # 500MB chunks


**Bert-large-uncased**

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the data
file_path = '/kaggle/input/bbc-data-new/balanced_news_samples_max.csv'  
data = pd.read_csv(file_path)

# Combine the 'Title' and 'Description' columns to create the 'Text' column
data['Text'] = data['Title'] + " " + data['Description']

# Drop rows where 'Sentiment' or 'Text' has NaN values
data = data.dropna(subset=['Sentiment', 'Text'])

# Encode the sentiment labels
label_map = {'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}
data['Sentiment'] = data['Sentiment'].map(label_map)

# Ensure all labels are integers
data['Sentiment'] = data['Sentiment'].astype(int)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['Text'], data['Sentiment'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create the datasets
train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = NewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Load the pre-trained model
model = BertForSequenceClassification.from_pretrained('bert-large-uncased', num_labels=3)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
)

# Define evaluation metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-bert-large-uncased")
tokenizer.save_pretrained("fine-tuned-bert-large-uncased")

# Print the evaluation results
print(f"Evaluation results: {eval_result}")


2024-08-13 17:13:45.720791: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-13 17:13:45.720905: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-13 17:13:45.863291: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4766,0.538716,0.764503
2,0.2482,0.741872,0.776794
3,0.138,0.990956,0.786136
4,0.1081,1.189167,0.786627
5,0.0008,1.303304,0.793019


Evaluation results: {'eval_loss': 0.5387162566184998, 'eval_accuracy': 0.7645034414945919, 'eval_runtime': 30.1041, 'eval_samples_per_second': 67.566, 'eval_steps_per_second': 8.471, 'epoch': 5.0}


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import numpy as np

# Load the data
file_path = '/kaggle/input/bbc-data-new/balanced_news_samples_max.csv'
data = pd.read_csv(file_path)

# Combine the 'Title' and 'Description' columns to create the 'Text' column
data['Text'] = data['Title'] + " " + data['Description']

# Drop rows where 'Sentiment' or 'Text' has NaN values
data = data.dropna(subset=['Sentiment', 'Text'])

# Encode the sentiment labels
label_map = {'POSITIVE': 0, 'NEGATIVE': 1, 'NEUTRAL': 2}
data['Sentiment'] = data['Sentiment'].map(label_map)

# Ensure all labels are integers
data['Sentiment'] = data['Sentiment'].astype(int)

# Split the dataset into training and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(data['Text'], data['Sentiment'], test_size=0.2, random_state=42)

# Load the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define a custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer(text, padding='max_length', truncation=True, max_length=128, return_tensors='pt')
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Create the datasets
train_dataset = NewsDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
val_dataset = NewsDataset(val_texts.tolist(), val_labels.tolist(), tokenizer)

# Load the pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=3)

# Define training arguments with weight decay and other regularization techniques
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,  
    per_device_eval_batch_size=8,   
    num_train_epochs=5,
    weight_decay=0.01,  # Regularization term
    logging_dir='./logs',
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    save_strategy="epoch",
    fp16=True,  # Use mixed precision for faster training
)

# Define evaluation metric
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    return {"accuracy": (preds == p.label_ids).mean()}

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)

# Fine-tune the model
trainer.train()

# Evaluate the model
eval_result = trainer.evaluate()

# Save the fine-tuned model
model.save_pretrained("fine-tuned-roberta-base")
tokenizer.save_pretrained("fine-tuned-roberta-base")

# Print the evaluation results
print(f"Evaluation results: {eval_result}")


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4175,0.581895,0.778761
2,0.3647,0.734121,0.777778
3,0.0929,0.918367,0.798427
4,0.1839,1.156674,0.79646
5,0.0103,1.25218,0.797443


Evaluation results: {'eval_loss': 0.5818952322006226, 'eval_accuracy': 0.7787610619469026, 'eval_runtime': 10.908, 'eval_samples_per_second': 186.468, 'eval_steps_per_second': 23.377, 'epoch': 5.0}
