## Load and Preprocess the Data

In [2]:
import pandas as pd

# Location of the preprocessed chatbot dataset (relative to the notebook)
DATA_PATH = 'processed_chatbot_data.csv'

# Read the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the leading rows as the cell's output
df.head(5)

Unnamed: 0,clean_text,sentiment,NER
0,game hurt,Negative,['game']
1,sexuality ’ grouping category makes different ...,Neutral,"['’ grouping category', 'definition grouping']"
2,right dont care fuck em,Negative,"['fuck', 'em']"
3,man love reddit,Positive,"['man', 'reddit']"
4,name nowhere near falcon,Positive,['falcon']


In [None]:
# Report how many rows have no text at all before cleaning
print(df['clean_text'].isnull().sum())

# Build the cleaned frame in a single chain so that no column is assigned
# onto a filtered frame (assigning to the result of `dropna` may trigger
# pandas' SettingWithCopyWarning):
#   1. drop rows whose text is missing,
#   2. coerce the remaining text values to `str`,
#   3. discard rows whose text is empty or whitespace-only.
df = (
    df.dropna(subset=['clean_text'])
      .assign(clean_text=lambda frame: frame['clean_text'].astype(str))
      .loc[lambda frame: frame['clean_text'].str.strip() != '']
)

# Verify the cleaned data
print(df.head())

398
                                          clean_text sentiment  \
0                                          game hurt  Negative   
1  sexuality ’ grouping category makes different ...   Neutral   
2                            right dont care fuck em  Negative   
3                                    man love reddit  Positive   
4                           name nowhere near falcon  Positive   

                                              NER  
0                                        ['game']  
1  ['’ grouping category', 'definition grouping']  
2                                  ['fuck', 'em']  
3                               ['man', 'reddit']  
4                                      ['falcon']  


## Tokenize the Text

In [5]:
from transformers import BertTokenizer

# Checkpoint whose vocabulary the tokenizer is loaded from
PRETRAINED_NAME = 'bert-base-uncased'

# Instantiate the tokenizer for that checkpoint
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_NAME)

def tokenize_data(texts, max_len=128):
    """Encode texts with the notebook-level ``tokenizer``.

    Args:
        texts: Text input forwarded unchanged to the tokenizer (the notebook
            passes a list of strings).
        max_len: Value passed as ``max_length``; with ``truncation=True``
            longer inputs are cut down to this many tokens.

    Returns:
        Whatever the tokenizer returns when called with ``padding=True``,
        ``truncation=True`` and ``return_tensors='pt'``.
    """
    encode_options = {
        'padding': True,
        'truncation': True,
        'max_length': max_len,
        'return_tensors': 'pt',
    }
    return tokenizer(texts, **encode_options)

# Pull the cleaned text column out as a plain Python list
texts = df['clean_text'].to_list()

# Encode every text in one batch
tokenized_data = tokenize_data(texts=texts)

## Prepare Labels

In [6]:
# Define label mapping (sentiment name -> integer class id)
label_map = {
    'Extremely Negative': 0,
    'Negative': 1,
    'Neutral': 2,
    'Positive': 3,
    'Extremely Positive': 4
}

# Convert sentiment labels to numerical values; sentiments that are not in
# `label_map` become NaN
df = df.assign(label=df['sentiment'].map(label_map))

# Drop rows with missing labels and store the ids as integers (NaNs produced
# by `map` would otherwise leave the column as float)
has_label = df['label'].notna()
df = df[has_label].astype({'label': 'int64'})

# The texts were tokenized before this filter. If any row was dropped here,
# the encodings no longer line up row-for-row with the labels, so rebuild
# them from the filtered frame to keep text i paired with label i.
if not has_label.all():
    texts = df['clean_text'].tolist()
    tokenized_data = tokenize_data(texts)

# Extract labels
labels = df['label'].to_numpy()

## Fine-Tuning

### Create Dataset and DataLoader

In [7]:
from torch.utils.data import Dataset, DataLoader
import torch

class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Args:
        encodings: Object exposing ``.items()`` that yields
            ``(field_name, batch)`` pairs, where each ``batch`` can be
            indexed by sample position (e.g. the tokenizer output).
        labels: Indexable sequence holding one class id per sample.

    Raises:
        ValueError: If any encoding field does not contain exactly one row
            per label. This catches encodings and labels that were built
            from differently filtered data before training on wrong pairs.
    """

    def __init__(self, encodings, labels):
        expected_rows = len(labels)
        for key, val in encodings.items():
            if len(val) != expected_rows:
                raise ValueError(
                    f"Encoding field '{key}' has {len(val)} rows but "
                    f"{expected_rows} labels were given; encodings and "
                    f"labels must be aligned one-to-one."
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One sample per label
        return len(self.labels)

    def __getitem__(self, idx):
        # Slice every encoding field at the same position ...
        item = {key: val[idx] for key, val in self.encodings.items()}
        # ... and attach the label as an integer tensor under 'labels'
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap encodings and labels in a dataset, then serve it in shuffled mini-batches
BATCH_SIZE = 8
dataset = SentimentDataset(encodings=tokenized_data, labels=labels)
dataloader = DataLoader(dataset=dataset, batch_size=BATCH_SIZE, shuffle=True)

### Fine-Tune the Model

In [None]:
from transformers import BertForSequenceClassification
# `AdamW` is taken from PyTorch: the copy exported by `transformers` is
# deprecated and missing from recent releases. The name `AdamW` stays bound.
from torch.optim import AdamW

# Load the pretrained BERT model with a 5-way classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=5)

# Move model to GPU if available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Optimizer. `eps` and `weight_decay` are spelled out to keep the values the
# former `transformers.AdamW` used by default (PyTorch's defaults differ)
# -- NOTE(review): confirm against the transformers version this ran on.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

# Training loop
model.train()
for epoch in range(3):  # Number of epochs
    for batch in dataloader:
        optimizer.zero_grad()

        # Move batch to device. The per-batch labels get their own name so
        # the notebook-level `labels` array (used to build the dataset) is
        # not overwritten by the last mini-batch.
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        batch_labels = batch['labels'].to(device)

        # Forward pass (passing labels makes the model return the loss)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch + 1} completed.")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Evaluation

In [None]:
from torch.nn.functional import softmax

def predict_sentiment(text):
    """Return the sentiment name the model predicts for ``text``.

    Uses the notebook-level ``tokenizer``, ``model`` and ``device``. The
    model is switched to eval mode and run without gradient tracking.

    Args:
        text: The input passed to ``tokenizer.encode_plus``.

    Returns:
        One of "Extremely Negative", "Negative", "Neutral", "Positive" or
        "Extremely Positive", chosen by the highest class probability.
    """
    # Class id -> human-readable sentiment name
    sentiment_by_id = dict(enumerate((
        "Extremely Negative",
        "Negative",
        "Neutral",
        "Positive",
        "Extremely Positive",
    )))

    # Encode the text as fixed-length (128) PyTorch tensors
    encoded = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        return_token_type_ids=False,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    # Inference only: eval mode, no autograd bookkeeping
    model.eval()
    with torch.no_grad():
        logits = model(input_ids=ids, attention_mask=mask).logits
        class_probs = softmax(logits, dim=1)
        best_id = class_probs.argmax(dim=1).item()

    return sentiment_by_id[best_id]

# A few hand-written examples to run through the classifier
sample_texts = [
    "I am so excited about this project!",
    "I feel really anxious about the future.",
    "It's just another ordinary day."
]

# Pair each example with its prediction; `map` is lazy, so every text is
# classified right before its line is printed
for text, sentiment in zip(sample_texts, map(predict_sentiment, sample_texts)):
    print(f"Text: '{text}' -> Sentiment: {sentiment}")