Import Libraries

In [None]:
# importing necessary libraries
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
import torch
from torch.nn.functional import softmax

Define Emoji Mappings

In [None]:
# defining emoji mappings
# Maps each supported emoji to a short, space-separated English description.
# preprocess_emojis() splits the description on whitespace and looks every
# token up in emoji_meanings, so the tokens here are spelled to match that
# dict's keys (e.g. "angri", "cri", "pensiv").
emoji_to_description = {
    "😀": "grin",
    "😊": "smile",
    "😂": "tear of joy",
    "🤣": "tear of joy",
    "😇": "halo",
    "😉": "wink",
    "😋": "savor food",
    "😎": "sunglass",
    "☀️": "sun",
    "😘": "blow a kiss",
    "😺": "cat",
    "❤️": "red heart",
    "💕": "two heart",
    "😡": "angri",
    "😐": "neutral",
    "😑": "expressionless",
    "😕": "confound",
    "😤": "steam from nose",
    "💩": "pile of poo",
    "😓": "sweat",
    "☹️": "frown",
    "😱": "scream fear",
    "😰": "anxious",
    "😔": "pensiv",
    "😩": "tired",
    "😢": "cri",
    "😥": "sad",
    "😴": "sleepi",
    "😞": "downcast",
    "💔": "broken heart",
    "😳": "flush",
    "🤫": "hush",
    "😵": "knockedout",
    "😲": "astonish"
}

# Maps an English description token to the Sinhala text substituted for it.
# preprocess_emojis() uses dict.get(token, token), so a token missing from
# this dict is left in English in the processed text.
# NOTE(review): several keys (e.g. "face", "with", "big", "squint") are not
# used by any description visible in emoji_to_description, and "grimac" maps
# to an empty string -- confirm whether these are intentional.
emoji_meanings = {
    "grin": "සිනහව",
    "face": "මුහුණ",
    "with": "සමග",
    "tear": "කඳුළු",
    "of": "වල",
    "joy": "සතුට",
    "big": "මහා",
    "eye": "ඇස",
    "smile": "සිනහව",
    "squint": "ඇස් දිලිසෙනවා",
    "halo": "හලෝ",
    "wink": "ඇසිපිය හෙළන්න",
    "savor": "රස විඳින්න",
    "food": "ආහාර",
    "reliev": "සහනය",
    "heartey": "හෘදයාංගම",
    "sunglass": "හිරු වීදුරු",
    "sun": "හිරු",
    "blow": "පිඹීම",
    "a": "ඒ",
    "kiss": "හාදුවක්",
    "tongue": "දිව",
    "cat": "බළලා",
    "red": "රතු",
    "heart": "හදවත",
    "two": "දෙක",
    "beam": "කදම්බ",
    "suit": "ඇඳුම",
    "grimac": "",
    "angri": "කෝපයෙන්",
    "neutral": "මධ්යස්ථ",
    "expressionless": "ප්රකාශ රහිත",
    "pout": "පුට්",
    "confound": "ව්යාකූල කරයි",
    "steam": "වාෂ්ප",
    "from": "සිට",
    "nose": "නාසය",
    "pile": "ගොඩවල්",
    "poo": "චූ කරන්න",
    "sweat": "දහඩිය",
    "frown": "නළල රැලි ගනියි",
    "mouth": "මුඛය",
    "scream": "කෑගහනවා",
    "fear": "බිය",
    "anxious": "කනස්සල්ලෙන්",
    "pensiv": "කල්පනාකාරී",
    "weari": "අඳිනවා",
    "confus": "ව්යාකූල කරයි",
    "tired": "මහන්සියි",
    "cri": "අඬනවා",
    "sad": "දුක",
    "sleepi": "නිදාගන්න",
    "downcast": "පහතට වැටී ඇත",
    "loudli": "හයියෙන්",
    "broken": "කැඩුණු",
    "flush": "ෆ්ලෂ්",
    "hush": "නිහඬයි",
    "knockedout": "තට්ටු කළා",
    "astonish": "විස්මිතයි"
}

Load and Combine Datasets

In [None]:
# reading both comment exports and stacking them into a single frame
fb_data, yt_data = (
    pd.read_csv(csv_path)
    for csv_path in ('Data/facebook_comments.csv', 'Data/channel_comments.csv')
)
combined_data = pd.concat([fb_data, yt_data], ignore_index=True)

# missing comments become empty strings, then every value is coerced to str
combined_data['text'] = combined_data['text'].fillna('').astype(str)

# previewing the first rows as a sanity check
combined_data.head()

Unnamed: 0,post_url,author,text,video_url
0,https://www.facebook.com/sinhala.adaderana.lk/...,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...,
1,https://www.facebook.com/sinhala.adaderana.lk/...,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න,
2,https://www.facebook.com/sinhala.adaderana.lk/...,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...,
3,https://www.facebook.com/sinhala.adaderana.lk/...,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...,
4,https://www.facebook.com/sinhala.adaderana.lk/...,පුපුරැදය,පුපුරැදය,


Preprocess Emojis

In [None]:
# preprocessing emojis by converting them to Sinhala descriptions
def preprocess_emojis(text):
    """Replace every known emoji in *text* with its Sinhala description.

    Falsy input (None, '', 0, ...) becomes ''; anything else is converted
    with str(). Blank or whitespace-only text is returned unchanged. Each
    emoji found in emoji_to_description is replaced by its description,
    translated token by token through emoji_meanings (untranslated tokens
    stay as-is) and padded with one space on each side.
    """
    cleaned = '' if not text else str(text)

    # nothing to translate in blank input
    if cleaned.strip() == '':
        return cleaned

    for symbol in emoji_to_description:
        if symbol not in cleaned:
            continue
        # translate the description word by word, keeping unknown words
        translated = [
            emoji_meanings.get(token, token)
            for token in emoji_to_description[symbol].split()
        ]
        cleaned = cleaned.replace(symbol, ' ' + ' '.join(translated) + ' ')
    return cleaned

# running every comment through the emoji translation
combined_data['processed_text'] = combined_data['text'].map(preprocess_emojis)

# previewing raw and processed text side by side
combined_data.loc[:, ['text', 'processed_text']].head()

Unnamed: 0,text,processed_text
0,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...
1,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න
2,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...
3,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...
4,පුපුරැදය,පුපුරැදය


Export a Subset of Comments for Labeling

In [None]:
# exporting a subset of comments for manual labeling
# number of comments to hand-label (adjust as needed)
num_to_label = 500
sample_to_label = combined_data[['text']].head(num_to_label)

# saving to a CSV file for manual labeling
sample_to_label.to_csv('to_label.csv', index=False)

# head() returns fewer rows when the dataset is smaller than num_to_label, so
# report the real count; the target path matches what the cleaning cell reads
print(f"Exported {len(sample_to_label)} comments to 'to_label.csv'. Please manually add a 'label' column with values 'positive', 'negative', or 'neutral', then save the file as 'Data/labeled_comments.csv'.")

Exported 500 comments to 'to_label.csv'. Please manually add a 'label' column with values 'positive', 'negative', or 'neutral', then save the file as 'labeled_comments.csv'.


Inspect and Clean the Labeled Dataset

In [None]:
# loading the labeled dataset as a DataFrame to inspect and clean
labeled_data = pd.read_csv('Data/labeled_comments.csv')

# reporting missing or unexpected labels before any cleaning
print("Inspecting 'labeled_comments.csv':")
print("Total rows:", len(labeled_data))
print("\nMissing values in 'label' column:", labeled_data['label'].isna().sum())
print("\nUnique values in 'label' column:")
print(labeled_data['label'].value_counts(dropna=False))

# cleaning the dataset
# labels are typed by hand, so stray whitespace / capitalisation is normalised
# before validation instead of silently discarding those rows; the nullable
# 'string' dtype keeps missing labels as <NA> (which isin() treats as invalid)
valid_labels = ['positive', 'negative', 'neutral']
normalized_labels = labeled_data['label'].astype('string').str.strip().str.lower()
is_valid = normalized_labels.isin(valid_labels)

# copy() so the label assignment below does not write into a view of labeled_data
labeled_data_cleaned = labeled_data[is_valid].copy()
labeled_data_cleaned['label'] = normalized_labels[is_valid].astype(object)

# checking how many rows were removed
print("\nRows after cleaning:", len(labeled_data_cleaned))
print("Rows removed:", len(labeled_data) - len(labeled_data_cleaned))

# saving the cleaned dataset
labeled_data_cleaned.to_csv('labeled_comments_cleaned.csv', index=False)
print("\nCleaned dataset saved to 'labeled_comments_cleaned.csv'.")

Inspecting 'labeled_comments.csv':
Total rows: 195

Missing values in 'label' column: 1

Unique values in 'label' column:
label
negative    131
neutral      36
positive     27
NaN           1
Name: count, dtype: int64

Rows after cleaning: 194
Rows removed: 1

Cleaned dataset saved to 'labeled_comments_cleaned.csv'.


Fine-Tune the XLM-RoBERTa Model

In [None]:
# importing training utilities: DataLoader, the AdamW optimizer and the linear warmup scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW  # Updated import for AdamW
from transformers import get_linear_schedule_with_warmup  

# reading the cleaned, labeled comments for fine-tuning
dataset = load_dataset('csv', data_files='labeled_comments_cleaned.csv')

# class name -> integer id used as the training target
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}


def _label_to_id(example):
    # swaps the textual label of one example for its integer id
    return {'label': label_map[example['label']]}


dataset = dataset.map(_label_to_id)

# XLM-RoBERTa tokenizer plus a three-class sequence classification model
model_name = 'xlm-roberta-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# tokenizing the dataset
def tokenize_function(examples):
    """Tokenize a batch of examples for fine-tuning.

    The texts go through preprocess_emojis() first so the model is trained
    on the same emoji-translated form it is later asked to score
    (combined_data['processed_text']); this also turns missing text into ''
    instead of handing None to the tokenizer.

    Args:
        examples: batch dict passed by Dataset.map(batched=True); its 'text'
            entry is the list of comments in the batch.

    Returns:
        The tokenizer output, padded/truncated to 128 tokens. Plain lists are
        returned (no return_tensors) because the torch conversion is done
        afterwards by set_format(type='torch').
    """
    texts = [preprocess_emojis(text) for text in examples['text']]
    return tokenizer(texts, padding='max_length', truncation=True, max_length=128)

# tokenizing every example in batches
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# exposing only the model inputs and the label, as torch tensors
tokenized_datasets.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# 80% train / 20% validation, cut from one seeded shuffle of the data
shuffled_examples = tokenized_datasets['train'].shuffle(seed=42)
num_examples = len(tokenized_datasets['train'])
split_at = int(0.8 * num_examples)
train_dataset = shuffled_examples.select(range(split_at))
eval_dataset = shuffled_examples.select(range(split_at, num_examples))

# batching: training batches are reshuffled, validation order is kept
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=8, shuffle=False)

# moving model to the best available accelerator: CUDA GPU, then Apple MPS, else CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    # the getattr guard covers torch builds that do not ship the MPS backend
    device = torch.device('mps')
else:
    device = torch.device('cpu')
model.to(device)

# defining optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
num_epochs = 3
total_steps = len(train_dataloader) * num_epochs  # ~60 steps (155 examples / batch_size 8 * 3 epochs)
warmup_steps = 10
# learning rate ramps up linearly for warmup_steps, then decays linearly to 0 at total_steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# training loop
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # training phase: switch to train mode here (not at the end of the
    # epoch) so the model is never left with dropout enabled after the loop
    model.train()
    total_train_loss = 0
    for step, batch in enumerate(train_dataloader):
        # moving batch to device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # forward pass (passing labels makes the model return the loss)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        # backward pass, then optimizer and LR-schedule update
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # log training loss every 10 steps
        if (step + 1) % 10 == 0:
            print(f"Step {step + 1}/{len(train_dataloader)}, Loss: {loss.item():.4f}")

    avg_train_loss = total_train_loss / len(train_dataloader)
    print(f"Average training loss: {avg_train_loss:.4f}")

    # evaluation phase: eval mode, no gradient tracking
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in eval_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_eval_loss += loss.item()

    avg_eval_loss = total_eval_loss / len(eval_dataloader)
    print(f"Average evaluation loss: {avg_eval_loss:.4f}")

# leave the model in eval mode: it is saved and then used for inference, and
# train mode would keep dropout active and make predictions stochastic
model.eval()

# saving the fine-tuned model
model.save_pretrained('fine_tuned_xlm_roberta')
tokenizer.save_pretrained('fine_tuned_xlm_roberta')

print("Fine-tuning complete. Model saved to 'fine_tuned_xlm_roberta'.")
print(f"Model is on device: {next(model.parameters()).device}")

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/194 [00:00<?, ? examples/s]

Epoch 1/3
Step 10/20, Loss: 1.0326
Step 20/20, Loss: 0.8523
Average training loss: 1.0085
Average evaluation loss: 1.0083
Epoch 2/3
Step 10/20, Loss: 0.4134
Step 20/20, Loss: 0.6536
Average training loss: 0.8317
Average evaluation loss: 0.9612
Epoch 3/3
Step 10/20, Loss: 1.1427
Step 20/20, Loss: 0.4896
Average training loss: 0.8548
Average evaluation loss: 1.0763
Fine-tuning complete. Model saved to 'fine_tuned_xlm_roberta'.
Model is on device: mps:0


Define Functions for Tokenization and Sentiment Prediction

In [None]:
# function to tokenize and encode the text
def encode_text(text, max_length=128):
    """Tokenize *text* and place the resulting tensors on the model's device.

    Args:
        text: the text to encode.
        max_length: length the encoding is padded / truncated to.

    Returns:
        dict mapping each tokenizer output name to its tensor, moved to
        ``device``.
    """
    encoded = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt',
    )
    # the model and its inputs must live on the same device
    on_device = {}
    for name, tensor in encoded.items():
        on_device[name] = tensor.to(device)
    return on_device

# function to predict sentiment
def predict_sentiment(text):
    """Classify *text* with the fine-tuned model.

    Args:
        text: the comment to classify.

    Returns:
        (sentiment, probs): the predicted label ('negative', 'neutral' or
        'positive') and a numpy array with the softmax probability of each
        class in that same order.
    """
    # the training cell toggles train mode; force eval mode so dropout is
    # disabled and the same text always yields the same prediction
    model.eval()
    inputs = encode_text(text)
    with torch.no_grad():
        logits = model(**inputs).logits
        probs = softmax(logits, dim=1).cpu().numpy()[0]  # moving to CPU for numpy conversion
    # index order matches label_map used for training: 0, 1, 2
    labels = ['negative', 'neutral', 'positive']
    sentiment = labels[np.argmax(probs)]
    return sentiment, probs

Apply Sentiment Analysis

In [None]:
# scoring every preprocessed comment with the fine-tuned model
predictions = [predict_sentiment(comment) for comment in combined_data['processed_text']]

# splitting the (label, probabilities) pairs into two parallel lists
sentiments = [predicted_label for predicted_label, _ in predictions]
probabilities = [class_probs for _, class_probs in predictions]

# attaching both lists to the DataFrame
combined_data['sentiment'] = sentiments
combined_data['sentiment_probabilities'] = probabilities

# previewing the results
combined_data[['text', 'processed_text', 'sentiment', 'sentiment_probabilities']].head()

Unnamed: 0,text,processed_text,sentiment,sentiment_probabilities
0,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...,මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?\nඅහස...,negative,"[0.74273384, 0.13335575, 0.12391042]"
1,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න,තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න,negative,"[0.85549426, 0.08701834, 0.057487454]"
2,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...,වාහන මිල මොනවා වුනත් ප්\n‍\nරශ්නයක් නෙවෙයි උන්...,negative,"[0.7033787, 0.16769668, 0.12892465]"
3,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...,බොරු ගනන් දිලා අත්\n‍\nයවශ්\n‍\nය ම නොවේ නම් ග...,negative,"[0.5697927, 0.2342302, 0.19597708]"
4,පුපුරැදය,පුපුරැදය,negative,"[0.6523738, 0.20113891, 0.14648731]"


Evaluate Sentiment Analysis Results

In [None]:
# summarising the predicted sentiments

# absolute number of comments per predicted sentiment
sentiment_counts = combined_data['sentiment'].value_counts()
print("Number of Comments per Sentiment:", sentiment_counts, "", sep="\n")

# share of each predicted sentiment, in percent
sentiment_percentages = combined_data['sentiment'].value_counts(normalize=True) * 100
print("Percentage Distribution of Sentiments:", sentiment_percentages, "", sep="\n")

# average confidence per sentiment needs, for every row, the probability the
# model assigned to the class it actually predicted
def get_predicted_prob(row):
    """Return the probability of the row's predicted sentiment.

    *row* must provide 'sentiment' (one of 'negative', 'neutral',
    'positive') and 'sentiment_probabilities' (indexable in that class
    order). An unknown sentiment raises KeyError.
    """
    position_of = {
        name: position
        for position, name in enumerate(('negative', 'neutral', 'positive'))
    }
    position = position_of[row['sentiment']]
    return row['sentiment_probabilities'][position]

# confidence of each prediction, then its mean per predicted sentiment
combined_data['predicted_prob'] = combined_data.apply(get_predicted_prob, axis=1)
avg_probabilities = combined_data.groupby('sentiment')['predicted_prob'].mean()
print("Average Confidence (Probability) per Sentiment:", avg_probabilities, "", sep="\n")

# printing up to 3 example comments for every sentiment class
print("Sample Comments for Each Sentiment:")
shown_columns = ['text', 'processed_text', 'predicted_prob']
for sentiment in ['positive', 'negative', 'neutral']:
    print(f"\nSentiment: {sentiment}")
    # rows predicted as the current sentiment, first three only
    is_current = combined_data['sentiment'] == sentiment
    sample_comments = combined_data.loc[is_current, shown_columns].head(3)
    for _, row in sample_comments.iterrows():
        print(f"- Text: {row['text']}")
        print(f"  Processed Text: {row['processed_text']}")
        print(f"  Confidence: {row['predicted_prob']:.4f}")

# saving the evaluation summary to a CSV file
# sentiment_counts is ordered by frequency while avg_probabilities (a groupby
# result) is ordered alphabetically by sentiment, so both derived series are
# re-aligned to sentiment_counts' index before their raw values are combined;
# otherwise a confidence could land on the wrong sentiment's row
summary_order = sentiment_counts.index
summary_data = {
    'Sentiment': summary_order,
    'Count': sentiment_counts.values,
    'Percentage': sentiment_percentages.reindex(summary_order).values,
    'Average_Confidence': avg_probabilities.reindex(summary_order).values
}
summary_df = pd.DataFrame(summary_data)
summary_df.to_csv('sentiment_evaluation_summary.csv', index=False)
print("\nEvaluation summary saved to 'sentiment_evaluation_summary.csv'")

Number of Comments per Sentiment:
sentiment
negative    620
Name: count, dtype: int64

Percentage Distribution of Sentiments:
sentiment
negative    100.0
Name: proportion, dtype: float64

Average Confidence (Probability) per Sentiment:
sentiment
negative    0.654124
Name: predicted_prob, dtype: float32

Sample Comments for Each Sentiment:

Sentiment: positive

Sentiment: negative
- Text: මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?
අහස උසට වාහන වලට බදු ගහල වාහනයක් ගන්න බැරි තරමට ඒක හීනයක් කරල
වාහන බදු වලින් ආදායම් ගන්න බලන් ඉන්න මුන්ව නම් මිනිස්සු වෙන්න බෑ
  Processed Text: මේ වගෙ බුවෙක්ද බන් අනුරයගෙ ආර්තික උපදේශක?
අහස උසට වාහන වලට බදු ගහල වාහනයක් ගන්න බැරි තරමට ඒක හීනයක් කරල
වාහන බදු වලින් ආදායම් ගන්න බලන් ඉන්න මුන්ව නම් මිනිස්සු වෙන්න බෑ
  Confidence: 0.7427
- Text: තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න
  Processed Text: තව අවුරුද නමයක් තියනවා විට්ස් එක ලක්ෂ 12 දෙන්න
  Confidence: 0.8555
- Text: වාහන මිල මොනවා වුනත් ප්
‍
රශ්නයක් නෙවෙයි උන්ට. මොකද උන් දන්නවා වාහනයක් ගන්න තියා එද

In [None]:
import pandas as pd
from sklearn.metrics import classification_report
import csv

# loading the labeled dataset directly
# NOTE: this is the same file the model was fine-tuned on (80% of its rows
# were used for training), so the report below is not a held-out estimate
labeled_data = pd.read_csv('labeled_comments_cleaned.csv')

# checking the columns in labeled_data (for debugging)
print("Columns in labeled_data:", labeled_data.columns)

# mapping the sentiment labels back to integers for comparison
label_map = {'negative': 0, 'neutral': 1, 'positive': 2}
reverse_label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}

# getting the true labels from the labeled dataset
true_labels = labeled_data['label'].map(label_map).values

# applying predictions to the labeled dataset; each text goes through
# preprocess_emojis() first so it is scored in the same form as
# combined_data['processed_text'] (the input used for the real predictions)
sentiments = [
    predict_sentiment(preprocess_emojis(text))[0]
    for text in labeled_data['text']
]

# mapping predicted sentiments to integers
predicted_labels = pd.Series(sentiments).map(label_map).values

# generating the classification report

# labels= pins the class ids (and their order) that target_names refer to:
# negative (0), neutral (1), positive (2). Without it the names are matched
# to whichever ids happen to occur in the data, and the call fails if a
# class is missing from both the true and the predicted labels.
report_options = {
    'labels': [0, 1, 2],
    'target_names': ['negative', 'neutral', 'positive'],
    'zero_division': 0,  # Suppress warnings
}

# text version for display
report = classification_report(
    true_labels,
    predicted_labels,
    output_dict=False,
    **report_options
)

# printing the classification report
print(report)

# dictionary version of the same report, used for saving to CSV
report_dict = classification_report(
    true_labels,
    predicted_labels,
    output_dict=True,
    **report_options
)

# turning the report dictionary into CSV rows
def _metric_row(row_name, metrics):
    # one CSV row holding the four standard metrics of a report entry
    return {
        '': row_name,
        'precision': metrics['precision'],
        'recall': metrics['recall'],
        'f1-score': metrics['f1-score'],
        'support': metrics['support']
    }


# per-class rows first
report_data = [
    _metric_row(label, report_dict[label])
    for label in ['negative', 'neutral', 'positive']
]

# accuracy is a single number: stored under f1-score, with the overall support
report_data.append({
    '': 'accuracy',
    'precision': '',
    'recall': '',
    'f1-score': report_dict['accuracy'],
    'support': report_dict['weighted avg']['support']
})

# then the two averaged rows
report_data.extend(
    _metric_row(average_name, report_dict[average_name])
    for average_name in ('macro avg', 'weighted avg')
)

# writing the rows out as a CSV file
report_df = pd.DataFrame(report_data)
report_df.to_csv('classification_report.csv', index=False)
print("Classification report saved as 'classification_report.csv'")

Columns in labeled_data: Index(['text', 'label'], dtype='object')
              precision    recall  f1-score   support

    negative       0.68      1.00      0.81       131
     neutral       0.00      0.00      0.00        36
    positive       0.00      0.00      0.00        27

    accuracy                           0.68       194
   macro avg       0.23      0.33      0.27       194
weighted avg       0.46      0.68      0.54       194

Classification report saved as 'classification_report.csv'
