In [1]:
import re
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score
from torch.nn.functional import softmax

# Load the CSV file (assuming the file has a 'label' column with ground truth labels)
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/cleaned_comments_set4.csv')
tot_count = len(df['all_eng'])

# Initialize tokenizer and model for BERT Multilingual Sentiment Analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name)

comment_list = []
sentiment_list = []

# Set model to evaluation mode (disable dropout)
model.eval()

# Preprocessing function to clean text
def preprocess_text(text):
    # Remove emojis, punctuation, and special characters (keep commas)
    text = re.sub(r'[^\w\s,]', '', text)

    # Remove brackets and content inside them
    text = re.sub(r'\[.*?\]', '', text)  # Removes square brackets and the content inside them
    text = re.sub(r'\(.*?\)', '', text)  # Removes parentheses and the content inside them

    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Function to classify a comment
def classify_comment(cleaned_text):
    # Tokenize and predict
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding="max_length", max_length=512)
    with torch.no_grad():
        logits = model(**inputs).logits

    # Apply softmax to get probabilities for each class
    probabilities = softmax(logits, dim=1).squeeze().tolist()

    # Get predicted sentiment class (0 to 4) and label
    predicted_class_id = logits.argmax().item()

    # Map the prediction to sentiment labels
    if predicted_class_id <= 1:
        sentiment_label = "NEGATIVE"
    elif predicted_class_id == 2:
        sentiment_label = "NEUTRAL"
    else:
        sentiment_label = "POSITIVE"

    return sentiment_label, probabilities

# Lists to hold predicted labels and true labels
predicted_labels = []
true_labels = df['label'].tolist() if 'label' in df.columns else None  # Ground truth labels, if available

# Sentiment count dictionary to summarize sentiment distribution
sentiment_counts = {'NEGATIVE': 0, 'NEUTRAL': 0, 'POSITIVE': 0}

# Loop through each comment in the DataFrame, preprocess, classify, and gather results
for comment in df['all_eng']:
    # Preprocess the comment
    cleaned_comment = preprocess_text(comment)

    # Classify the preprocessed comment
    predicted_label, probabilities = classify_comment(cleaned_comment)
    predicted_labels.append(predicted_label)

    # Count the sentiment type for summary
    sentiment_counts[predicted_label] += 1

    #for final csv file
    comment_list += [comment]
    sentiment_list += [predicted_label]

    # Optional: Print the original, cleaned comment, and predicted sentiment
    print(f"Original Comment: {comment}")
    print(f"Cleaned Comment: {cleaned_comment}")
    print(f"Predicted Sentiment: {predicted_label}")
    print()

# Calculate sentiment percentages for each category
total_comments = len(df['all_eng'])
percent_negative = (sentiment_counts['NEGATIVE'] / total_comments) * 100
percent_neutral = (sentiment_counts['NEUTRAL'] / total_comments) * 100
percent_positive = (sentiment_counts['POSITIVE'] / total_comments) * 100

# Print the overall sentiment summary
print(f"Overall Sentiment of Comments:")
print(f"Negative: {percent_negative:.2f}%")
print(f"Neutral: {percent_neutral:.2f}%")
print(f"Positive: {percent_positive:.2f}%")

# Calculate accuracy if true labels are provided
if true_labels is not None:
    accuracy = accuracy_score(true_labels, predicted_labels)
    print(f"Accuracy of the model: {accuracy * 100:.2f}%")
else:
    print("No ground truth labels found for accuracy calculation.")

dict = {'Cleaned_Comment': comment_list,
        'Predicted_Sentiment': sentiment_list,

}

df_new = pd.DataFrame(dict)
df_new.to_csv('final_output_bert_set4.csv', index=False)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Cleaned Comment: bapu ka badla liya train se nikalne ka final se nikal kar bhai
Predicted Sentiment: NEGATIVE

Original Comment: mai bhi
Cleaned Comment: mai bhi
Predicted Sentiment: NEGATIVE

Original Comment: as a indian south africa world cup jeetna deserve karta tha very sad for south africa
Cleaned Comment: as a indian south africa world cup jeetna deserve karta tha very sad for south africa
Predicted Sentiment: NEGATIVE

Original Comment: f man kya catch the
Cleaned Comment: f man kya catch the
Predicted Sentiment: NEGATIVE

Original Comment: congratulations time india congratulations hitman
Cleaned Comment: congratulations time india congratulations hitman
Predicted Sentiment: POSITIVE

Original Comment: aaj mera janmadin hai but no one wish me
Cleaned Comment: aaj mera janmadin hai but no one wish me
Predicted Sentiment: NEUTRAL

Original Comment: aj inda ne pure afghanistan walo ka badla be le liya or world cup b

In [2]:
df_new.to_csv('/content/drive/MyDrive/Colab Notebooks/bert/bert_2/final_output_bert_set4', index=False)

df_new

Unnamed: 0,Cleaned_Comment,Predicted_Sentiment
0,watching today,POSITIVE
1,sa donated the cup to india these sa players c...,NEGATIVE
2,i am crying,POSITIVE
3,uff the commentary is at its best,POSITIVE
4,forever india,POSITIVE
...,...,...
14095,cack de india,NEUTRAL
14096,love you sarr,POSITIVE
14097,virat is best,POSITIVE
14098,virat retire,NEGATIVE
