In [None]:
!pip install transformers torch
from transformers import pipeline

# Load multilingual sentiment analysis model
sentiment_pipeline = pipeline("sentiment-analysis", model="cardiffnlp/twitter-xlm-roberta-base-sentiment")


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Device set to use cpu


In [None]:
import pandas as pd

# Load dataset
file_path = "/content/translated_comments_dataset.csv"  # Change if needed
df = pd.read_csv(file_path)

# Ensure text data is string. Missing comments are filled with "" first:
# astype(str) alone would turn NaN into the literal text "nan", which would
# then be scored by the model as if it were a real comment.
df["translated_comments"] = df["translated_comments"].fillna("").astype(str)

# Function for sentiment analysis
def predict_multilingual_sentiment(text):
    """Return the sentiment label the pipeline assigns to ``text``.

    Args:
        text: Comment text to classify.

    Returns:
        The ``label`` field of the pipeline's first prediction, or
        ``"neutral"`` when ``text`` is not a string or is blank.
    """
    # Lower-case fallback so blank rows land in the same bucket as the
    # model's own "neutral" label instead of a separate "Neutral" one.
    if not isinstance(text, str) or not text.strip():
        return "neutral"

    # Let the tokenizer truncate by tokens. Slicing the string to 512
    # characters throws away text that would have fit and can still overflow
    # the model when characters map to one token each.
    max_length = 512  # Token limit passed to the tokenizer
    result = sentiment_pipeline(text, truncation=True, max_length=max_length)[0]
    return result["label"]

# Label every translated comment by passing it through predict_multilingual_sentiment
df["multilingual_sentiment"] = df["translated_comments"].map(predict_multilingual_sentiment)


In [None]:
# Share of each predicted sentiment label across all rows, as a percentage
sentiment_distribution = df["multilingual_sentiment"].value_counts(normalize=True).mul(100)
print(sentiment_distribution)


multilingual_sentiment
positive    47.196871
neutral     35.984355
negative    16.818774
Name: proportion, dtype: float64


In [None]:
# Show the column names of the working DataFrame
print(df.columns)


Index(['video_id', 'title', 'description', 'published_at', 'channel_id',
       'channel_title', 'category_id', 'tags', 'duration', 'definition',
       'caption', 'view_count', 'like_count', 'dislike_count',
       'favorite_count', 'comment_count', 'category_name', 'comments',
       'fetch_date', 'translated_comments', 'multilingual_sentiment'],
      dtype='object')


In [None]:
# Replace missing values (NaN) in the category/sentiment table with 0.
# NOTE(review): within the visible notebook, category_sentiment_distribution
# is first assigned in a later cell, so this cell would raise NameError on a
# fresh Restart & Run All. Run it after that cell, or move it below it.
category_sentiment_distribution = category_sentiment_distribution.fillna(0)


In [None]:
# Calculate sentiment distribution for each category, as row percentages.
# fill_value=0 makes a label that never occurs in a category show as 0
# instead of NaN (e.g. a category whose comments all share one label).
category_sentiment_distribution = (
    df.groupby("category_name")["multilingual_sentiment"]
    .value_counts(normalize=True)
    .unstack(fill_value=0)
    * 100
)

# Display the category-wise sentiment analysis
category_sentiment_distribution


multilingual_sentiment,negative,neutral,positive
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Autos & Vehicles,,100.0,
Comedy,19.444444,33.333333,47.222222
Education,7.142857,78.571429,14.285714
Entertainment,21.574344,32.944606,45.48105
Film & Animation,12.5,50.0,37.5
Gaming,7.5,60.0,32.5
Howto & Style,9.302326,30.232558,60.465116
Music,3.658537,24.390244,71.95122
News & Politics,48.148148,25.925926,25.925926
People & Blogs,9.090909,39.393939,51.515152


In [None]:
# Count the rows that have no category label
missing_categories = df["category_name"].isnull().sum()
print(f"Missing category labels: {missing_categories}")


Missing category labels: 0


In [None]:
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import numpy as np

# Load the sentiment model and tokenizer
# Both are loaded by name via from_pretrained and are read as module-level
# globals by analyze_sentiment below.
model_name = "cardiffnlp/twitter-xlm-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Function for sentiment analysis
def analyze_sentiment(text):
    """Classify ``text`` as "negative", "neutral" or "positive".

    String input is tokenized (truncated to 512 tokens), run through the
    model without gradient tracking, and mapped to the label with the highest
    softmax score. Anything that is not a string (e.g. NaN) is not scored and
    gets the default "neutral".
    """
    # Guard clause: non-string values never reach the tokenizer
    if not isinstance(text, str):
        return "neutral"  # Default label; change here if another fallback is preferred

    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        logits = model(**encoded).logits
    probabilities = logits.softmax(dim=-1).numpy()[0]
    class_names = ("negative", "neutral", "positive")
    return class_names[np.argmax(probabilities)]

# Load both inputs up front so a missing file fails immediately, before the
# per-comment model inference below has been run.
file_path = "/content/cleaned_translated_comments.csv"
original_file_path = "/content/translated_comments_dataset.csv"  # Update to the original dataset
df = pd.read_csv(file_path)
df_original = pd.read_csv(original_file_path)

# Run sentiment analysis on each comment. The column is passed as-is (no
# astype(str)) so missing values stay NaN and take analyze_sentiment's
# non-string fallback instead of being scored as the literal text "nan".
df["sentiment"] = df["cleaned_comments"].apply(analyze_sentiment)

# Print some sample results
print("\nSAMPLE SENTIMENT RESULTS:")
print(df[["cleaned_comments", "sentiment"]].head(10))

# Keep one sentiment per distinct comment text: merging on free text with
# repeated comments on both sides would otherwise multiply rows and inflate
# the category counts. Only the key and the label are taken from the cleaned
# frame so its other columns cannot clash with same-named columns in the
# original dataset (presumably it may repeat some of them; TODO confirm).
sentiment_lookup = df[["cleaned_comments", "sentiment"]].drop_duplicates(subset="cleaned_comments")

# Merge sentiment results with original dataset using 'translated_comments' as the key
df_merged = df_original.merge(
    sentiment_lookup,
    left_on="translated_comments",
    right_on="cleaned_comments",
    how="inner",
    validate="many_to_one",  # asserts the lookup really has unique keys
)

# Category-wise sentiment counts; absent label/category pairs are reported as 0
category_sentiment = df_merged.groupby("category_name")["sentiment"].value_counts().unstack(fill_value=0)

print("\nCATEGORY-WISE SENTIMENT ANALYSIS:")
print(category_sentiment)

# Save the results
output_file = "sentiment_analysis_results2.csv"
df_merged.to_csv(output_file, index=False)
print(f"\nSentiment analysis completed. Results saved to {output_file}")


SAMPLE SENTIMENT RESULTS:
                                    cleaned_comments sentiment
0  In the past, politics is less and more sociall...  negative
1  India US & UK Tour 2025 Tickets Link: https://...   neutral
2                                       Awesome orki  positive
3  Such me bro, kya njrana pesh kiya he, mja aagy...  positive
4  Akash bhai was not expected from you. ... that...  negative
5                             Kyu Nhi Hoi stand up 😢  negative
6                   Bcci ko selector badalna chahiye  negative
7               Rohit se batting me bumrah achha hai  positive
8  Rohit besharm ko bhagao virat aur pant vi nika...  negative
9  these reepentitative collapses Drop Non Perfor...  negative


FileNotFoundError: [Errno 2] No such file or directory: '/content/translated_comments_dataset.csv'