<a href="https://colab.research.google.com/github/aviralwalia08/youtubecomments/blob/main/Comparing_Algos_for_Sentiment_analysis_YouTube_comments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax
import nltk
nltk.downloader.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\avira\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Reading Data File

In [None]:
# Load the dataset. NOTE: pd.read_pickle unpickles the file, which can execute
# arbitrary code -- only load pickle files that come from a trusted source.
df = pd.read_pickle('lowest_grossing_movies.pkl')
# Print (rows, columns) as a quick sanity check of the load.
print(df.shape)


(111965, 6)


In [None]:
df.movie_name.unique()

array(['Amsterdam', 'The Last Duel', 'West Side Story', 'Mortal Engines',
       'Pan', 'Jupiter Ascending', 'Strange World', 'Moonfall', 'Cats',
       'Monster Truck'], dtype=object)

# Cleaning the YouTube Comments (Text) Column

In [None]:
def clean_data(df, column):
    '''
    Clean a text column and drop the rows that have no letters left afterwards.

    df: Dataframe containing uncleaned column
    column : Text column that you want to be cleaned

    Returns a new DataFrame holding only the rows whose cleaned text still
    contains at least one ASCII letter.

    Side effect: df[column] itself is overwritten with the cleaned text, i.e.
    the frame passed in is modified in place; only the row filtering is
    returned as a new object.

    Missing values (None / NaN) are treated as empty text, so those rows are
    dropped instead of surviving as the literal strings 'None' / 'nan'.
    '''
    # str() on a missing value yields 'nan' / 'None', which contain letters and
    # would pass the "has letters" filter below as if they were real comments.
    raw = df[column]
    text = raw.astype(object).where(raw.notna(), '').astype(str)

    # Keep only ASCII letters, digits, whitespace and the punctuation
    # !@$%^&,.? ; every other character (emoji, other symbols, ...) is removed.
    pattern = r'[^a-zA-Z0-9!@$%^&,.?\s]'
    text = text.apply(lambda x: re.sub(pattern, '', x))

    # removing new line character
    text = text.str.replace('\n', ' ', regex=False)

    df[column] = text

    # This pattern matches strings without a single letter: empty strings, or
    # strings made up only of digits, whitespace and special characters.
    pattern_2 = r'^[^a-zA-Z]*$'

    # Remove rows where the column's value matches the pattern
    has_no_letters = text.apply(lambda x: bool(re.match(pattern_2, x))).astype(bool)
    return df[~has_no_letters]

In [None]:
# Clean the 'text' column and keep only the rows that still contain letters.
# Note: clean_data also overwrites df['text'] with the cleaned text in place.
cleaned_movies_df = clean_data(df, 'text')

In [None]:
# NOTE(review): this resets the index of the original `df`, not of
# `cleaned_movies_df` (the frame that actually had rows removed) -- confirm
# which one was intended.
df=df.reset_index(drop=True)

In [None]:
df.head()

Unnamed: 0,author,updated_at,like_count,text,public,movie_name
0,@IDreamElectricSheep,2023-12-11T10:36:29Z,0,20th century fox couldnt even higher a proper ...,True,Amsterdam
1,@lmcgready,2023-12-10T18:20:54Z,0,SUPERB!!!!,True,Amsterdam
2,@nmarks,2023-12-08T18:12:28Z,1,Recommended. The movie comes into sharp focus ...,True,Amsterdam
3,@user-qk7ny7xy9r,2023-12-08T12:50:59Z,0,nice,True,Amsterdam
4,@lakshanchamod1208,2023-12-02T12:17:35Z,0,,True,Amsterdam


In [None]:
df.shape

(111965, 6)

# Subsetting 700 Comments for each video (Randomly)

In [None]:
def subsetting_df(cleaned_movies_df, number_of_comments, random_state=None):
    '''
    Draw a random sample of comments for every movie.

    cleaned_movies_df : Dataframe containing all the movies and comments
                        (needs a 'movie_name' column)
    number_of_comments : Number of comments of each movie you want to subset
    random_state : Optional seed (int) or numpy Generator. When given, the
                   sample is reproducible; when None (default) the draw is
                   random on every call, as before.

    Returns a DataFrame with number_of_comments rows per movie, movies in
    order of first appearance, keeping the original index labels.

    Rows are drawn WITH replacement, so the same comment can appear more than
    once and a movie with fewer comments than requested still yields the full
    number of rows. Rows whose movie_name is missing are ignored.
    '''
    # One shared generator so that a single seed fixes the draw for all movies.
    rng = None if random_state is None else np.random.default_rng(random_state)

    # Sample from the frame that was passed in (not from a notebook global),
    # and collect the pieces so they are concatenated once instead of growing
    # a DataFrame inside the loop.
    samples = [
        movie_rows.sample(number_of_comments, replace=True, random_state=rng)
        for _, movie_rows in cleaned_movies_df.groupby('movie_name', sort=False)
    ]

    if not samples:
        # Nothing to sample from: return an empty frame with the same columns.
        return cleaned_movies_df.iloc[0:0].copy()
    return pd.concat(samples)


In [None]:
# Sample 700 comments per movie. NOTE: this rebinds `df`, so from here on `df`
# is the sampled subset rather than the full dataset loaded above.
df = subsetting_df(cleaned_movies_df,700)

# Number of sampled comments per movie.
df.movie_name.value_counts()

movie_name
Amsterdam            700
The Last Duel        700
West Side Story      700
Mortal Engines       700
Pan                  700
Jupiter Ascending    700
Strange World        700
Moonfall             700
Cats                 700
Monster Truck        700
Name: count, dtype: int64

In [None]:
# Drop the old index first, then call reset_index() again so that the fresh
# 0..n-1 positions become a regular column named 'index'.
df = df.reset_index(drop =True).reset_index()

In [None]:
# 'Id' is the per-comment key that sentiment_analyser reads from each row and
# uses to join the scores back onto the comments.
df = df.rename(columns={'index': 'Id'})

In [None]:
df.head()

Unnamed: 0,Id,author,updated_at,like_count,text,public,movie_name
0,0,@klararusiti4043,2022-07-08T03:03:10Z,1,105 THATS TIMMY THATS MY TIMMY!,True,Amsterdam
1,1,@ztswift,2022-07-06T18:22:14Z,1,THE CAST,True,Amsterdam
2,2,@MrWatchowtnow,2022-09-24T20:21:43Z,2,Man that forced diversity rule really ruins th...,True,Amsterdam
3,3,@jaydeeppatil1488,2022-07-07T16:52:13Z,1,Movies nowadays are getting so bad its beyond ...,True,Amsterdam
4,4,@DragonboltBlastter,2023-09-23T22:01:53Z,1,"This movie is not realistic enough, needs more...",True,Amsterdam


# Using NLTK (VADER) & Transformer (RoBERTa)

In [None]:
# Hugging Face checkpoint id; the tokenizer and the classification model are
# both loaded from this same checkpoint so that they match each other.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [None]:
def polarity_scores_roberta(example, max_length=512):
    '''
    Score one piece of text with the RoBERTa sentiment model.

    example : Text to score
    max_length : Maximum number of tokens passed to the model; longer inputs
                 are truncated to this length (default 512)

    Returns a dict with the keys 'roberta_neg', 'roberta_neu' and
    'roberta_pos', holding the softmax of the model's output scores at
    positions 0, 1 and 2 respectively.

    Uses the module-level `tokenizer` and `model`.
    '''
    import torch  # local import: only needed for the no_grad context below

    # Truncate explicitly: without a length limit an over-long comment is fed
    # to the model as-is and fails inside the forward pass (presumably the
    # source of the "Broke for id ..." RuntimeErrors -- TODO confirm).
    encoded_text = tokenizer(
        example,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Inference only: no gradients are needed, so skip building the graph.
    with torch.no_grad():
        output = model(**encoded_text)

    # output[0] holds the raw scores for the batch; [0] picks the single input.
    scores = softmax(output[0][0].detach().numpy())
    return {
        'roberta_neg' : scores[0],
        'roberta_neu' : scores[1],
        'roberta_pos' : scores[2]
    }

In [None]:
def sentiment_analyser(df):
    '''
    Score every comment in df with both VADER (NLTK) and RoBERTa.

    df : Dataframe with a 'text' column (the comment) and an 'Id' column
         (the key the scores are stored under and joined back on)

    Returns a DataFrame with one row per successfully scored comment: 'Id',
    the VADER scores (each key prefixed with 'vader_'), the 'roberta_*'
    scores, and then the remaining columns of df.

    Comments for which scoring raises RuntimeError are reported with a
    'Broke for id ...' message and are left out of the result.
    '''
    sia = SentimentIntensityAnalyzer()
    res = {}
    # Walk the two needed columns directly: iterrows() builds a whole Series
    # per row and does not preserve column dtypes across the row. Binding the
    # id before the try block also guarantees the error message can use it.
    for myid, text in tqdm(zip(df['Id'], df['text']), total=len(df)):
        try:
            # using SIA from NLTK
            vader_result = {
                f"vader_{key}": value
                for key, value in sia.polarity_scores(text).items()
            }
            # using roberta from Cardiff transformer
            roberta_result = polarity_scores_roberta(text)
        except RuntimeError:
            print(f'Broke for id {myid}')
            continue
        res[myid] = {**vader_result, **roberta_result}

    # res maps Id -> {score name -> value}; transpose so each Id is one row.
    scores_df = pd.DataFrame(res).T
    scores_df = scores_df.reset_index().rename(columns={'index': 'Id'})

    # Left join from the scores: only comments that were scored are kept.
    # The join key is spelled out instead of relying on "all common columns".
    results_df = scores_df.merge(df, on='Id', how='left')
    return results_df

In [None]:
def vader_sentiment_colADD(vader_df, vader_threshold):
    '''
    Add a 'vader_sentiment' label column derived from 'vader_compound'.

    vader_df : subset of vader data from result_df (needs 'vader_compound')
    vader_threshold : threshold used to segregate the three categories
                      [pos, neg, neu]

    Labelling rule:
        compound >  vader_threshold  -> 'positive'
        compound < -vader_threshold  -> 'negative'
        anything else                -> 'neutral' (this includes values
                                        exactly at the threshold and missing
                                        values)

    Returns a labelled copy; the frame passed in is left untouched, so it is
    safe to pass a column subset of another DataFrame without triggering
    SettingWithCopyWarning.
    '''
    compound = vader_df['vader_compound']

    # np.select takes the first matching condition. 'negative' is listed first
    # so that, should both conditions hold (only possible with a negative
    # threshold), it wins -- the same precedence the label had before.
    labels = np.select(
        [compound < -vader_threshold, compound > vader_threshold],
        ['negative', 'positive'],
        default='neutral',
    )

    labelled_df = vader_df.copy()
    labelled_df['vader_sentiment'] = labels
    return labelled_df


In [None]:
def roberta_sentiment_colADD(roberta_df):
    '''
    Add 'Max_Column' and 'roberta_sentiment' columns from the RoBERTa scores.

    roberta_df : subset of roberta data from result_df (needs 'roberta_neg',
                 'roberta_neu' and 'roberta_pos')

    'Max_Column' is the name of the score column with the highest value in
    each row. 'roberta_sentiment' is 'negative' when that column is
    'roberta_neg', 'neutral' when it is 'roberta_neu', and 'positive' for
    every remaining row.

    Returns a labelled copy; the frame passed in is left untouched, so it is
    safe to pass a column subset of another DataFrame without triggering
    SettingWithCopyWarning.
    '''
    score_columns = ['roberta_neg', 'roberta_neu', 'roberta_pos']
    labelled_df = roberta_df.copy()

    labelled_df['Max_Column'] = labelled_df[score_columns].idxmax(axis=1)

    # Only the two non-positive outcomes are mapped explicitly; whatever is
    # left unmapped falls through to 'positive'.
    labelled_df['roberta_sentiment'] = (
        labelled_df['Max_Column']
        .map({'roberta_neg': 'negative', 'roberta_neu': 'neutral'})
        .fillna('positive')
    )
    return labelled_df

In [None]:
def algo_result(algo_df, algo_name):
    '''
    Summarise each movie by its most frequent sentiment label.

    algo_df : df after adding the sentiment column to any of the algo
              (needs 'movie_name' and '<algo_name>_sentiment')
    algo_name : 'roberta' OR 'vader' - can contain value only from this

    Returns a DataFrame with the columns 'Movie' and 'Sentiment', one row per
    movie in order of first appearance. When several labels are tied for most
    frequent, the first one returned by Series.mode() is used. A movie whose
    labels are all missing gets None.
    '''
    sentiment_column = f'{algo_name}_sentiment'

    # Movie names come from the frame that was passed in, so the function
    # works for either algorithm without depending on a notebook global.
    algo_dic = {}
    for movie, movie_rows in algo_df.groupby('movie_name', sort=False):
        modes = movie_rows[sentiment_column].mode()
        # mode() is empty when the group has no non-missing labels.
        algo_dic[movie] = modes.iloc[0] if not modes.empty else None

    return pd.DataFrame({
        'Movie': list(algo_dic.keys()),
        'Sentiment': list(algo_dic.values()),
    })



In [None]:
# Score every comment in `df` with both models. Ids printed as
# "Broke for id ..." raised a RuntimeError during scoring and are therefore
# missing from results_df.
results_df = sentiment_analyser(df)

  0%|          | 0/7000 [00:00<?, ?it/s]

Broke for id 2746
Broke for id 3963
Broke for id 4096
Broke for id 4564


In [None]:
# Take an explicit copy of the VADER columns so that adding the label column
# never writes into a slice of results_df (avoids SettingWithCopyWarning).
vader_df = results_df[['movie_name','text','vader_neg','vader_neu','vader_pos','vader_compound']].copy()
# 0.1 is the cut-off on the compound score: above +0.1 is labelled positive,
# below -0.1 negative, everything in between neutral.
vader_df = vader_sentiment_colADD(vader_df, 0.1)

In [None]:
# Take an explicit copy of the RoBERTa columns so that adding the label
# columns never writes into a slice of results_df (avoids
# SettingWithCopyWarning).
roberta_df = results_df[['movie_name','text','roberta_neg','roberta_neu','roberta_pos']].copy()

roberta_df = roberta_sentiment_colADD(roberta_df)

# Results from VADER (NLTK)

In [None]:
# Most frequent VADER sentiment label per movie.
vader_movie_result = algo_result(vader_df,'vader')
vader_movie_result

Unnamed: 0,Movie,Sentiment
0,Amsterdam,positive
1,The Last Duel,positive
2,West Side Story,positive
3,Mortal Engines,neutral
4,Pan,positive
5,Jupiter Ascending,positive
6,Strange World,positive
7,Moonfall,positive
8,Cats,neutral
9,Monster Truck,positive


# Results from RoBERTa (CardiffNLP) - Using Transformer

In [None]:
# Most frequent RoBERTa sentiment label per movie.
roberta_movie_result = algo_result(roberta_df,'roberta')
roberta_movie_result

Unnamed: 0,Movie,Sentiment
0,Amsterdam,neutral
1,The Last Duel,neutral
2,West Side Story,positive
3,Mortal Engines,neutral
4,Pan,neutral
5,Jupiter Ascending,negative
6,Strange World,negative
7,Moonfall,negative
8,Cats,negative
9,Monster Truck,neutral
