In [12]:
import pandas as pd
import re
import string
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [13]:
# Load the comments table from comments.csv in the working directory.
# Later cells read its comment_1 ... comment_20 columns.
df = pd.read_csv('comments.csv')

In [14]:
# Built once instead of on every call: clean_text runs once per comment cell,
# and none of these depend on the input text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGITS_RE = re.compile(r'\d+')
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stopwords as a set, loading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def clean_text(text):
    """Normalise a comment string for downstream scoring.

    Steps, in order: strip ASCII punctuation (``string.punctuation``), remove
    runs of digits, lower-case, tokenize with ``word_tokenize``, drop English
    stopwords, and re-join the remaining tokens with single spaces.

    Args:
        text: The comment as a ``str``.

    Returns:
        The cleaned text as a single space-separated string (possibly empty).
    """
    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)

    # Remove digits
    text = _DIGITS_RE.sub('', text)

    # Convert to lowercase (before the stopword check, which is case-sensitive)
    text = text.lower()

    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = _english_stop_words()
    tokens = [token for token in tokens if token not in stop_words]

    # Join the tokens back into a string
    return ' '.join(tokens)

_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def get_sentiment_polarity(comment):
    """Return the VADER compound sentiment score for one comment.

    The value is converted with ``str()`` first, so non-string cells (e.g. a
    missing value) are scored as their string representation rather than
    raising. The text is then passed through ``clean_text`` before scoring.

    Args:
        comment: The raw comment cell; any type accepted by ``str()``.

    Returns:
        The ``"compound"`` entry of ``polarity_scores`` for the cleaned text.
    """
    # NOTE(review): clean_text strips punctuation and stopwords before VADER
    # sees the text; presumably this also removes negations such as "not",
    # which VADER uses as cues - confirm whether scoring the raw text is
    # preferable.
    clean_comment = clean_text(str(comment))

    # Perform sentiment analysis using VADER. The analyzer is reused across
    # calls instead of being rebuilt for every comment.
    scores = _get_vader_analyzer().polarity_scores(clean_comment)

    return scores["compound"]

# Replace every comment_N text column with its sentiment score. Columns are
# discovered by name rather than assuming exactly 20 of them exist.
# NOTE: this overwrites the comment text in df; re-running this cell without
# reloading df would score the numeric scores themselves.
comment_cols = [col for col in df.columns if col.startswith("comment_")]
for col_name in comment_cols:
    df[col_name] = df[col_name].apply(get_sentiment_polarity)

# Calculate the mean sentiment polarity for each video.
# Only the comment columns are averaged: a positional slice such as
# df.iloc[:, 1:] would also pull in the string video_id column (an error on
# pandas >= 2.0) and, on a re-run, mean_sentiment_polarity itself.
df["mean_sentiment_polarity"] = df[comment_cols].mean(axis=1)

In [15]:
# Show the frame: comment_N columns now hold sentiment scores, plus the
# per-video mean_sentiment_polarity column.
df

Unnamed: 0,video_id,comment_1,comment_2,comment_3,comment_4,comment_5,comment_6,comment_7,comment_8,comment_9,...,comment_12,comment_13,comment_14,comment_15,comment_16,comment_17,comment_18,comment_19,comment_20,mean_sentiment_polarity
0,mBewRJ9CASE,0.4939,0.0,0.6249,0.0,0.2732,0.4019,0.6962,0.5719,0.1027,...,0.5719,0.2584,0.886,0.0516,-0.7505,0.6369,0.6369,0.0,0.8249,0.321575
1,CokI98aEmeU,0.0,0.4404,-0.0772,0.0,0.0,-0.163,0.4404,0.5859,0.34,...,-0.6597,-0.4767,0.4588,0.0,0.4215,0.5859,-0.431,0.0,0.0,0.09434
2,XzCoVvz8RDA,0.6467,-0.128,0.9081,0.0,0.926,0.7506,0.0,0.1531,0.0,...,0.8271,0.7906,0.5994,0.836,0.4404,0.9136,0.0,0.5574,0.836,0.45285
3,ONS2urrUe6Q,0.4767,0.6249,0.0,0.8979,0.4588,0.5859,0.5994,0.2732,0.4404,...,0.802,0.3089,0.0,0.0,0.4404,0.0,0.4767,0.0,0.4404,0.38081
4,GkoKZUBvC9E,0.1109,-0.8961,0.7003,0.3818,-0.7506,0.6369,0.2732,-0.296,-0.4767,...,0.0,0.802,0.5696,-0.4588,0.7506,0.5719,0.9382,-0.5256,-0.1144,0.13417
5,mpAs79hemw0,0.0,0.3818,0.4588,0.0,0.0,0.0,0.5859,0.0,0.0,...,0.4215,0.7506,0.8689,0.9823,0.0,0.0,0.8225,0.7717,0.5994,0.378795
6,kxvM3fMg2jY,-0.34,0.5542,0.296,0.9393,0.7579,0.0,0.743,0.0,0.918,...,-0.0772,0.9081,0.9118,0.9403,0.7964,0.5849,0.4404,0.9517,0.8176,0.55922
7,W0gkMq4mKaY,-0.3477,0.6369,0.2732,0.931,0.7906,0.6124,0.3612,0.9423,0.959,...,-0.3182,0.3818,0.8885,0.5106,0.0,0.7003,0.8779,0.5542,0.8225,0.54487
8,F81ZM-ZG6-g,0.6369,0.4019,0.5859,-0.4767,0.936,-0.3612,0.4767,0.8979,0.0,...,0.34,0.6705,0.5994,-0.3612,0.25,0.8402,-0.5267,0.0,0.0,0.259975
9,hWQy6xucRXI,0.0,0.4215,0.6808,-0.4019,0.7184,0.0,0.8481,0.8316,0.6486,...,0.8779,0.6597,-0.3356,0.7184,0.7269,-0.2444,0.7845,0.886,0.6124,0.493105


In [16]:
# Average of the per-video mean polarity across all rows of df.
df['mean_sentiment_polarity'].mean()

0.3484092105263158

# NER (named-entity recognition) starts here

In [17]:
# Reload the raw comments: the sentiment cell replaced the comment_N text
# columns of df with numeric scores, and NER needs the original text.
df = pd.read_csv('comments.csv')

import spacy
from collections import Counter

# earlier
def perform_ner(df, top_n=5):
    """Run spaCy NER over each row's comments and record the top entities.

    For every row, all non-missing ``comment_*`` cells are joined into one
    text and passed through the ``en_core_web_sm`` pipeline. Two columns are
    then added to ``df`` in place:

    * ``ner_results`` - list of ``(entity_text, entity_label)`` tuples.
    * ``most_common_words`` - up to ``top_n`` most frequent entity texts
      (lower-cased), most frequent first; an empty list when the row has no
      entities.

    Args:
        df: DataFrame whose text columns are named ``comment_*``.
        top_n: Maximum number of entities kept per row (default 5).

    Returns:
        The same ``df`` object, with the two columns added.
    """
    nlp = spacy.load('en_core_web_sm')

    comment_cols = [c for c in df.columns if c.startswith('comment_')]

    # One text per row. Missing cells are skipped: str(NaN) would otherwise
    # inject the literal word "nan" into the text handed to the NER model.
    texts = [
        ' '.join(str(row[c]) for c in comment_cols if pd.notna(row[c]))
        for _, row in df.iterrows()
    ]

    # nlp.pipe processes the texts as a batch and yields docs in input order.
    ner_results = [
        [(ent.text, ent.label_) for ent in doc.ents]
        for doc in nlp.pipe(texts)
    ]

    # Add a new column to the DataFrame with the NER results
    df['ner_results'] = ner_results

    # Most frequent entity texts per row, compared case-insensitively.
    # Counter.most_common on an empty Counter returns [], so rows without
    # entities get an empty list rather than a value of a different type.
    most_common_words = []
    for entities in ner_results:
        counts = Counter(text.lower() for text, _label in entities)
        most_common_words.append(
            [word for word, _count in counts.most_common(top_n)]
        )

    # Add a new column to the DataFrame with the most common entities
    df['most_common_words'] = most_common_words

    return df


# Add the ner_results and most_common_words columns to df.
df = perform_ner(df)

# Preview the first two rows.
df.head(2)

Unnamed: 0,video_id,comment_1,comment_2,comment_3,comment_4,comment_5,comment_6,comment_7,comment_8,comment_9,...,comment_13,comment_14,comment_15,comment_16,comment_17,comment_18,comment_19,comment_20,ner_results,most_common_words
0,mBewRJ9CASE,Tuchel with the 10 pts in the first league mat...,What an assist by Cucurella!,Not a Chelsea fan but it was nice to see ngolo...,had no patience with Tuchel ❌\npatience with P...,Mudryk finishing and ball control in the last ...,Definitely Dibu the player of the match! Chels...,I was in favour of us hiring Potter (though di...,What an assist from Cucurella and what a chip ...,be grateful for Tuchel if not his points at be...,...,You can't argue now that he doesn't have to go,Thanks for posting this. The chances were many...,"Players have to improve their finishing, not a...","In the midst of my sorrow and disbelief, at le...",Am not a Chelsea fan but its good to see kante...,Cucurella's header is the best assist I've see...,Mudryk bottled it in two different occasions. ...,I don't think the manager is the problem when ...,"[(10, CARDINAL), (first, ORDINAL), (Cucurella,...","[chelsea, cucurella, first, mudryk, kante]"
1,CokI98aEmeU,We must shift our attention to the massive gam...,We lost this game but I love Chelsea Women tea...,"When against Lyon, we were the lucky one. On t...",I know Emma has been trying to replace Magda; ...,Time to bounce back blues,The back position needs some change and people...,Better days are coming iron lions 🦁,No Reiten..No win..,Hard lucky blues,...,"The backline including the goalkeeper , poor",So sweet,CITY !! 🩵🩵,The goal keeper's kick needs to improve \nIf s...,Well get them next time definitely,The ladies didn't play well at all,We will be back.,Sayang sekali kalah dari city,"[(Lyon, PERSON), (this Thursday, DATE), (Stamf...","[lyon, this thursday, stamford bridge, chelsea..."


In [19]:
# Top entities for the row labelled 1, fetched with a single label-based lookup.
df.loc[1, 'most_common_words']

['lyon', 'this thursday', 'stamford bridge', 'chelsea women', 'emma']