In [1]:
#!pip install vaderSentiment
#!pip install -U sentence-transformers

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [2]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# input sample
text = "I absolutely love this video! It's so inspiring and well-made :)"

# get score
scores = analyzer.polarity_scores(text)

# output score
print("Text:", text)
print("VADER Scores:", scores)

# classify by compound 
compound = scores['compound']
if compound >= 0.05:
    sentiment = "Positive 😊"
elif compound <= -0.05:
    sentiment = "Negative 😠"
else:
    sentiment = "Neutral 😐"

print("Overall Sentiment:", sentiment)

Text: I absolutely love this video! It's so inspiring and well-made :)
VADER Scores: {'neg': 0.0, 'neu': 0.412, 'pos': 0.588, 'compound': 0.9081}
Overall Sentiment: Positive 😊


In [None]:
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# read CSV
df = pd.read_csv("CAvideos.csv")

# initialize Vader
analyzer = SentimentIntensityAnalyzer()

# add new cols
df['title_compound'] = 0.0
df['description_compound'] = 0.0

for i in range(len(df)):
    # Title 
    title = str(df.loc[i, 'title'])
    title_score = analyzer.polarity_scores(title)
    df.loc[i, 'title_compound'] = title_score['compound']

    # Description 
    description = str(df.loc[i, 'description'])
    desc_score = analyzer.polarity_scores(description)
    df.loc[i, 'description_compound'] = desc_score['compound']

# tags str to list
df['tags'] = df['tags'].astype(str).apply(lambda x: x.split('|'))

df[['title', 'title_compound', 'description_compound', 'tags']].head()

Unnamed: 0,title,title_compound,description_compound,tags
0,Eminem - Walk On Water (Audio) ft. Beyoncé,0.0,0.6369,"[Eminem, ""Walk"", ""On"", ""Water"", ""Aftermath/Sha..."
1,PLUSH - Bad Unboxing Fan Mail,-0.296,0.7783,"[plush, ""bad unboxing"", ""unboxing"", ""fan mail""..."
2,"Racist Superman | Rudy Mancuso, King Bach & Le...",-0.6124,0.7365,"[racist superman, ""rudy"", ""mancuso"", ""king"", ""..."
3,I Dare You: GOING BALD!?,0.0,0.929,"[ryan, ""higa"", ""higatv"", ""nigahiga"", ""i dare y..."
4,Ed Sheeran - Perfect (Official Music Video),0.5719,0.4404,"[edsheeran, ""ed sheeran"", ""acoustic"", ""live"", ..."


In [15]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# load Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')  # dimension =  384

# make sure str format
df['title'] = df['title'].astype(str)
df['description'] = df['description'].astype(str)

# embedding 
title_embeddings = model.encode(df['title'].tolist(), show_progress_bar=True)
desc_embeddings = model.encode(df['description'].tolist(), show_progress_bar=True)

# add to df
df['title_embedding'] = list(title_embeddings)
df['description_embedding'] = list(desc_embeddings)

print(df[['title', 'title_embedding', 'description_embedding']].head(2))

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Batches:   0%|          | 0/1278 [00:00<?, ?it/s]

Batches:   0%|          | 0/1278 [00:00<?, ?it/s]

                                        title  \
0  Eminem - Walk On Water (Audio) ft. Beyoncé   
1               PLUSH - Bad Unboxing Fan Mail   

                                     title_embedding  \
0  [-0.06277154, -0.04639821, 0.048019543, 0.0320...   
1  [-0.036631104, -0.0015310794, 0.110469095, -0....   

                               description_embedding  
0  [-0.04607288, -0.13656281, 0.06220317, -0.0064...  
1  [-0.10279074, -0.13369378, 0.041275885, -0.114...  


In [16]:
import ast

# turn embedding vector to str for csv to store "[0.1, 0.2, 0.3, ...]"）
df['title_embedding'] = df['title_embedding'].apply(lambda x: str(x))
df['description_embedding'] = df['description_embedding'].apply(lambda x: str(x))

# save to new CSV file
df.to_csv("CAVideos_with_embeddings.csv", index=False)

In [17]:
df.head()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description,title_compound,description_compound,title_embedding,description_embedding
0,n1WpP7iowLc,17.14.11,Eminem - Walk On Water (Audio) ft. Beyoncé,EminemVEVO,10,2017-11-10T17:00:03.000Z,"[Eminem, ""Walk"", ""On"", ""Water"", ""Aftermath/Sha...",17158579,787425,43420,125882,https://i.ytimg.com/vi/n1WpP7iowLc/default.jpg,False,False,False,Eminem's new track Walk on Water ft. Beyoncé i...,0.0,0.6369,[-6.27715364e-02 -4.63982113e-02 4.80195433e-...,[-4.60728817e-02 -1.36562809e-01 6.22031689e-...
1,0dBIkQ4Mz1M,17.14.11,PLUSH - Bad Unboxing Fan Mail,iDubbbzTV,23,2017-11-13T17:00:00.000Z,"[plush, ""bad unboxing"", ""unboxing"", ""fan mail""...",1014651,127794,1688,13030,https://i.ytimg.com/vi/0dBIkQ4Mz1M/default.jpg,False,False,False,STill got a lot of packages. Probably will las...,-0.296,0.7783,[-3.66311036e-02 -1.53107941e-03 1.10469095e-...,[-1.02790743e-01 -1.33693784e-01 4.12758850e-...
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"[racist superman, ""rudy"", ""mancuso"", ""king"", ""...",3191434,146035,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...,-0.6124,0.7365,[-1.01845309e-01 -2.69313413e-03 -1.27036691e-...,[-1.19398125e-01 -1.17875852e-01 -7.25160632e-...
3,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"[ryan, ""higa"", ""higatv"", ""nigahiga"", ""i dare y...",2095828,132239,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...,0.0,0.929,[-2.44291220e-02 3.73215340e-02 6.12568110e-...,[-1.42253369e-01 -2.20152866e-02 -6.84852079e-...
4,2Vv-BfVoq4g,17.14.11,Ed Sheeran - Perfect (Official Music Video),Ed Sheeran,10,2017-11-09T11:04:14.000Z,"[edsheeran, ""ed sheeran"", ""acoustic"", ""live"", ...",33523622,1634130,21082,85067,https://i.ytimg.com/vi/2Vv-BfVoq4g/default.jpg,False,False,False,🎧: https://ad.gt/yt-perfect\n💰: https://atlant...,0.5719,0.4404,[-1.84795614e-02 2.95374449e-03 1.03636540e-...,[-1.37768984e-01 -2.78132148e-02 4.75002490e-...


In [None]:
import pandas as pd
import ast

# read CSV
df = pd.read_csv("CAVideos_with_embeddings.csv")

# Convert string embeddings to lists (restore to vectors)
df['title_embedding'] = df['title_embedding'].apply(ast.literal_eval)
df['description_embedding'] = df['description_embedding'].apply(ast.literal_eval)