### Data preparation

In [30]:
import pandas as pd

# Read the official dataset CSV into the working DataFrame used by later cells.
data = pd.read_csv(filepath_or_buffer="Data  - Official data.csv")

In [31]:
# Preview up to the first 30 rows to sanity-check the loaded columns and values.
data.head(30)

Unnamed: 0,VideoID,"Concert year (2018, 2019 or 2020) (song released before 2018 -> 2018)",Artist,Song Name,View,Like,Dislike,danceability,energy,speechiness,acousticness,instrumentalness,liveness,valence,tempo,type,duration_ms,popularity
0,U9BwWKXjVaI,2018,Drake,Nice For What,348637112,2353239,130665,0.889,0.496,0.091,0.259,0.0,0.252,0.544,86.0,audio_features,180522,84
1,S1gp0m4B5p8,2019,Drake,Going Bad feat. Drake (Official Video),109266381,1221814,40336,0.834,0.454,0.201,0.321,0.0,0.114,0.837,82.0,audio_features,247059,99
2,xWggTb45brM,2020,Drake,Toosie Slide,59212131,1370881,49691,0.787,0.673,0.158,0.256,0.0,0.09,0.786,111.0,audio_features,237893,97
3,pok8H_KF1FA,2020,Doja Cat,Say So (Official Video),85600520,2206882,36219,0.777,0.658,0.054,0.077,0.0,0.09,0.633,110.0,audio_features,190360,83
4,AqAJLh9wuZ0,2020,Taylor Swift,The Man (Official Video),37774922,1297372,142309,0.777,0.658,0.054,0.077,0.0,0.09,0.633,110.0,audio_features,190360,83
5,ba7mB8oueCY,2019,Post Malone,"""Goodbyes"" ft. Young Thug (Rated R)",135262821,2783359,64973,0.548,0.653,0.081,0.456,0.0,0.103,0.163,150.0,audio_features,174853,87
6,UceaB4D0jpo,2018,Post Malone,rockstar ft. 21 Savage,662705945,8520437,261554,0.585,0.52,0.071,0.124,0.0,0.131,0.129,160.0,audio_features,218147,88
7,Pkh8UtuejGw,2019,Shawn Mendes,SeÃ±orita,998397606,15306458,447814,0.759,0.54,0.029,0.037,0.0,0.095,0.75,117.0,audio_features,190960,89
8,xpVfcZ0ZcFM,2018,Drake,God's Plan,1130103938,11480794,431209,0.754,0.449,0.109,0.033,0.0,0.552,0.357,77.0,audio_features,198973,85
9,DRS_PpOrUZ4,2018,Drake,In My Feelings,220295948,3657786,175288,0.835,0.626,0.125,0.059,0.0,0.396,0.35,91.0,audio_features,217925,83


In [32]:
# Show the (rows, columns) dimensions of the loaded DataFrame.
data.shape

(162, 18)

## Add sentiment score columns

In [33]:
## Use Vader lexicon to generate sentiment scores
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from statistics import mean
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import os.path
from os import path

# Compute one mean VADER score per video (negative, neutral, positive and
# compound) from the comment file "<VideoID>.txt" in the working directory,
# then attach the four means to `data` as new columns.
#
# Videos without a comment file, or with an empty one, get 0 for all four
# scores so that every row of `data` still receives a value.
#
# NOTE(review): comments are tokenised with r'\w+' and English stopwords are
# removed before scoring. That strips punctuation and negations such as
# "not"/"no", which VADER normally uses as sentiment cues. The cleaning is kept
# unchanged here so the scores stay comparable with earlier runs -- confirm
# whether scoring the raw comment text would be more appropriate.

# The stopwords corpus is a separate NLTK download from vader_lexicon; fetch it
# on demand so the cell also works in a fresh environment.
try:
    english_stopwords = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    english_stopwords = set(stopwords.words('english'))

# Loop-invariant helpers: build them once instead of once per line / per video.
tokenizer = RegexpTokenizer(r'\w+')
sia = SentimentIntensityAnalyzer()

# per-video mean sentiment scores, in the same order as the rows of `data`
mean_neg_list = []
mean_neu_list = []
mean_pos_list = []
mean_cou_list = []
score_lists = (mean_neg_list, mean_neu_list, mean_pos_list, mean_cou_list)

for video_id in data["VideoID"]:
    # the comments of each video live in a text file named after its video ID
    name = video_id.strip() + '.txt'

    # no comment file for this video -> all scores are 0
    if not path.exists(name):
        for score_list in score_lists:
            score_list.append(0)
        continue

    ## Data cleaning for sentiment analysis (without stemming)
    corpus_cleaned = []

    # Assumes the comment files are UTF-8 encoded -- TODO confirm. Undecodable
    # bytes are replaced rather than aborting the whole run.
    with open(name, encoding='utf-8', errors='replace') as f:
        for text in f:
            tokens = tokenizer.tokenize(text)
            # set membership keeps the same (case-sensitive) filter as before
            filtered_words = [w for w in tokens if w not in english_stopwords]
            corpus_cleaned.append(" ".join(filtered_words))

    # an empty file yields no lines at all -> all scores are 0
    if len(corpus_cleaned) == 0:
        for score_list in score_lists:
            score_list.append(0)
        continue

    # score every cleaned comment line
    senScore_neg = []
    senScore_pos = []
    senScore_neu = []
    senScore_cou = []

    for sentence in corpus_cleaned:
        ss = sia.polarity_scores(sentence)
        senScore_neg.append(ss['neg'])
        senScore_pos.append(ss['pos'])
        senScore_neu.append(ss['neu'])
        senScore_cou.append(ss['compound'])

    # average over all comment lines of this video
    mean_neg_list.append(mean(senScore_neg))
    mean_neu_list.append(mean(senScore_neu))
    mean_pos_list.append(mean(senScore_pos))
    mean_cou_list.append(mean(senScore_cou))

# add the lists as new columns to data
data['neg_score'] = mean_neg_list
data['neu_score'] = mean_neu_list
data['pos_score'] = mean_pos_list
data['cou_score'] = mean_cou_list

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/phuongho/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [35]:
# Write the DataFrame to a CSV file, leaving the row index out of the output.
data.to_csv(path_or_buf='Data_withSentiments.csv', index=False)