<a href="https://colab.research.google.com/github/VDai1999/ConcertTicketSale/blob/main/text_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import packages
import pandas as pd
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer 
from statistics import mean
import string
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import re
import os.path
from os import path

### Data preparation

In [None]:
# Load the official dataset from a CSV file in the current working directory.
# NOTE: the file name contains two consecutive spaces ("Data  - ..."); keep
# them, otherwise the file will not be found.
data = pd.read_csv("Data  - Official data.csv")

In [None]:
# Preview the first 30 rows of the loaded data
data.head(30)

In [None]:
# Show the (rows, columns) dimensions of the loaded data
data.shape

## Add sentiment score columns

In [None]:
## Use the VADER lexicon to generate mean sentiment scores per video.
# For every row of `data`, the comments are read from "<VideoID>.txt" in the
# working directory (one comment per line), cleaned, scored with VADER, and
# the per-file means of the neg / neu / pos / compound scores are stored in
# four new columns. Rows whose file is missing or empty get a score of 0.

# stopwords.words() needs the 'stopwords' corpus; only 'vader_lexicon' is
# downloaded by the import cell, so fetch it here as well.
nltk.download('stopwords')

# Loop-invariant helpers, built once instead of once per line / per file.
tokenizer = RegexpTokenizer(r'\w+')
# A set gives O(1) membership tests and avoids re-reading the corpus for
# every single token.
english_stopwords = set(stopwords.words('english'))
sia = SentimentIntensityAnalyzer()

# Keys of the dict returned by sia.polarity_scores(), in output order.
_SCORE_KEYS = ('neg', 'neu', 'pos', 'compound')
_NO_SCORES = (0, 0, 0, 0)


def _clean_comment(text):
    """Return *text* reduced to its word tokens with English stop words removed.

    No stemming is applied. Matching against the stop-word list is
    case-sensitive, exactly as in the original implementation.

    NOTE(review): the NLTK English stop-word list contains negations such as
    "not" / "no", and the \\w+ tokenizer drops punctuation; VADER uses both as
    cues. Behavior is kept as-is -- confirm that this cleaning is intended.
    """
    tokens = tokenizer.tokenize(text)
    return " ".join(w for w in tokens if w not in english_stopwords)


def _mean_scores_for_file(name):
    """Return the mean (neg, neu, pos, compound) VADER scores for one file.

    Each line of the file *name* is treated as one comment. Returns
    (0, 0, 0, 0) when the file does not exist or contains no lines.
    """
    # if we do not have the file then scores = 0
    if not path.exists(name):
        return _NO_SCORES

    # Explicit encoding so the result does not depend on the platform default.
    with open(name, encoding='utf-8') as f:
        # A blank line still yields an (empty) entry and is scored like any
        # other line, matching the original behavior.
        corpus_cleaned = [_clean_comment(text) for text in f]

    # if the txt file has no lines at all then scores = 0
    if not corpus_cleaned:
        return _NO_SCORES

    scores = [sia.polarity_scores(sentence) for sentence in corpus_cleaned]
    return tuple(mean([ss[key] for ss in scores]) for key in _SCORE_KEYS)


# create the lists for mean sentiment scores
mean_neg_list = []
mean_neu_list = []
mean_pos_list = []
mean_cou_list = []

# Iterate over the column values directly: the original data["VideoID"][i]
# is a label-based lookup and only works with the default 0..n-1 index.
for video_id in data["VideoID"]:
    # use video ID to get the text comments
    mean_neg, mean_neu, mean_pos, mean_cou = _mean_scores_for_file(
        video_id.strip() + '.txt'
    )

    # add mean to the list
    mean_neg_list.append(mean_neg)
    mean_neu_list.append(mean_neu)
    mean_pos_list.append(mean_pos)
    mean_cou_list.append(mean_cou)

# add the lists as new columns to data
data['neg_score'] = mean_neg_list
data['neu_score'] = mean_neu_list
data['pos_score'] = mean_pos_list
data['cou_score'] = mean_cou_list

In [None]:
# Write the data, including the sentiment score columns, to a new CSV file in
# the working directory; index=False leaves the row index out of the file.
data.to_csv(r'Data_withSentiments.csv', index = False)