In [1]:
import csv
import string
from collections import Counter

import pandas as pd
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

In [2]:
# Build the lexicon lookup table: read the tab-separated (word, emotion,
# association) triples, keep only the rows flagged with association == 1, and
# spread the emotions into one column each with `word` as a regular column.
emolex_df = pd.read_csv(
    '../data/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt',
    sep='\t',
    names=['word', 'emotion','association'],
).loc[lambda frame: frame['association'] == 1]
emolex_words = emolex_df.pivot(
    index='word', columns='emotion', values='association'
).reset_index()

In [3]:
# Built once when the cell runs. The function below is applied to every row of
# the dataset, so reloading the stop-word corpus and rebuilding the punctuation
# table on each call was pure repeated work.
_EMOLEX_STOP_WORDS = frozenset(stopwords.words('english'))
_EMOLEX_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def count_emotions(text):
    """Count the lexicon emotions associated with the words of one text.

    The text is stripped of ASCII punctuation, lower-cased, split on
    whitespace and filtered against the English stop-word list. Lexicon rows
    whose word occurs in the remaining tokens are then summed column-wise.

    Args:
        text: The raw text to analyse (must be a ``str``).

    Returns:
        A pandas Series holding the column-wise sum of the matching
        ``emolex_words`` rows, plus a ``word_count`` entry with the number of
        tokens left after stop-word removal.

    Notes:
        * Matching uses ``isin``, so a lexicon word contributes at most once
          per text no matter how often it is repeated.
        * The sum also runs over the ``word`` column, so the result carries a
          ``word`` entry as well; the notebook drops that column later.
    """
    text = text.translate(_EMOLEX_PUNCT_TABLE).lower()
    words = [word for word in text.split() if word not in _EMOLEX_STOP_WORDS]

    emotions_count = emolex_words[emolex_words.word.isin(words)].sum()
    emotions_count['word_count'] = len(words)

    return emotions_count

In [4]:
# Load the input dataset that the rest of the notebook analyses.
df = pd.read_csv(filepath_or_buffer='../output/large_output/filtered_full.csv')

In [5]:
# Score every text with count_emotions (one result Series per row, which
# pandas expands into a frame) and place those columns beside the original data.
emotions_count = df.text.apply(count_emotions)
emolex = pd.concat(objs=[df, emotions_count], axis='columns')

In [6]:
# Give the unnamed leading CSV column a meaningful name.
emolex = emolex.rename(columns={'Unnamed: 0': 'Record'})

In [7]:
# Re-index the frame on `Record`: the old index is first materialised as an
# `index` column, `Record` becomes the new index, and the temporary `index`
# column is discarded again.
emolex = (
    emolex
    .reset_index()
    .set_index('Record')
    .drop(columns=['index'])
)

In [8]:
# Persist the lexicon-scored frame; the VADER step below reads this file back.
emolex.to_csv(path_or_buf='../output/large_output/emolex.csv')

In [9]:
# Score every saved review with VADER and collect one flat record per review.
sid = SentimentIntensityAnalyzer()
output_rows = []

# pandas writes CSV as UTF-8 by default, so read it back as UTF-8 instead of
# the platform's locale encoding. newline='' is what the csv module requires
# so that line breaks embedded in quoted review text are parsed correctly.
with open('../output/large_output/emolex.csv', 'r', encoding='utf-8', newline='') as file:
    reader = csv.DictReader(file)

    for row in reader:
        text = row['text']
        sentiment_scores = sid.polarity_scores(text)
        output_rows.append({
            'review_id': row['Record'],
            'text': text,
            'stars': row['stars'],
            'compound_sentiment': sentiment_scores['compound'],
            'positive': sentiment_scores['pos'],
            'neutral': sentiment_scores['neu'],
            'negative': sentiment_scores['neg'],
        })

In [10]:
# Turn the collected VADER records into a frame and save it.
vader = pd.DataFrame(data=output_rows)
vader.to_csv(path_or_buf='../output/large_output/vader.csv')

In [11]:
# `text` and `stars` are also present in `emolex`; drop them here before merging.
vader = vader.drop(columns=['text', 'stars'])

In [12]:
# Join the VADER scores onto the lexicon-scored frame.
# review_id was read from a CSV as text, so cast it to int64 before joining.
# NOTE(review): review_id was taken from the `Record` column, but the join key
# on the emolex side is `Unnamed: 0.1` -- confirm both hold the same ids.
# NOTE(review): if the lexicon columns include `positive`/`negative`, pandas
# will suffix the clashing columns with _x (VADER) / _y (lexicon) -- verify.
vader = vader.astype({'review_id': 'int64'})
vader_emolex = (
    vader
    .merge(emolex, left_on='review_id', right_on='Unnamed: 0.1')
    .drop(columns=['Unnamed: 0.1'])
)

In [13]:
# Remove the columns that are not wanted in the exported result.
vader_emolex = vader_emolex.drop(columns=['word', 'business_id'])

In [14]:
# Save the merged scores as CSV.
vader_emolex.to_csv(path_or_buf='../output/large_output/vader_emolex.csv')

In [15]:
# Save the same merged frame as an Excel workbook.
vader_emolex.to_excel(excel_writer='../output/large_output/vader_emolex.xlsx')

In [17]:
# remove stop words from text
stop_words = stopwords.words('english')

# Set view of the list above, built once. The function used to reload the
# corpus into a fresh local set on every call, leaving `stop_words` unused.
_STOP_WORD_SET = frozenset(stop_words)
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_stopwords_punctuation(text):
    """Return `text` lower-cased, without ASCII punctuation or English stop words.

    Punctuation is removed first, the remainder is lower-cased and tokenised
    with NLTK's ``word_tokenize``, and the surviving tokens are joined back
    together with single spaces.

    Args:
        text: The raw text to clean (must be a ``str``).

    Returns:
        The cleaned text as a single space-separated string.
    """
    text = text.translate(_PUNCT_TABLE)
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in _STOP_WORD_SET)

In [18]:
# Clean the text column that already lives in the merged frame, so every
# cleaned string stays on the row it belongs to. Assigning from df['text']
# aligned on index labels, which is only correct if the merge kept every row
# of df in its original order.
# NOTE: re-running this cell cleans already-cleaned text; that should be a
# near no-op, but re-run from the merge cell for a guaranteed clean state.
vader_emolex['text'] = vader_emolex['text'].apply(remove_stopwords_punctuation)

In [18]:
# Overwrite the workbook written earlier so it holds the current frame.
vader_emolex.to_excel(excel_writer='../output/large_output/vader_emolex.xlsx')

In [19]:
# Number of rows per star rating.
counts = vader_emolex.loc[:, 'stars'].value_counts()

In [20]:
# Show the per-rating row counts as plain text.
print(counts)

5    39004
4    28477
1    14235
3    12676
2    10900
Name: stars, dtype: int64


In [21]:
# For each star rating, concatenate all of its texts and keep the 30 most
# frequent words together with their counts.
grouped = vader_emolex.groupby('stars')['text'].apply(' '.join)
results = []
for group in grouped:
    counter = Counter(group.split())
    top_words = dict(counter.most_common(30))
    results.append(top_words)

# create a DataFrame from the results: one row per star rating, one column per
# word. Rows are labelled with the rating itself (previously anonymous 0..4),
# so the table can be read without knowing the group order.
df_results = pd.DataFrame(results, index=grouped.index)


In [22]:
# Inspect the per-rating top-word table.
df_results.head(n=50)

In [23]:
# Remove hand-picked words from the table. errors='ignore' keeps the cell
# working when it is re-run or when one of the words did not make it into any
# rating's top list; the in-place version raised KeyError in both cases.
df_results = df_results.drop(
    columns=['food','place','restaurant','chicken','im','sauce','ive','one'],
    errors='ignore',
)

In [24]:
# Inspect the table again after the column removal.
df_results.head(n=50)

In [25]:
# create a list of words from your dataframe column
# Flatten row by row in plain Python. `str.split(expand=True).stack()` first
# built a frame as wide as the longest text, only to flatten it again.
# Missing texts are skipped, as stack() skipped them before.
words = [
    word
    for text in vader_emolex['text'].dropna()
    for word in text.split()
]

# count the most common words
word_counts = Counter(words).most_common()

In [26]:
# Tabulate the (word, count) pairs.
word_df = pd.DataFrame(
    data=word_counts,
    columns=['word', 'count'],
)

In [27]:
# Keep the 50 rows with the highest counts.
word_top50 = word_df.sort_values(by='count', ascending=False).head(50)

In [28]:
# Words to leave out of the final top-word table.
drop_words = [
    'food',
    'place',
    'one',
    'two',
    'restaurant',
    'menu',
    'ive',
    'im',
    'lunch',
    'table',
]

In [29]:
# Keep only the rows whose word is not in the exclusion list.
filtered_words = word_top50[~word_top50['word'].isin(drop_words)]

In [30]:
# Inspect the remaining top words.
filtered_words.head(n=50)

In [31]:
# Export the filtered top-word table to Excel.
filtered_words.to_excel(excel_writer='../output/large_output/filtered_words.xlsx')