In [3]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords

In [4]:
df_scores = pd.read_csv("similarity_scores.csv")
df_scores.head()

Unnamed: 0.1,Unnamed: 0,product_name,product_review,similarity_score
0,1,Kentucky Brunch Brand Stout,I didnt think i was going to give it a perfect...,0.035921
1,2,Kentucky Brunch Brand Stout,So I just read a review that called the legend...,0.019702
2,3,Kentucky Brunch Brand Stout,2021 vintage bottle 79\r\n\r\nHoly Fucking Shi...,0.110674
3,4,Kentucky Brunch Brand Stout,Celebrating my buddy Rug with his 1000th beer ...,0.071157
4,5,Kentucky Brunch Brand Stout,Thick and syrupy pour mocha head Aroma is booz...,0.063246


In [5]:
def windowmaker(attr, sent):
    if attr not in sent:
        return None
    attr_idx = sent.index(attr)
    i, j = attr_idx - 2, attr_idx + 2
    if i < 0:
        i = 0
    if j > len(sent):
        j = len(sent)
    return sent[i:j]

In [6]:
stop = stopwords.words('english')

def remove_stopwords(sent):
    new = []
    for w in sent:
        if w not in stop:
            new.append(w)
    return new

In [7]:
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_score(sentence):
    if not sentence:
        return None
    sentence = ' '.join(sentence)
    score = analyser.polarity_scores(sentence)
    return score['compound']

In [8]:
attrs = ['chocolate', 'dark', 'sweet', 'bourbon', 'coffee']

df_scores_filt = df_scores[df_scores['similarity_score'] != 0].copy()

for attr in attrs:
    col_name = f'{attr}_score'
    df_scores_filt[col_name] = df_scores_filt['product_review'].apply(str.lower)
    df_scores_filt[col_name] = df_scores_filt[col_name].apply(lambda r: re.sub(r'[^A-Za-z0-9 ]+', '', r))
    df_scores_filt[col_name] = df_scores_filt[col_name].apply(str.split)
    df_scores_filt[col_name] = df_scores_filt[col_name].apply(remove_stopwords)
    df_scores_filt[col_name] = df_scores_filt[col_name].apply(lambda r: windowmaker(attr, r))
    df_scores_filt[col_name] = df_scores_filt[col_name].apply(lambda r: sentiment_analyzer_score(r))

In [9]:
df_scores_filt.head()

Unnamed: 0.1,Unnamed: 0,product_name,product_review,similarity_score,chocolate_score,dark_score,sweet_score,bourbon_score,coffee_score
0,1,Kentucky Brunch Brand Stout,I didnt think i was going to give it a perfect...,0.035921,,,,,0.0
1,2,Kentucky Brunch Brand Stout,So I just read a review that called the legend...,0.019702,,0.6222,,,0.1779
2,3,Kentucky Brunch Brand Stout,2021 vintage bottle 79\r\n\r\nHoly Fucking Shi...,0.110674,0.0,,,0.0,0.0
3,4,Kentucky Brunch Brand Stout,Celebrating my buddy Rug with his 1000th beer ...,0.071157,0.0,,,,
4,5,Kentucky Brunch Brand Stout,Thick and syrupy pour mocha head Aroma is booz...,0.063246,,,,,


In [10]:
df_scores_filt.describe()

Unnamed: 0.1,Unnamed: 0,similarity_score,chocolate_score,dark_score,sweet_score,bourbon_score,coffee_score
count,989.0,989.0,496.0,471.0,468.0,368.0,347.0
mean,1294.995956,0.114104,0.094939,0.077446,0.473938,0.11204,0.080115
std,762.544032,0.079937,0.215531,0.19577,0.210252,0.221719,0.209191
min,1.0,0.007223,-0.5574,-0.4939,-0.4717,-0.4767,-0.5267
25%,592.0,0.052154,0.0,0.0,0.4588,0.0,0.0
50%,1359.0,0.093026,0.0,0.0,0.4588,0.0,0.0
75%,1982.0,0.156293,0.0,0.0,0.5095,0.1779,0.0
max,2504.0,0.454077,0.8074,0.8074,0.8591,0.875,0.7845


In [11]:
df_brandwise = df_scores_filt[df_scores_filt['similarity_score'] != 0].groupby(by='product_name').mean()

In [12]:
def avg(row):
    scores = [row['chocolate_score'], row['dark_score'], row['sweet_score'], row['bourbon_score'], row['coffee_score']]
    n = 0 # war crime
    avg = 0
    for s in scores:
        if np.isnan(s) or s==0:
            continue
        n += 1
        avg += s
    if n == 0:
        n += 1
    return avg / n

df_brandwise['avg_score'] = df_brandwise.apply(lambda b: avg(b), axis=1)

In [13]:
df_brandwise.head()

Unnamed: 0_level_0,Unnamed: 0,similarity_score,chocolate_score,dark_score,sweet_score,bourbon_score,coffee_score,avg_score
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A Deal With The Devil - Double Oak-Aged,197.75,0.09545,,0.0,0.48415,0.075433,0.0,0.279792
A Deal With The Devil - Triple Oak-Aged,177.5,0.138721,,0.0,0.4588,0.0,,0.4588
Abner,392.0,0.023313,,,0.4588,,,0.4588
Abrasive Ale,1741.0,0.015972,,,0.4588,,,0.4588
Abraxas,2393.7,0.095875,0.079629,0.069675,0.5369,0.2732,0.0602,0.203921


### TASK E- Evaluation Metric

In [24]:
df_brandwise['evaluation_metric'] = (df_brandwise['similarity_score'] + df_brandwise['avg_score'])/2 
df_brandwise.sort_values(by='evaluation_metric', ascending=False)

Unnamed: 0_level_0,Unnamed: 0,similarity_score,chocolate_score,dark_score,sweet_score,bourbon_score,coffee_score,avg_score,evaluation_metric
product_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Black Tuesday - Rum Barrel-Aged,2304.5,0.096538,0.0,0.0,0.796400,,,0.796400,0.446469
Crusher,2215.0,0.051988,,,0.765000,,,0.765000,0.408494
JJJuiceee Machine,1237.0,0.056567,,0.0,0.735100,,,0.735100,0.395834
Doubleganger,557.0,0.054225,,0.0,0.662933,,,0.662933,0.358579
I Will Not Be Afraid,2296.8,0.173847,0.0,0.0,0.542667,,0.0,0.542667,0.358257
...,...,...,...,...,...,...,...,...,...
Lou Pepe - Framboise,753.0,0.042258,,0.0,,,,0.000000,0.021129
Schaarbeekse Kriek,2485.0,0.030643,,0.0,,,,0.000000,0.015321
Congress Street IPA,1777.0,0.026774,,0.0,,,,0.000000,0.013387
Ephraim,943.0,0.017582,,0.0,,,,0.000000,0.008791
