In [26]:
import pickle
import jsonlines
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import tqdm

In [27]:
# Build {artist url -> {"text", "count", "name"}} for every artist in the
# Songkick dump that has at least one review.
artist_reivews = {}
with jsonlines.open('../data_for_558_proj/songkick.jl') as reader:
    for record in reader:
        reviews = record["reviews"]
        if len(reviews) == 0:
            # Artists without reviews are left out entirely.
            continue
        # Concatenate all reviews and collapse every whitespace run to a
        # single space; split/join leaves no leading or trailing whitespace.
        text = " ".join(" ".join(reviews).split())
        artist_reivews[record["url"]] = {
            "text": text,
            "count": len(reviews),
            "name": record["name"],
        }

In [28]:
# Peek at the first artist URL to confirm the key format.
list(artist_reivews)[0]

'https://www.songkick.com/artists/268425-beach-boys'

In [30]:
# Score each artist's concatenated review text with VADER and store the
# resulting polarity dict under "sentiment". This is the slow step of the
# notebook (about 12 minutes for 4971 artists in the recorded run).
sid_obj = SentimentIntensityAnalyzer()
for entry in tqdm.tqdm(artist_reivews.values()):
    entry["sentiment"] = sid_obj.polarity_scores(entry["text"])

100%|██████████| 4971/4971 [12:02<00:00,  6.88it/s]  


In [31]:
# Freeze the artist URLs in dict order so later cells iterate consistently.
urls = list(artist_reivews)

In [33]:
# Keep only the VADER score dict per artist URL (drops text/count/name).
review_sentiment = {
    url: artist_reivews[url]["sentiment"]
    for url in urls
}

In [40]:
# with open("nlp_data/review_sentiment.p", "wb") as f:
#     pickle.dump(review_sentiment, f)

In [32]:
# Sanity check: show the VADER score dict stored for the first artist.
artist_reivews[urls[0]]["sentiment"]

{'neg': 0.027, 'neu': 0.745, 'pos': 0.228, 'compound': 1.0}

In [41]:
# https://github.com/cjhutto/vaderSentiment#about-the-scoring
# Bucket every artist by its VADER compound score and collect the dominant
# polarity component per bucket. The README above defines the cut-offs
# inclusively: compound >= 0.05 is positive, compound <= -0.05 is negative,
# and anything strictly in between is neutral. Any other cell that buckets on
# the compound score must use the same cut-offs.
sentiment = []
pos_scores = []
neg_scores = []
for url in tqdm.tqdm(urls):
    sentiment_scores = review_sentiment[url]
    compound = sentiment_scores["compound"]
    if compound >= 0.05:
        pos_scores.append(sentiment_scores["pos"])
        sentiment.append("pos")
    elif compound <= -0.05:
        neg_scores.append(sentiment_scores["neg"])
        sentiment.append("neg")
    else:
        # Neutral artists contribute to neither score list.
        sentiment.append("n")
        

100%|██████████| 4971/4971 [00:00<00:00, 247535.14it/s]


In [42]:
import numpy as np

In [43]:
# Mean and standard deviation (np.std default, ddof=0) of the "pos" component
# over the artists bucketed as positive.
pos_scores_arr = np.asarray(pos_scores)
pos_mean = pos_scores_arr.mean()
pos_stdev = pos_scores_arr.std()

In [45]:
# Display the mean and standard deviation of the positive-bucket "pos" scores.
pos_mean, pos_stdev

(0.21904841675178757, 0.06681115661824748)

In [46]:
# Mean and standard deviation (np.std default, ddof=0) of the "neg" component
# over the artists bucketed as negative.
neg_scores_arr = np.asarray(neg_scores)
neg_mean = neg_scores_arr.mean()
neg_stdev = neg_scores_arr.std()

In [47]:
# Display the mean and standard deviation of the negative-bucket "neg" scores.
neg_mean, neg_stdev

(0.14885483870967747, 0.12720263828342676)

In [49]:
# Standardise each artist's dominant polarity score against the other artists
# in the same bucket. Cut-offs follow the VADER README and are inclusive:
# compound >= 0.05 is positive, compound <= -0.05 is negative, and anything
# strictly in between is neutral with a z-score of 0. Keep these cut-offs
# identical to the ones used when pos_mean / neg_mean were computed.
z_score_sentiments = {}
for url in tqdm.tqdm(urls):
    sentiment_scores = review_sentiment[url]
    compound = sentiment_scores["compound"]
    if compound >= 0.05:
        z_score_sentiments[url] = {
            "sentiment" : "pos",
            "z-score" : (sentiment_scores["pos"] - pos_mean) / pos_stdev
        }
    elif compound <= -0.05:
        z_score_sentiments[url] = {
            "sentiment" : "neg",
            "z-score" : (sentiment_scores["neg"] - neg_mean) / neg_stdev
        }
    else:
        z_score_sentiments[url] = {
            "sentiment" : "neutral",
            "z-score" : 0
        }

100%|██████████| 4971/4971 [00:00<00:00, 337033.21it/s]


In [50]:
# Preview only the first few entries; echoing the whole dict prints one entry
# per artist and buries the rest of the notebook.
dict(list(z_score_sentiments.items())[:10])

{'https://www.songkick.com/artists/268425-beach-boys': {'sentiment': 'pos',
  'z-score': 0.13398335998523303},
 'https://www.songkick.com/artists/255691-erykah-badu': {'sentiment': 'pos',
  'z-score': -0.7191675639793454},
 'https://www.songkick.com/artists/7061074-becky-g': {'sentiment': 'pos',
  'z-score': -1.018518765370426},
 'https://www.songkick.com/artists/330792-lloyd': {'sentiment': 'pos',
  'z-score': -2.1261182105174226},
 'https://www.songkick.com/artists/44022-slash': {'sentiment': 'pos',
  'z-score': 0.059145559637462916},
 'https://www.songkick.com/artists/172258-tom-jones': {'sentiment': 'pos',
  'z-score': 0.8374586832542716},
 'https://www.songkick.com/artists/402995-martina-mcbride': {'sentiment': 'pos',
  'z-score': -1.078389005648642},
 'https://www.songkick.com/artists/4183826-christine-and-the-queens': {'sentiment': 'pos',
  'z-score': 0.4632696815154211},
 'https://www.songkick.com/artists/492798-red-velvet': {'sentiment': 'pos',
  'z-score': 1.585836686731972},

In [67]:
# For every artist bucketed as positive, collect its z-score and the length
# in characters of its concatenated review text (same artist order in both).
# NOTE(review): despite the name, pos_counts holds character lengths, not the
# stored "count" (number of reviews) -- confirm which one was intended.
pos_z_scores = [
    entry["z-score"]
    for entry in z_score_sentiments.values()
    if entry["sentiment"] == "pos"
]
pos_counts = [
    len(artist_reivews[url]["text"])
    for url, entry in z_score_sentiments.items()
    if entry["sentiment"] == "pos"
]



In [56]:
# Import the submodule explicitly: a bare `import scipy` is not guaranteed to
# load `scipy.stats` on every SciPy release, whereas this form always binds
# `scipy` and makes `scipy.stats` available to the cells below.
import scipy.stats

In [69]:
# Pearson (linear) correlation between positive z-scores and review-text
# length in characters.
scipy.stats.pearsonr(pos_z_scores, pos_counts)

(0.13307577359012054, 8.782759864173692e-21)

In [70]:
# Spearman (rank) correlation between positive z-scores and review-text
# length in characters.
scipy.stats.spearmanr(pos_z_scores, pos_counts)

SpearmanrResult(correlation=0.22934641746320308, pvalue=1.9240549498224333e-59)

In [63]:
# For every artist bucketed as negative, collect its z-score and the length
# in characters of its concatenated review text (same artist order in both).
# NOTE(review): despite the name, neg_counts holds character lengths, not the
# stored "count" (number of reviews) -- confirm which one was intended.
neg_z_scores = [
    entry["z-score"]
    for entry in z_score_sentiments.values()
    if entry["sentiment"] == "neg"
]
neg_counts = [
    len(artist_reivews[url]["text"])
    for url, entry in z_score_sentiments.items()
    if entry["sentiment"] == "neg"
]



In [64]:
# Pearson (linear) correlation between negative z-scores and review-text
# length in characters.
scipy.stats.pearsonr(neg_z_scores, neg_counts)

(-0.011353180119608487, 0.9302116607255821)

In [65]:
# Spearman (rank) correlation between negative z-scores and review-text
# length in characters.
scipy.stats.spearmanr(neg_z_scores, neg_counts)

SpearmanrResult(correlation=-0.31392845122147806, pvalue=0.012964821412427309)

In [73]:
# Every artist is positive, negative or neutral, so this difference equals
# minus the number of artists bucketed as neutral.
len(pos_z_scores) + len(neg_z_scores) - len(z_score_sentiments)

-14

In [1]:
import pickle

# Load the pickled text collection analysed in the cells below.
# Unpickling executes arbitrary code, so only load files produced locally.
with open("nlp_data/text_data.p", "rb") as pickled_file:
    data = pickle.load(pickled_file)

In [4]:
# Length of every entry in the loaded data.
lengths = list(map(len, data))

In [5]:
# Arithmetic mean of the entry lengths.
sum(lengths) / len(lengths)

4786.734258700462

In [7]:
# Length of the longest entry.
max(lengths)

40389

In [8]:
import numpy as np

In [11]:
def print_percentiles(percentiles, data):
    """Print the requested percentiles of ``data``, one per line.

    Args:
        percentiles: Iterable of percentile ranks (0-100, as required by
            ``np.percentile``).
        data: Array-like of numbers accepted by ``np.percentile``.

    Each printed line has the form ``"<rank> <value>"``. Returns ``None``.
    """
    ranks = list(percentiles)
    if not ranks:
        # Nothing requested: skip the NumPy call entirely.
        return
    # One vectorised call covers every rank instead of re-processing
    # ``data`` once per rank.
    values = np.percentile(data, ranks)
    for per, value in zip(ranks, values):
        print("{} {}".format(per, value))

In [12]:
# Show how the entry lengths are distributed, from the low tail to the high tail.
print_percentiles([1,3,5,10,20,25,40,50,60,75,80,85,90,95,98,99], lengths)

1 189.7
3 309.19999999999993
5 557.0
10 1053.0
20 1358.0
25 1564.5
40 2523.0
50 3292.0
60 4250.0
75 6541.5
80 7822.0
85 8960.5
90 10775.0
95 13740.5
98 17714.800000000003
99 20454.500000000004
