In [1]:
import spacy
import jsonlines
import tqdm
import pickle

In [9]:
# Load the spaCy "en_core_web_md" pipeline once; every review blob is parsed with it below.
nlp = spacy.load("en_core_web_md")

In [10]:
# Build a url -> {"text", "count", "name"} record for every artist that has
# at least one review. Artists without reviews are left out entirely.
artist_reviews = {}
with jsonlines.open('../data_for_558_proj/songkick.jl') as reader:
    for record in reader:
        reviews = record["reviews"]
        if len(reviews) == 0:
            continue
        # Concatenate all reviews into one blob, then collapse every run of
        # whitespace (including newlines) into a single space.
        joined = " ".join(reviews)
        normalized = " ".join(joined.split()).strip()
        artist_reviews[record["url"]] = {
            "text": normalized,
            "count": len(reviews),
            "name": record["name"],
        }

In [11]:
# Parse every review blob with spaCy, replacing each record's raw "text"
# string with the resulting Doc.
#
# Only records whose "text" is still a plain string are parsed, so re-running
# this cell does not feed already-parsed Docs back into the pipeline.
pending_keys = [key for key, record in artist_reviews.items()
                if isinstance(record["text"], str)]
pending_texts = [artist_reviews[key]["text"] for key in pending_keys]

# nlp.pipe processes the texts in batches and yields Docs in input order,
# which is considerably faster than calling nlp(text) once per document.
parsed_docs = nlp.pipe(pending_texts)
for key, doc in tqdm.tqdm(zip(pending_keys, parsed_docs), total=len(pending_keys)):
    artist_reviews[key]["text"] = doc

100%|██████████| 4971/4971 [11:31<00:00,  7.18it/s]


In [12]:
# Freeze the artist URLs in dict order; position i in this list is the row
# index used for every per-artist structure built below.
urls = list(artist_reviews)

In [13]:
# One string per artist (same order as `urls`): the lemmas of every token in
# the parsed review Doc, joined with single spaces.
text_data = [
    " ".join(token.lemma_ for token in artist_reviews[url]["text"])
    for url in tqdm.tqdm(urls)
]

100%|██████████| 4971/4971 [00:03<00:00, 1494.04it/s]


In [18]:
# Cache the lemmatized corpus on disk so the slow spaCy pass above does not
# have to be repeated. The "nlp_data" directory must already exist.
with open("nlp_data/text_data.p", "wb") as pickle_file:
    pickle.dump(text_data, pickle_file)

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [39]:
# TF-IDF over lemma n-grams rather than single words.
vectorizer = TfidfVectorizer(
    lowercase=False,      # keep the casing produced by the lemmatization step
    ngram_range=(2, 3),   # bigrams and trigrams only
    max_df=0.4,           # drop n-grams found in more than 40% of documents
    min_df=0.05,          # drop n-grams found in fewer than 5% of documents
    sublinear_tf=True,    # use 1 + log(tf) instead of the raw term frequency
)

In [40]:
# Learn the n-gram vocabulary and IDF weights from the lemmatized corpus and
# build the sparse document-term matrix (row i corresponds to text_data[i]).
X = vectorizer.fit_transform(text_data)

In [41]:
# Convert to COO format so the parallel .row / .col / .data arrays read by
# sort_coo are available.
X = X.tocoo()

In [42]:
def sort_coo(m):
    """Return the non-zero entries of a COO-style matrix as sorted triples.

    Args:
        m: Any object exposing parallel ``row``, ``col`` and ``data``
            sequences (for example a ``scipy.sparse.coo_matrix``).

    Returns:
        A list of ``(row, col, value)`` tuples ordered by row index
        descending and, within each row, by value descending. The column
        index does not take part in the ordering.
    """
    def row_then_value(entry):
        row_idx, _col_idx, value = entry
        return row_idx, value

    entries = list(zip(m.row, m.col, m.data))
    entries.sort(key=row_then_value, reverse=True)
    return entries

In [43]:
# Flatten the TF-IDF matrix into (row, col, score) triples, grouped by row
# with the highest-scoring n-grams of each row first.
data_X = sort_coo(X)

In [48]:
# Sanity check: the first triple is (row index, feature index, score) for the
# best-scoring n-gram of the highest row index.
data_X[0]

(4970, 480, 0.21028597298407226)

In [50]:
# Accumulator: artist URL -> list of (phrase, score) tuples.
top_words_per_url = {}

In [47]:
# Vocabulary lookup: features[j] is the n-gram behind column j of X.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() and fall back only on older versions.
# The result is kept as a plain list, the type the old method returned.
if hasattr(vectorizer, "get_feature_names_out"):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

In [52]:
# Keep the best-scoring phrases for each artist URL.
#
# data_X is grouped by row with scores descending inside each row, so the
# first MAX_PHRASES_PER_URL triples seen for a row are its top phrases.
MAX_PHRASES_PER_URL = 10

# Rebuilt here so re-running this cell starts from a clean slate; otherwise
# URLs with fewer than MAX_PHRASES_PER_URL phrases would collect duplicates.
top_words_per_url = {}
for row_idx, col_idx, score in tqdm.tqdm(data_X):
    ranked = top_words_per_url.setdefault(urls[row_idx], [])
    if len(ranked) < MAX_PHRASES_PER_URL:
        ranked.append((features[col_idx], score))

100%|██████████| 1067855/1067855 [00:01<00:00, 840961.13it/s]


In [54]:
# Inspect the top-scoring (phrase, score) pairs collected for the first artist URL.
top_words_per_url[urls[0]]

[('while the', 0.09388530864872821),
 ('the second', 0.09125814565047366),
 ('more PRON', 0.08710275870623432),
 ('be hard', 0.08587239883657166),
 ('be nice', 0.08578288080472353),
 ('of the night', 0.08377107353236941),
 ('be when', 0.08340835026810528),
 ('would have be', 0.08316959838334816),
 ('concert PRON be', 0.0763129316016321),
 ('the first', 0.0761978791187876)]