/
AppReviews.py
190 lines (162 loc) · 6.15 KB
/
AppReviews.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
import nltk
import numpy as np
import pandas as pd
import requests
def single_country_reviews(
    appID,
    country='us',
    # Internal parameters
    page=1,
    df=None):
    """
    Gets pandas DataFrame of app store reviews based on an ID
    Recursively descends pages on itself to get all reviews for a country
    ARGUMENTS
    ----------
    appID: int
        apple ID of the app. Generally 9 digit number
        Can be found in the url of a store page after "/id="
    country: string of length 2
        apple country ID (e.g. 'us', 'gb')
    RETURNS:
    ---------
    Pandas DataFrame formatted:
        [app_name, title, version, rating, review, vote_count]
    INTERNAL ARGUMENTS:
    -------------------
    page: int
        Page number to start the request at
        Looks recursively until it cannot find a page
    df: pandas DataFrame or None
        dataframe from previous page. final df is appended from each page.
        Defaults to None (fresh accumulator) rather than a DataFrame
        instance, to avoid the shared mutable-default-argument pitfall.
    """
    if df is None:
        df = pd.DataFrame()
    url = ('https://itunes.apple.com/' + country
           + '/rss/customerreviews/id=%s/page=%d/sortby=mostrecent/json'
           % (appID, page))
    req = requests.get(url)
    try:
        data = req.json().get('feed')
    except ValueError:
        # Body was not JSON: we are past the last page. Stop recursing.
        return df.reset_index(drop=True)
    try:
        df_index = np.arange(len(data.get('entry')))
    except (AttributeError, TypeError):
        # 'feed' was missing or had no 'entry' list: no more reviews.
        return df.reset_index(drop=True)
    csvTitles = ['app_name', 'title', 'version', 'rating', 'review', 'vote_count']
    page_df = pd.DataFrame(index=df_index, columns=csvTitles)
    entry_index = -1  # DataFrame Index
    try:
        for entry in data.get('entry'):
            # There's app info @ top of JSON (not a review entry)
            if entry.get('im:name'):
                page_df.app_name = entry.get('im:name').get('label')
                continue
            entry_index += 1
            # .loc[row, col] avoids chained-assignment (col.loc[row] = ...)
            page_df.loc[entry_index, 'title'] = entry.get('title').get('label')
            page_df.loc[entry_index, 'version'] = entry.get('im:version').get('label')
            page_df.loc[entry_index, 'rating'] = entry.get('im:rating').get('label')
            page_df.loc[entry_index, 'review'] = entry.get('content').get('label')
            page_df.loc[entry_index, 'vote_count'] = entry.get('im:voteCount').get('label')
    except AttributeError as ae:
        # Unexpected feed shape: dump the payload before failing loudly.
        print(data, "\n\n")
        raise(ae)
    # Clean up returned values
    page_df.dropna(axis=0, how='all', inplace=True)
    # fillna returns a new frame; the original discarded this result,
    # so the blank-filling silently never happened. Assign it back.
    page_df = page_df.fillna({'title': '', 'review': '', 'version': ''})
    page_df.rating = pd.to_numeric(page_df.rating,
                                   downcast='unsigned',
                                   errors='coerce')
    page_df.vote_count = pd.to_numeric(page_df.vote_count,
                                       downcast='unsigned',
                                       errors='coerce')
    # DataFrame.append was removed in pandas 2.0; pd.concat is the
    # backward-compatible replacement. Recurse onto the next page.
    return single_country_reviews(
        appID,
        country=country,
        page=page + 1,
        df=pd.concat([df, page_df]).reset_index(drop=True))
def get_reviews(
    appID,
    list_countries=('us', 'gb', 'ca', 'au', 'ie', 'nz')):
    """
    Gets single pandas dataframe from multiple countries
    ARGUMENTS
    ----------
    appID: int
        apple ID of the app. Generally 9 digit number
        Can be found in the url of a store page after "/id="
    list_countries: iterable of apple country ID strings, or a single
        string (wrapped automatically). Default is a tuple rather than a
        list to avoid the mutable-default-argument pitfall.
    RETURNS:
    ---------
    Pandas DataFrame formatted:
        [app_name, title, version, rating, review, vote_count]
    """
    # isinstance (not type ==) also accepts str subclasses
    if isinstance(list_countries, str):
        list_countries = [list_countries]
    df = pd.DataFrame()
    for country in list_countries:
        # DataFrame.append was removed in pandas 2.0; use pd.concat
        df = pd.concat(
            [df, single_country_reviews(appID, country=country)]
        ).reset_index(drop=True)
    return df
def break_sentence(review):
    """
    Given a text, returns adjectives and related noun
    Text is possibly multiple sentences
    ARGUMENTS
    -----------
    review:
        single string. Body of text to tokenize
    RETURNS
    ---------
    list of strings.
        Each string is a n-gram with a single noun and multiple qualifiers
        n is variable (multiple adjectives can modify the same noun)
    """
    returned_token_list = list()
    sentences = nltk.sent_tokenize(review)
    for sent in sentences:
        sentence_structure = dict()
        # word_tokenize was never imported as a bare name; only the nltk
        # module is imported, so it must be called as nltk.word_tokenize.
        words = nltk.pos_tag(nltk.word_tokenize(sent))
        # enumerate gives each token its true position; words.index(tok)
        # returned the FIRST occurrence, collapsing duplicate tokens.
        # note: adjectives is ordered by sentence
        adjectives = [i for i, (_, tag) in enumerate(words)
                      if tag in ('JJ', 'JJR', 'JJS', 'VBP')]
        nouns = [i for i, (_, tag) in enumerate(words)
                 if tag in ('NN', 'NNP', 'NNPS', 'NNS')]
        # Guard: argmin on an empty array raises ValueError, and with no
        # adjectives there is nothing to attach.
        if not nouns or not adjectives:
            continue
        noun_positions = np.array(nouns)
        # np.abs finds the genuinely closest noun; the raw signed
        # difference picked the most-negative value, i.e. the FARTHEST
        # noun occurring after the adjective.
        closest_nouns = [np.abs(adj - noun_positions).argmin()
                         for adj in adjectives]
        for pos in range(len(adjectives)):
            sentence_structure.setdefault(closest_nouns[pos], [])
            sentence_structure[closest_nouns[pos]].append(adjectives[pos])
        for noun in sentence_structure:
            returned_token_list.append(
                ' '.join([words[adj][0] for adj in sentence_structure[noun]])
                + ' '
                + words[nouns[noun]][0])
    return returned_token_list
def outputTopics(reviews, n_topics=20, excluded_words=(), n_top_words=13):
    """
    Print topic top words
    Uses LDA to get topic clusters
    outputs top words from each
    ARGUMENTS
    ----------
    reviews: iterable of document strings
    n_topics: int, number of LDA topics to fit
    excluded_words: iterable of words to drop from the printed lists.
        Default is a tuple, not a list (mutable-default pitfall).
    n_top_words: int, number of top words printed per topic
    RETURNS
    --------
    None. Output is printed to stdout.
    """
    # The module never imported sklearn, so the original raised NameError
    # at call time; import locally where it is actually used.
    import sklearn.decomposition
    from sklearn.feature_extraction.text import CountVectorizer

    word_count_model = CountVectorizer(stop_words='english')
    word_bag2 = word_count_model.fit_transform(reviews)
    try:
        vocab = word_count_model.get_feature_names_out()  # sklearn >= 1.0
    except AttributeError:
        vocab = word_count_model.get_feature_names()      # older sklearn
    model = sklearn.decomposition.LatentDirichletAllocation(
        n_components=n_topics,  # 'n_topics' kwarg was removed in sklearn 0.21
        learning_method='batch',
        random_state=1)
    model.fit(word_bag2)
    topic_word = model.components_
    # Output top topic words
    for i, topic_dist in enumerate(topic_word):
        # argsort is ascending, so walk backwards from the end. The stop
        # must be -(n_top_words + 1): the original -(n_top_words) was off
        # by one and printed only n_top_words - 1 words.
        topic_words = list(
            np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words + 1):-1])
        for drop_word in excluded_words:
            # A drop word may simply be absent from this topic's list.
            try:
                topic_words.remove(drop_word)
            except ValueError:
                pass
        print('Topic {}: {}'.format(i, ' '.join(topic_words)))