# Libraries

In [None]:
!pip install google_play_scraper
!pip install sklearn

import pandas as pd
from google_play_scraper.features.reviews import Sort, reviews_all, reviews
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# Reviews data extraction

In [293]:
result = reviews_all('com.bt.bms',sleep_milliseconds=0,lang='en', country='us')

# Create dataframe of the reviews

In [294]:
df = pd.DataFrame(result)

In [295]:
df.head(2)

Unnamed: 0,at,content,repliedAt,replyContent,reviewCreatedVersion,reviewId,score,thumbsUpCount,userImage,userName
0,2020-07-30 00:09:15,Good App for booking movie tickets.,2020-07-30 06:02:40,"Thanks for the 5 star rating, we much apprecia...",7.0.0,lg:AOqpTOHGjnDzrMs6OFCzH5ifiZgAN8HrxLvxB7WTw2w...,5,1,https://lh3.googleusercontent.com/EGemoI2NTXmT...,A Google user
1,2020-07-29 15:20:00,are you serious? the app gave me a notificatio...,2020-07-29 16:15:30,"Hi Anurag, we didn't mean to disappoint you an...",5.1.3,gp:AOqpTOEMfJKXVKHVdwddDi3Yelseu8EqVwljqMW53MY...,1,0,https://lh3.googleusercontent.com/a-/AOh14GgLn...,Anurag Gopi


In [296]:
print(f'Total textual reviews: {len(result)} \n')

unique_users  = len(df['userName'].unique())
unknown_users = len(df[df['userName']=='A Google user'])
total_reviews = len(df)

print(f'Total unique users : {unique_users}')
print(f'Total unknown users: {unknown_users}')
print(f'Total users who gave multiple reviews: {total_reviews - unique_users - unknown_users}\n')


mean = df['score'].mean()
print(f'Average rating for this app based on the textual reviews: {round(mean,2)} \n')

Total textual reviews: 233202 

Total unique users : 179630
Total unknown users: 28231
Total users who gave multiple reviews: 25341

Average rating for this app based on the textual reviews: 3.99 



# Extract all reviews with rating below 4

In [403]:
df_tm = df[df['score']<=3]
df_tm = df_tm[df_tm.content.str.len()>=30]
print(f'Remaining textual reviews: {len(df_tm)} \n')

Remaining textual reviews: 37996 



# Get the relevant columns for topic modelling

In [406]:
df_tm = df_tm[['reviewId','content']].drop_duplicates()
df_tm.dropna(inplace=True)
df_tm = df_tm.reset_index().drop(columns='index')
print(f'Remaining textual reviews: {len(df_tm)} \n')

Remaining textual reviews: 37996 



# Create document term matrix of the reviews

In [409]:
cv = CountVectorizer(max_df=0.95, min_df=2, stop_words='english')

# max_df : discard words that occur more than 95% documents
# min_df : include only those words that occur atleast in 2 documents

In [410]:
dtm = cv.fit_transform(df_tm['content'])

In [468]:
dtm
#shows 8839 terms and 37996 articles

<37996x8839 sparse matrix of type '<class 'numpy.int64'>'
	with 381346 stored elements in Compressed Sparse Row format>

In [412]:
len(cv.get_feature_names())

8839

# Using LDA for topic modelling

In [413]:
LDA = LatentDirichletAllocation(n_components=5,random_state=1)

LDA.fit(dtm)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=5, n_jobs=None,
                          perp_tol=0.1, random_state=1, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

# Extract data

In [471]:
for index,topic in enumerate(LDA.components_):
    print(f'topic #{index} : ')
    print([cv.get_feature_names()[i] for i in topic.argsort()[-20:]])

topic #0 : 
['rating', 'convenience', 'charging', 'people', 'book', 'movie', 'extra', 'fees', 'fee', 'tickets', 'offers', 'good', 'charge', 'ticket', 'booking', 'high', 'app', 'handling', 'charges', 'internet']
topic #1 : 
['able', 'card', 'fix', 'problem', 'offers', 'unable', 'shows', 'tried', 'doesn', 'work', 'times', 'book', 'payment', 'try', 'offer', 'open', 'working', 'error', 'time', 'app']
topic #2 : 
['download', 'don', 'like', 'updated', 'slow', 'previous', 'bad', 'phone', 'old', 'user', 'hai', 'need', 'latest', 'good', 'better', 'worst', 'new', 'version', 'update', 'app']
topic #3 : 
['care', 'deducted', 'didn', 'account', 'transaction', 'time', 'movie', 'service', 'payment', 'got', 'refund', 'worst', 'customer', 'booking', 'book', 'app', 'booked', 'ticket', 'money', 'tickets']
topic #4 : 
['add', 'cancellation', 'cinema', 'able', 'theatre', 'good', 'seat', 'ticket', 'location', 'shows', 'movies', 'showing', 'booking', 'available', 'seats', 'option', 'movie', 'book', 'tickets

0. App/OverallExp
1. Delivery-CommitmentIssue
2. FoodQuality
3. Offers
4. App/Coupons
5. Delivery-RestaurantIssue
6. CustomerSupport
7. Competitors
8. Refund-ChatSupport
9. Refund-Cancellation

In [416]:
topic_results = LDA.transform(dtm)

In [417]:
topic_results

array([[0.01712729, 0.0174859 , 0.25014487, 0.01756706, 0.69767487],
       [0.75641496, 0.02506515, 0.16804106, 0.025339  , 0.02513982],
       [0.02508926, 0.02534895, 0.89876985, 0.02542457, 0.02536738],
       ...,
       [0.02609256, 0.02558532, 0.02587528, 0.17860603, 0.74384082],
       [0.01726216, 0.54357687, 0.01717501, 0.0174214 , 0.40456456],
       [0.01007858, 0.34969724, 0.37805311, 0.10358898, 0.15858208]])

In [456]:

df_topic_results = pd.DataFrame(topic_results, columns=[
'0_InternetCharges',
'1_Payment/Offers' ,
'2_App'            ,
'3_Booking-Refund/Ticket'  ,
'4_Booking-Location/language' 
])

In [457]:
df_topic_results.head(3)

Unnamed: 0,0_InternetCharges,1_Payment/Offers,2_App,3_Booking-Refund/Ticket,4_Booking-Location/language
0,0.017127,0.017486,0.250145,0.017567,0.697675
1,0.756415,0.025065,0.168041,0.025339,0.02514
2,0.025089,0.025349,0.89877,0.025425,0.025367


In [458]:
df_result = pd.merge(df_tm,df_topic_results,  how='inner', left_index=True, right_index=True )

In [459]:
df_result.drop(columns='reviewId').iloc[175,]['content']

'Extremely regressive booking process.'

In [464]:
df_output = pd.merge(df, df_result,  how='left', on=[ 'reviewId','content' ])

In [465]:
len(df_output)

233202

In [466]:
df_output.to_csv('app_reviews_bms.csv')