In [None]:
# Running through the list of appIds and building two datasets
# First - monthly aggregates for each app:
# - Total number of reviews
# - Number of developer replies
# - Average rating
# - Average rating weighted by thumbsUp count (each review's rating counted once per thumb up it received)
# Second - linear regression coefficients (trend over time) for
# 'reviews', 'rating' and 'thumbsUpRating'
# Each dataset is saved into a separate csv file
# This notebook was made for that purpose. We use multithreading to speed up the calculations

In [1]:
def Get_LR_coef(X, y):
    """Fit a ``LinearRegression`` of ``y`` on ``X``.

    Despite the name, the fitted estimator itself is returned, not just the
    coefficients; callers read ``coef_`` from the result.
    """
    model = LinearRegression()
    # fit() returns the estimator it was called on
    return model.fit(X, y)


In [2]:
def LR(app_idn):
    """Build monthly statistics and linear trends for a single app.

    Reads the app's rows from the module-level ``df_score`` frame, then
    appends a per-month DataFrame to ``monthly_score`` and a dict of
    regression slopes to ``score_trends``. Nothing is appended for apps with
    fewer than 10 reviews or without a single thumbs-up.

    Parameters
    ----------
    app_idn
        Value matched against ``df_score.id``.

    Returns
    -------
    None
        Results are delivered through the two module-level lists.
    """
    df = df_score[df_score.id == app_idn]
    # we do not consider apps with less than 10 reviews
    if df.id.count() < 10:
        return

    # with no thumbs-up at all the thumbs-up weighted rating is undefined
    if df.thumbsUpCount.sum() == 0:
        return

    per = df['at'].dt.to_period("M")

    monthly = df.groupby(per).agg(reviews=('id', 'count'),
                                  replies=('repliedAt', 'count'),
                                  rating=('score', 'mean'),
                                  thumbsUpScore=('thumbsUpScore', 'sum'),
                                  thumbsUpCount=('thumbsUpCount', 'sum'))

    # Months without thumbs-up divide by zero: treat the result as missing and
    # fill it with the app's mean over the remaining months.
    # The result is assigned explicitly: calling replace(..., inplace=True) on
    # a column selection is chained assignment, which warns in pandas 2.x and
    # leaves the frame untouched under Copy-on-Write.
    thumbs_up_rating = (monthly['thumbsUpScore'] / monthly['thumbsUpCount']).replace(
        [np.inf, -np.inf], np.nan)
    thumbs_up_rating = thumbs_up_rating.fillna(thumbs_up_rating.mean())

    # assign() returns a new frame, so no write ever lands on a slice of `monthly`
    app_score = (monthly[['reviews', 'replies', 'rating']]
                 .assign(thumbsUpRating=thumbs_up_rating, id=app_idn)
                 .reset_index())

    monthly_score.append(app_score)

    # Month start expressed as a proleptic Gregorian ordinal, so slopes are per day
    X = app_score['at'].dt.to_timestamp().map(datetime.toordinal).to_frame(name='at')

    score = {}
    score['id'] = app_idn
    for nm in ['thumbsUpRating', 'reviews', 'rating']:
        y = app_score[nm]
        lr = Get_LR_coef(X, y)
        # single feature, so the first coefficient is the slope
        score[nm] = lr.coef_[0]

    score_trends.append(score)

    return


In [3]:
import queue
import threading
from threading import Lock

# Serialises console output so messages from different threads do not interleave
print_lock = Lock()
def safe_print(*args, **kwargs):
    """Forward all arguments to ``print`` while holding ``print_lock``."""
    print_lock.acquire()
    try:
        print(*args, **kwargs)
    finally:
        print_lock.release()

import os
from datetime import datetime

import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression

class Worker(threading.Thread):
    """Thread that keeps pulling app ids from a queue and runs ``LR`` on each.

    The loop never exits on its own; the thread is expected to be started as a
    daemon and abandoned once the queue has been joined.

    Parameters
    ----------
    q
        Queue of app ids to process; ``task_done()`` is called once per item.
    """
    def __init__(self,q):
        threading.Thread.__init__(self)
        self.q=q
    def run(self):
        safe_print('In Worker Class {}'.format(self.name))
        while True:
            app_idn=self.q.get()
            try:
                # report progress from the worker's own queue, not a global `q`
                safe_print('Thread: {}. App: {}. Apps left: {} '.format(self.name, app_idn, self.q.qsize()), end='\r')
                LR(app_idn)
            except Exception as exc:
                # A failing app must not kill the worker: report it and move on
                safe_print('\nThread: {}. App: {} failed: {!r}'.format(self.name, app_idn, exc))
            finally:
                # Always acknowledge the item, otherwise q.join() blocks forever
                self.q.task_done()
            
# Result containers that LR() appends to from the worker threads
score_trends = []
monthly_score = []

q = queue.Queue()
Data_Folder = 'Data'
# os.path.join instead of a hard-coded '\\' so the path also resolves outside Windows
df_score = pd.read_pickle(os.path.join(Data_Folder, '{:02d}_app_all_review_scores.pkl'.format(15)))
# Rating counted once per thumbs-up; summed per month inside LR()
df_score['thumbsUpScore'] = df_score['thumbsUpCount'] * df_score['score']

# Ten worker threads should be enough for parallelism
for i in range(10):
    worker=Worker(q)
    safe_print('Going to Thread {}'.format(i))
    # daemon threads: the workers loop forever and must not keep the process alive
    worker.daemon=True
    worker.start()

for app_idn in df_score.id.unique():
    q.put(app_idn)

# Blocks until every queued app id has been acknowledged with task_done()
q.join()
print('\n')
print('Done')

Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Going to Thread!
In Worker Class
Donead: Thread-10. App: 109922. Apps left: 0    


In [6]:
# Stack the per-app monthly frames into one table with a fresh 0..n-1 index
monthly_score_df = pd.concat(objs=monthly_score, ignore_index=True)

In [7]:
# Monthly per-app statistics.
# os.path.join instead of a hard-coded '\\' so the files land inside Data_Folder on any OS
monthly_score_df.to_csv(os.path.join(Data_Folder, '{:02d}_app_score_monthly.csv'.format(16)),
                        columns=['id', 'at', 'reviews', 'replies', 'rating', 'thumbsUpRating'],
                        index=False,
                        header=True,
                        mode='w')

# One row per app with the regression slopes collected by LR()
score_trends_df = pd.DataFrame(score_trends)
score_trends_df.to_csv(os.path.join(Data_Folder, '{:02d}_app_score_trends.csv'.format(17)),
                       index=False, header=True, mode='w')
print("Done")


Done
