# Build a single pickle file containing the review statistics
# of all apps found in the 'ReviewsPKL' folder

In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from datetime import datetime

In [2]:
# Only the 'appId' column is needed; the row index of this frame is later
# used as a compact numeric stand-in for the appId string.
app_data_csv = 'Dataset//app_data_processed.csv'
df_appId = pd.read_csv(app_data_csv, usecols=['appId'])

# Common path prefix of the consolidated output; the extension
# ('csv' / 'pkl') is appended where the file is written.
file_to_base = 'Dataset//app_all_review_scores.'

In [3]:
# For easier statistical plotting the per-app review data is consolidated and
# simplified to reduce memory use:
#   * date fields are shortened to YYYYMMDD strings,
#   * appId is replaced with the row index of that app in df_appId.
# The result is written to a separate CSV file app by app, because CSV allows
# append-mode writing (contrary to pickle), so an interrupted run can resume.

# Close progress bars left over from a previous (interrupted) run of this cell.
# Best-effort only: tqdm._instances is a private attribute, so failures are
# ignored -- but KeyboardInterrupt/SystemExit are no longer swallowed.
try:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
except Exception:
    pass

file_to = ''.join([file_to_base, 'csv'])

# Resume support: find out which apps are already in the output file.
# Only "file missing" / "file completely empty" mean "start from scratch";
# any other read error is surfaced instead of silently overwriting the output.
try:
    processed_ids = pd.read_csv(file_to, usecols=['id'])
except (FileNotFoundError, pd.errors.EmptyDataError):
    processed_ids = None

# NaN appIds cannot form a file name, so they are skipped.
# NOTE(review): assumes appId values are unique in df_appId -- confirm.
pending_apps = df_appId['appId'].dropna()

if processed_ids is None or processed_ids.empty:
    mode = 'w'
else:
    # Apps are exported in df_appId row order and 'id' is the row index, so
    # everything up to the largest exported index is already done. Comparing
    # indexes (rather than appId strings) does not depend on the sort order
    # of the source CSV.
    # NOTE(review): if the previous run died in the middle of a write, the
    # last app may be only partially present in the CSV -- verify if needed.
    last_processed_idx = processed_ids['id'].max()
    pending_apps = pending_apps[pending_apps.index > last_processed_idx]
    mode = 'a'

for app_idx, app_Id in tqdm(list(pending_apps.items())):

    file_from = ''.join(['ReviewsPKL//', app_Id, '.pkl'])

    # Not every app has a reviews file
    if not os.path.exists(file_from):
        continue

    scores = pd.read_pickle(file_from)

    scores['id'] = app_idx
    scores['at'] = pd.to_datetime(scores['at'], format="%Y-%m-%d").dt.strftime("%Y%m%d")

    # Vectorized conversion; reviews without a reply (null repliedAt) are
    # written as 0, exactly as before.
    replied_at = pd.to_datetime(scores['repliedAt'], format="%Y-%m-%d")
    scores['repliedAt'] = replied_at.dt.strftime("%Y%m%d").fillna("0")

    scores.to_csv(file_to,
                  columns=['id', 'at', 'repliedAt', 'score', 'thumbsUpCount'],
                  index=False,
                  header=(mode == 'w'),  # header only when the file is created
                  mode=mode)
    # After the first write everything else must be appended
    mode = 'a'


HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [4]:
# Reading the consolidated 'score' dataframe back from the CSV file
# (file_to is expected to still hold the CSV path set by the export cell)
df_score = pd.read_csv(file_to)

In [None]:
# Converting the compact YYYYMMDD fields back to datetime.
# 'at': a value that does not parse now raises instead of being silently
# ignored. errors='ignore' returned the whole column unchanged (still
# integers) on any failure and is deprecated in recent pandas releases.
df_score['at'] = pd.to_datetime(df_score['at'], format='%Y%m%d')
# 'repliedAt': reviews without a reply were exported as 0, which is not a
# valid date; errors='coerce' turns those placeholders into NaT.
df_score['repliedAt'] = pd.to_datetime(df_score['repliedAt'], format='%Y%m%d', errors='coerce')

In [7]:
# Persist the whole frame as a pickle so that the column data types
# survive the round trip (a CSV would lose them)
file_to = file_to_base + 'pkl'
df_score.to_pickle(file_to)

In [8]:
# Reload the frame from the pickle written above (file_to now holds the .pkl path)
df_score = pd.read_pickle(file_to)

In [9]:
# Print row count, column dtypes and memory usage of the loaded frame
df_score.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449840499 entries, 0 to 449840498
Data columns (total 5 columns):
id               int32
at               datetime64[ns]
repliedAt        datetime64[ns]
score            int8
thumbsUpCount    int8
dtypes: datetime64[ns](2), int32(1), int8(2)
memory usage: 9.2 GB
