# Build a single pickle file containing all review statistics
# from the 'ReviewsSentiment' folder

In [1]:
import pandas as pd
import os
from tqdm.auto import tqdm
from datetime import datetime

In [None]:
# Load only the 'appId' column of the processed app table; the row labels of
# this frame serve as compact numeric ids for the apps further down.
df_appId = pd.read_csv(
    'Dataset//app_data_processed.csv',
    usecols=['appId'],
)

In [2]:
# Common path stem of the consolidated output; the extension ('csv' or 'pkl')
# is appended where the file is written or read.
file_to_base = "Dataset//app_all_review_stats."

In [3]:
# For easier statistical plotting the per-app review data is consolidated into
# a single table and simplified to reduce memory use:
#   * date fields are shortened to compact YYYYMMDD values;
#   * appId is replaced with the app's row label (index) in df_appId.
# The loop runs through the appIds, loads each app's review stats and appends
# them to one CSV file. CSV is used at this stage because it allows append
# writing, contrary to pickle, which lets an interrupted run be resumed.

# Close any progress bars left over from an earlier run of this cell.
# `_instances` is a private tqdm attribute, so this stays best-effort; only
# ordinary exceptions are swallowed (a keyboard interrupt still propagates).
try:
    while len(tqdm._instances) > 0:
        tqdm._instances.pop().close()
except Exception:
    pass

file_to = ''.join([file_to_base, 'csv'])

# Resume support: the largest 'id' already present in the output file is the
# df_appId row label of the last app that was written. If the file is missing,
# empty, lacks the 'id' column or holds no usable id, start from scratch.
try:
    processed_ids = pd.read_csv(file_to, usecols=['id'])
    last_processed_idx = processed_ids['id'].max()
    last_processed_id = df_appId.loc[last_processed_idx]['appId']
    mode = 'a'
except (OSError, ValueError, KeyError):
    last_processed_idx = None
    last_processed_id = ''
    mode = 'w'

# Select the apps that still have to be processed. Resuming by row label
# (rather than comparing appId strings) does not depend on df_appId being
# sorted by appId.
# NOTE(review): assumes df_appId has a monotonically increasing index, as
# produced by a plain pd.read_csv - confirm if it is built differently.
if last_processed_idx is None:
    pending = df_appId['appId']
else:
    pending = df_appId.loc[df_appId.index > last_processed_idx, 'appId']
# Missing appIds cannot be turned into a file name; skip them.
pending = pending.dropna()

stat_columns = ['id', 'origId', 'at', 'repliedAt', 'score', 'thumbsUpCount',
                'polarity', 'subjectivity', 'reviewLength']

# NOTE: if a run is interrupted while an app is being written, that app's rows
# may be incomplete in the CSV and will not be rewritten on resume.
for app_idx, app_Id in tqdm(pending.items(), total=len(pending)):

    file_from = ''.join(['ReviewsSentiment//', app_Id, '.pkl'])

    # Apps without a review-sentiment file are silently skipped.
    if not os.path.exists(file_from):
        continue

    # NOTE: pickle files must come from a trusted source (unpickling can
    # execute arbitrary code).
    scores = pd.read_pickle(file_from)

    # The row label from df_appId replaces the (longer) appId string.
    scores['id'] = app_idx
    scores['origId'] = scores.index
    scores['at'] = pd.to_datetime(scores['at'], format="%Y-%m-%d").dt.strftime("%Y%m%d")
    # Vectorised conversion; reviews without a reply are written as 0.
    replied_at = pd.to_datetime(scores['repliedAt'], format="%Y-%m-%d")
    scores['repliedAt'] = replied_at.dt.strftime("%Y%m%d").fillna('0')
    scores['reviewLength'] = scores['content'].str.len()

    # The header is written only when the file is created; afterwards append.
    scores.to_csv(file_to, columns=stat_columns,
                  index=False,
                  header=(mode == 'w'),
                  mode=mode)
    mode = 'a'

HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…




In [4]:
# Load the consolidated review statistics from the CSV file into a dataframe
df_stats = pd.read_csv(filepath_or_buffer=file_to)

In [None]:
# Convert the compact YYYYMMDD date fields back to datetimes.
# 'at' is expected to be present for every review, so an unparsable value is
# reported as an error instead of silently leaving the column unconverted
# (errors='ignore' is deprecated in pandas).
df_stats['at'] = pd.to_datetime(df_stats['at'], format='%Y%m%d', errors='raise')
# 'repliedAt' holds 0 for reviews without a reply; coerce those to NaT.
df_stats['repliedAt'] = pd.to_datetime(df_stats['repliedAt'], format='%Y%m%d', errors='coerce')

In [7]:
# Persist the whole dataframe as a pickle so that the column data types
# survive the round trip
df_stats.to_pickle(file_to_base + 'pkl')

In [3]:
# Load the pickled stats dataframe again in order to inspect its structure
df_stats = pd.read_pickle('{}pkl'.format(file_to_base))

In [4]:
# Print the index range, column dtypes and memory usage of the stats dataframe
df_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 448167785 entries, 0 to 448167784
Data columns (total 9 columns):
 #   Column         Dtype         
---  ------         -----         
 0   id             int64         
 1   origId         int64         
 2   at             datetime64[ns]
 3   repliedAt      datetime64[ns]
 4   score          int64         
 5   thumbsUpCount  int64         
 6   polarity       float64       
 7   subjectivity   float64       
 8   reviewLength   int64         
dtypes: datetime64[ns](2), float64(2), int64(5)
memory usage: 30.1 GB
