In [None]:
import json
import os
import io

In [5]:
JSON_File = 'Scrapped//GPA_Final_App_Data.json'

def Load_Json(file_name):
    data_dict = []
    if os.path.exists(file_name):
        with io.open(file_name, encoding='utf8') as data_file:
            data_dict = json.load(data_file)
    return data_dict

In [6]:
# Load JSON file into dictionnary
app_data = Load_Json(JSON_File)

In [7]:
# Number of scrapped apps 
len(app_data)

123460

In [8]:
# Example of the data to look at at it
first_app = app_data[list(app_data.keys())[0]]
first_app

{'adSupported': True,
 'androidVersion': '4.0.3',
 'androidVersionText': '4.0.3 and up',
 'appId': 'AutomateIt.mainPackage',
 'comments': ["App has always been a little glitchy, but its usefulness outbalanced that until recently. I've never gotten a response from support from the handful of times I've emailed them about issues over the past few years, but I've generally been able to figure out solutions on my own. Now the app isn't working at all, and since I can't fix it this time, lack of support means the app is now useless.",
  "Very limited app but with potential. Simple triggers only. No between one time and another time trigger, one of the most useful triggers but not in this app. Time triggers only operate at the exact time they are set for which makes them pretty limited in their usefulness. No ability to prioritize phone numbers that will ring when silent mode is selected, eg when a family member rings. MacroDroid is considerably more advanced. I'm a normal user with no assoc

In [9]:
# Check available features of the apps and store their type
features = {}

for _, app in app_data.items():
    for f in app.items():
        # Do not add empty features as their data type is not defined 
        if f[1] != None:
            features[f[0]] = str(type(f[1]))

In [10]:
# Full list of features
for f,t in sorted(features.items()):
    print(f,t)

adSupported <class 'bool'>
androidVersion <class 'str'>
androidVersionText <class 'str'>
appId <class 'str'>
comments <class 'list'>
containsAds <class 'bool'>
contentRating <class 'str'>
contentRatingDescription <class 'str'>
currency <class 'str'>
description <class 'str'>
descriptionHTML <class 'str'>
developer <class 'str'>
developerAddress <class 'str'>
developerEmail <class 'str'>
developerId <class 'str'>
developerInternalID <class 'str'>
developerWebsite <class 'str'>
free <class 'bool'>
genre <class 'str'>
genreId <class 'str'>
headerImage <class 'str'>
histogram <class 'list'>
icon <class 'str'>
installs <class 'str'>
minInstalls <class 'int'>
offersIAP <class 'bool'>
originalPrice <class 'float'>
price <class 'int'>
privacyPolicy <class 'str'>
ratings <class 'int'>
recentChanges <class 'str'>
recentChangesHTML <class 'str'>
released <class 'str'>
reviews <class 'int'>
sale <class 'bool'>
saleText <class 'str'>
saleTime <class 'int'>
score <class 'float'>
screenshots <class '

In [11]:
# Features with extra hierarchy
for f,t in features.items():
    if t == "<class 'list'>":
        print(f,t)

comments <class 'list'>
histogram <class 'list'>
screenshots <class 'list'>


In [12]:
# 'comments' and 'screenshots' are some separate field of exploration, but 'histogram' could possibly be used in dataset 
# So: 
# 'histogram' field will be flattened
# 'comments' and 'screenshots' will go to their separate datasets
# Let's see historgam example:
hier_fields = {'comments','screenshots','histogram'}

first_app['histogram']


[2801, 1254, 1826, 2319, 6935]

In [13]:
# We can represent it as 5 different fields
rating_example = {'rating' + str(num_stars+1):star_rating for num_stars, star_rating in enumerate(first_app['histogram'])}
rating_example

{'rating1': 2801,
 'rating2': 1254,
 'rating3': 1826,
 'rating4': 2319,
 'rating5': 6935}

In [14]:
# All features except of list type with quotes around their names
for f,t in features.items():
    if t != "<class 'list'>":
        print("'"+f+"',")

'adSupported',
'androidVersion',
'androidVersionText',
'appId',
'containsAds',
'contentRating',
'currency',
'description',
'descriptionHTML',
'developer',
'developerAddress',
'developerEmail',
'developerId',
'developerInternalID',
'developerWebsite',
'free',
'genre',
'genreId',
'headerImage',
'icon',
'installs',
'minInstalls',
'offersIAP',
'price',
'privacyPolicy',
'ratings',
'released',
'reviews',
'sale',
'score',
'size',
'summary',
'summaryHTML',
'title',
'updated',
'url',
'version',
'video',
'videoImage',
'recentChanges',
'recentChangesHTML',
'contentRatingDescription',
'originalPrice',
'saleText',
'saleTime',


In [15]:
# Example of the data to look at at it
first_app_reduced = {key:first_app[key] for key in set(first_app.keys()) - hier_fields if key in first_app}
first_app_reduced


{'version': '3.0.241',
 'ratings': 15138,
 'summaryHTML': 'Automatically launch tasks by location, SMS, battery level, wifi, calls and more',
 'icon': 'https://lh3.googleusercontent.com/AX5i8AxpkNAcaRCTAeBhogtRfy5BsgazxdRkJcsNAded9ffGoxUXQcKso13qS_Nx14g',
 'adSupported': True,
 'saleText': None,
 'androidVersionText': '4.0.3 and up',
 'sale': False,
 'developerWebsite': 'http://automateitapp.com',
 'score': 3.6165802,
 'installs': '1,000,000+',
 'containsAds': True,
 'title': 'AutomateIt - Automate tasks, save battery and more',
 'size': '23M',
 'contentRatingDescription': None,
 'recentChangesHTML': None,
 'genreId': 'PRODUCTIVITY',
 'minInstalls': 1000000,
 'recentChanges': None,
 'saleTime': None,
 'genre': 'Productivity',
 'androidVersion': '4.0.3',
 'appId': 'AutomateIt.mainPackage',
 'currency': 'USD',
 'free': True,
 'summary': 'Automatically launch tasks by location, SMS, battery level, wifi, calls and more',
 'url': 'https://play.google.com/store/apps/details?id=AutomateIt.mai

In [16]:
# Add flatten rating histogram to the dictionnary
first_app_reduced.update(rating_example)

In [17]:
first_app_reduced.keys()

dict_keys(['version', 'ratings', 'summaryHTML', 'icon', 'adSupported', 'saleText', 'androidVersionText', 'sale', 'developerWebsite', 'score', 'installs', 'containsAds', 'title', 'size', 'contentRatingDescription', 'recentChangesHTML', 'genreId', 'minInstalls', 'recentChanges', 'saleTime', 'genre', 'androidVersion', 'appId', 'currency', 'free', 'descriptionHTML', 'description', 'summary', 'url', 'contentRating', 'developerInternalID', 'video', 'headerImage', 'videoImage', 'developerId', 'reviews', 'offersIAP', 'privacyPolicy', 'developer', 'developerEmail', 'developerAddress', 'originalPrice', 'price', 'updated', 'released', 'rating1', 'rating2', 'rating3', 'rating4', 'rating5'])

In [18]:
# Let's filter out fields that are not important for future analysis
# For that let's visualise what all fields represent
for f in sorted(set(features) - hier_fields):
    for _, app in app_data.items():
        if app[f] != None:
            print({f:app[f]})
            break

{'adSupported': True}
{'androidVersion': '4.0.3'}
{'androidVersionText': '4.0.3 and up'}
{'appId': 'AutomateIt.mainPackage'}
{'containsAds': True}
{'contentRating': 'Everyone'}
{'contentRatingDescription': 'Fantasy Violence'}
{'currency': 'USD'}
{'developer': 'SmarterApps Ltd'}
{'developerAddress': 'Petah-Tikva, Israel'}
{'developerEmail': 'support@automateitapp.com'}
{'developerId': '8202756898217994038'}
{'developerInternalID': '8202756898217994038'}
{'developerWebsite': 'http://automateitapp.com'}
{'free': True}
{'genre': 'Productivity'}
{'genreId': 'PRODUCTIVITY'}
{'headerImage': 'https://lh3.googleusercontent.com/fmgYVgTZTl5MiwsHdl9fE7qdsW1bi5I3sBJk11bllLL5XQnC3Anxo40IBq0KdEQxlvI'}
{'icon': 'https://lh3.googleusercontent.com/AX5i8AxpkNAcaRCTAeBhogtRfy5BsgazxdRkJcsNAded9ffGoxUXQcKso13qS_Nx14g'}
{'installs': '1,000,000+'}
{'minInstalls': 1000000}
{'offersIAP': True}
{'originalPrice': 3.99}
{'price': 0}
{'privacyPolicy': 'https://automateitapp.com/privacy-policy/'}
{'ratings': 15138}

In [None]:
# Let's see some detailed info for some fields

In [19]:
# 'genre' and 'genreId' difference
comp = {}
for _, app in app_data.items():
    comp[app['genre']] = app['genreId']
comp

{'Productivity': 'PRODUCTIVITY',
 'Music & Audio': 'MUSIC_AND_AUDIO',
 'Lifestyle': 'LIFESTYLE',
 'Tools': 'TOOLS',
 'Role Playing': 'GAME_ROLE_PLAYING',
 'Puzzle': 'GAME_PUZZLE',
 'Books & Reference': 'BOOKS_AND_REFERENCE',
 'Education': 'EDUCATION',
 'Travel & Local': 'TRAVEL_AND_LOCAL',
 'Communication': 'COMMUNICATION',
 'Personalization': 'PERSONALIZATION',
 'Medical': 'MEDICAL',
 'Libraries & Demo': 'LIBRARIES_AND_DEMO',
 'Finance': 'FINANCE',
 'Casual': 'GAME_CASUAL',
 'Video Players & Editors': 'VIDEO_PLAYERS',
 'Trivia': 'GAME_TRIVIA',
 'Social': 'SOCIAL',
 'Word': 'GAME_WORD',
 'Health & Fitness': 'HEALTH_AND_FITNESS',
 'Maps & Navigation': 'MAPS_AND_NAVIGATION',
 'Business': 'BUSINESS',
 'Shopping': 'SHOPPING',
 'Photography': 'PHOTOGRAPHY',
 'Simulation': 'GAME_SIMULATION',
 'Entertainment': 'ENTERTAINMENT',
 'Sports': 'SPORTS',
 'News & Magazines': 'NEWS_AND_MAGAZINES',
 'House & Home': 'HOUSE_AND_HOME',
 'Beauty': 'BEAUTY',
 'Strategy': 'GAME_STRATEGY',
 'Weather': 'WEATH

In [20]:
# 'installs' and 'minInstalls' difference
comp = {}
for _, app in app_data.items():
    comp[app['installs']] = app['minInstalls']
comp

{'1,000,000+': 1000000,
 '50,000+': 50000,
 '100,000+': 100000,
 '50,000,000+': 50000000,
 '500,000+': 500000,
 '10,000,000+': 10000000,
 '10,000+': 10000,
 '500+': 500,
 '100+': 100,
 '1,000+': 1000,
 '5,000+': 5000,
 '10+': 10,
 '5+': 5,
 '5,000,000+': 5000000,
 '50+': 50,
 '100,000,000+': 100000000,
 None: None,
 '1+': 1,
 '0+': 0,
 '500,000,000+': 500000000,
 '5,000,000,000+': 5000000000,
 '1,000,000,000+': 1000000000}

In [21]:
# 'installs' and 'minInstalls' difference
comp = {}
for _, app in app_data.items():
    comp[app['currency']] = app['currency']
comp

{'USD': 'USD',
 'JPY': 'JPY',
 None: None,
 'EUR': 'EUR',
 'GBP': 'GBP',
 'RON': 'RON',
 'VND': 'VND'}

In [22]:
# We can filter out the following fields:
# 'androidVersionText', - duplicates 'androidversion'
# 'developerEmail' - not needed
# 'developerInternalID' - duplicates 'developerID'
# 'descriptionHTML' - duplicates 'description'
# 'recentChangesHTML' - duplicates 'recentChanges'
# 'summaryHTML' - duplicates 'summary'
# 'installs' - duplicates 'minInstalls'
# 'genre' - duplicates 'genreId'

tech_fields = {'androidVersionText','developerEmail','developerInternalID','descriptionHTML','recentChangesHTML','summaryHTML','installs','genre'}

In [23]:
# Let's prepare new dataset, without extra hierarchy and without technical fields
new_dataset = []

for _, app in app_data.items():
    app_features_reduced = {key:app[key] for key in set(app.keys()) - hier_fields - tech_fields if key in app}
    if app['histogram'] != None:
        rating_feature = {'rating' + str(num_stars+1):star_rating for num_stars, star_rating in enumerate(app['histogram'])}
        app_features_reduced.update(rating_feature)
    new_dataset.append(app_features_reduced)

In [24]:
len(new_dataset)

123460

In [25]:
# Example what we've got as a new, flatten and reduced dataset.
new_dataset[0]

{'version': '3.0.241',
 'ratings': 15138,
 'icon': 'https://lh3.googleusercontent.com/AX5i8AxpkNAcaRCTAeBhogtRfy5BsgazxdRkJcsNAded9ffGoxUXQcKso13qS_Nx14g',
 'adSupported': True,
 'saleText': None,
 'sale': False,
 'developerWebsite': 'http://automateitapp.com',
 'score': 3.6165802,
 'containsAds': True,
 'title': 'AutomateIt - Automate tasks, save battery and more',
 'size': '23M',
 'contentRatingDescription': None,
 'genreId': 'PRODUCTIVITY',
 'minInstalls': 1000000,
 'recentChanges': None,
 'saleTime': None,
 'androidVersion': '4.0.3',
 'appId': 'AutomateIt.mainPackage',
 'currency': 'USD',
 'free': True,
 'summary': 'Automatically launch tasks by location, SMS, battery level, wifi, calls and more',
 'url': 'https://play.google.com/store/apps/details?id=AutomateIt.mainPackage&hl=en&gl=us',
 'contentRating': 'Everyone',
 'video': 'https://www.youtube.com/embed/hfnguZ2XMMM?ps=play&vq=large&rel=0&autohide=1&showinfo=0',
 'headerImage': 'https://lh3.googleusercontent.com/fmgYVgTZTl5MiwsH

In [26]:
# Check available features of the apps and store their type
features = {}

for app in new_dataset:
    for f in app.items():
        # Do not add empty features as their data type is not defined 
        if f[1] != None:
            features[f[0]] = str(type(f[1]))

In [27]:
# Next step we will split the dataset and save it to three different flat csv files
# app_base_data - dataset with most of the fields that we've defined above
# app_screenshots - dataset with user screenshot links 
# app_comments - dataset with user comments
import pandas as pd

In [28]:
# Convert data dictionnary to pandas dataframe and save it to csv file for further access
app_base_data = pd.DataFrame(new_dataset)

In [29]:
app_base_data.head()

Unnamed: 0,version,ratings,icon,adSupported,saleText,sale,developerWebsite,score,containsAds,title,...,developerAddress,originalPrice,price,updated,released,rating1,rating2,rating3,rating4,rating5
0,3.0.241,15138.0,https://lh3.googleusercontent.com/AX5i8AxpkNAc...,True,,False,http://automateitapp.com,3.61658,True,"AutomateIt - Automate tasks, save battery and ...",...,"Petah-Tikva, Israel",,0.0,1558506000.0,"Feb 4, 2011",2801.0,1254.0,1826.0,2319.0,6935.0
1,4.0.241,6197.0,https://lh3.googleusercontent.com/2QBsyy6BPV3Q...,,,False,http://automateitapp.com,3.238007,,AutomateIt Pro - Automate tasks on your Android,...,"Petah-Tikva, Israel",,2.99,1558506000.0,"Jun 12, 2011",1577.0,640.0,880.0,926.0,2172.0
2,5.9,1425.0,https://lh3.googleusercontent.com/-l7k-c6zHbC0...,True,,False,http://www.eurozet.pl,3.093458,True,Chillizet,...,Lagardere Active Radio International\nEUROZET\...,,0.0,1577043000.0,"Mar 27, 2012",359.0,279.0,119.0,199.0,466.0
3,5.0.14,62254.0,https://lh3.googleusercontent.com/xEZTcdyGLms1...,True,,False,http://www.tworld.co.kr/,4.032904,True,T world,...,서울특별시 중구 을지로65(을지로2가) SK T-타워,,0.0,1590483000.0,"Aug 10, 2010",8193.0,1963.0,5684.0,10173.0,36239.0
4,4.5,350.0,https://lh3.googleusercontent.com/gmR0kF1AT7Wd...,,,False,https://www.drivehq.com,4.28,,Cloud File Manager,...,"2551 San Ramon Valley Blvd, Suite 213,\nSan Ra...",,0.0,1590307000.0,"Sep 6, 2011",35.0,10.0,24.0,31.0,248.0


In [92]:
# Write to file
app_base_data.to_csv('Dataset//app_base_data.csv', index = False)

In [93]:
# Let's check that csv is OK
test = pd.read_csv('Dataset//app_base_data.csv')
test.head()

Unnamed: 0,adSupported,androidVersion,appId,containsAds,contentRating,contentRatingDescription,currency,description,developer,developerAddress,...,saleTime,score,size,summary,title,updated,url,version,video,videoImage
0,True,4.0.3,AutomateIt.mainPackage,True,Everyone,,USD,AutomateIt is designed to make your life easie...,SmarterApps Ltd,"Petah-Tikva, Israel",...,,3.61658,23M,"Automatically launch tasks by location, SMS, b...","AutomateIt - Automate tasks, save battery and ...",1558506000.0,https://play.google.com/store/apps/details?id=...,3.0.241,https://www.youtube.com/embed/hfnguZ2XMMM?ps=p...,https://i.ytimg.com/vi/hfnguZ2XMMM/hqdefault.jpg
1,,4.0.3,AutomateItPro.mainPackage,,Everyone,,USD,AutomateIt is designed to make your life easie...,SmarterApps Ltd,"Petah-Tikva, Israel",...,,3.238007,23M,Turn your smartphone into a genius-phone!,AutomateIt Pro - Automate tasks on your Android,1558506000.0,https://play.google.com/store/apps/details?id=...,4.0.241,https://www.youtube.com/embed/hfnguZ2XMMM?ps=p...,https://i.ytimg.com/vi/hfnguZ2XMMM/hqdefault.jpg
2,True,4.1,ChilliZET.app,True,Everyone,,USD,Słuchaj Radia Chillizet i kanałów muzycznych w...,Eurozet Sp. z o.o.,Lagardere Active Radio International\nEUROZET\...,...,,3.093458,6.9M,Radio Chillizet - setting the CHILL OUT,Chillizet,1577043000.0,https://play.google.com/store/apps/details?id=...,5.9,,
3,True,4.4,Com.sktelecom.minit,True,Everyone,,USD,SK텔레콤 고객이라면 새로워진 T world 서비스를 기존과 동일하게 3G/LTE ...,SKTelecom,서울특별시 중구 을지로65(을지로2가) SK T-타워,...,,4.032904,31M,When you log in to the newly refurbished T wor...,T world,1590483000.0,https://play.google.com/store/apps/details?id=...,5.0.14,,
4,,4.1,DHQ.FileManagerForAndroid,,Everyone,,USD,DriveHQ Cloud File Manager can manage both loc...,DriveHQ.com & CameraFTP.com,"2551 San Ramon Valley Blvd, Suite 213,\nSan Ra...",...,,4.28,17M,"Cloud Base, File Sharing, FTP Server, Online S...",Cloud File Manager,1590307000.0,https://play.google.com/store/apps/details?id=...,4.5,https://www.youtube.com/embed/icneXELTCOo?ps=p...,https://i.ytimg.com/vi/icneXELTCOo/hqdefault.jpg


In [1]:
test.head()

NameError: name 'test' is not defined

In [36]:
# Now screenshots dataset
screens = []
for _, app in app_data.items():
    if app['screenshots'] != None:
        for s in app['screenshots']:
            screens.append((app['appId'],s))

In [37]:
screens[0]

('AutomateIt.mainPackage',
 'https://lh3.googleusercontent.com/K9y27QwA87js_rxvj9eTMR1psN8CRceuuFKpBJTb03f6PP6CVSVgN3YL5Flq_wH-mfU')

In [38]:
len(screens)

1236966

In [39]:
# Convert to pandas dataframe
app_screens = pd.DataFrame(screens)

In [40]:
app_screens.head()

Unnamed: 0,0,1
0,AutomateIt.mainPackage,https://lh3.googleusercontent.com/K9y27QwA87js...
1,AutomateIt.mainPackage,https://lh3.googleusercontent.com/RDo97gBifZ6x...
2,AutomateIt.mainPackage,https://lh3.googleusercontent.com/CiMiMmtLYVPu...
3,AutomateIt.mainPackage,https://lh3.googleusercontent.com/GtS_ySii8s34...
4,AutomateIt.mainPackage,https://lh3.googleusercontent.com/TG7shPkSRkm5...


In [41]:
# Write to file
app_screens.to_csv('Dataset//app_screens.csv', index = False)

In [44]:
# Finally, comments dataset
comments = []
for _, app in app_data.items():
    if app['comments'] != None:
        for c in app['comments']:
            comments.append((app['appId'],c))

In [46]:
comments[0]

('AutomateIt.mainPackage',
 "App has always been a little glitchy, but its usefulness outbalanced that until recently. I've never gotten a response from support from the handful of times I've emailed them about issues over the past few years, but I've generally been able to figure out solutions on my own. Now the app isn't working at all, and since I can't fix it this time, lack of support means the app is now useless.")

In [47]:
len(comments)

3060403

In [48]:
# Convert to pandas dataframe
app_comments = pd.DataFrame(comments)

In [49]:
app_comments.head()

Unnamed: 0,0,1
0,AutomateIt.mainPackage,"App has always been a little glitchy, but its ..."
1,AutomateIt.mainPackage,Very limited app but with potential. Simple tr...
2,AutomateIt.mainPackage,All i want is to mute sound when i am not on m...
3,AutomateIt.mainPackage,"Great potential & I want to love this app, but..."
4,AutomateIt.mainPackage,"This app looks so much like ""Automated Device""..."


In [50]:
# Write to file
app_comments.to_csv('Dataset//app_comments.csv', index = False)