# Profitable App Profiles for the App Store and Google Play Markets

The aim of this project is to find the type of apps that are most likely to attract (new) users and are therefore attractive for developers that rely on mobile ad revenue.

### Opening and exploring the data

In [1]:
from csv import reader

open_file = open('AppleStore.csv')
read_file = reader(open_file)
AppleStore = list(read_file)

open_file = open('googleplaystore.csv')
read_file = reader(open_file)
GooglePlayStore = list(read_file)

In [2]:
def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
explore_data(AppleStore,0,2,True)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


Number of rows: 7198
Number of columns: 16


In [4]:
explore_data(GooglePlayStore,0,2,True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


Number of rows: 10842
Number of columns: 13


### Data cleaning

In [5]:
print(GooglePlayStore[10473])

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [6]:
del GooglePlayStore[10473]

Upon investigation of the Google Play Store dataset, a number of duplicate apps were discovered. Instagram, for example, appears for times:

In [7]:
for app in GooglePlayStore[1:]:
    name = app[0]
    if name == 'Instagram':
        print(app)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


Looking further into this issue, it appears that the data was collected at different points in time as indicated by the increasing number of reviews. A higher number of reviews would suggest that the data is more up-to-date. Hence, we will use the most recent entry for our analysis whilst removing the old data.

In [8]:
GooglePlayStore_duplicates = []
GooglePlayStore_unique = []

for app in GooglePlayStore[1:]:
    name = app[0]
    if name in GooglePlayStore_unique:
        GooglePlayStore_unique.append(name)
    else:
        GooglePlayStore_duplicates.append(name)

In [9]:
reviews_max = {}

for app in GooglePlayStore[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    elif name not in reviews_max:
        reviews_max[name] = n_reviews

In [10]:
android_clean = []
already_added = []

for app in GooglePlayStore[1:]:
    name = app[0]
    n_reviews = float(app[3])
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.append(name)

In [11]:
explore_data(android_clean,0,2,True)

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


Number of rows: 9659
Number of columns: 13


In [12]:
def english_language(string):
    non_english = 0
    
    for character in string:
        if ord(character) > 127:
            non_english += 1
    
    if non_english > 3:
        return False
    else:
        return True

In [13]:
android_english = []
ios_english = []

for app in android_clean[1:]:
    name = app[0]
    if english_language(name):
        android_english.append(app)
        
for app in AppleStore[1:]:
    name = app[1]
    if english_language(name):
        ios_english.append(app)

In [14]:
explore_data(android_english,0,2,True)

['U Launcher Lite – FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 9613
Number of columns: 13


In [15]:
explore_data(ios_english,0,2,True)

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


Number of rows: 6183
Number of columns: 16


In [16]:
android_final = []
ios_final = []

for app in android_english[1:]:
    price = app[7]
    if price == '0':
        android_final.append(app)
        
for app in ios_english[1:]:
    price = app[4]
    if price == '0.0':
        ios_final.append(app)

print(len(android_final))
print(len(ios_final))

8862
3221


### Data Analysis

Our aim is it to find app profiles that are successfull in both environments, i.e. Google Play Store and the Apple App Store.

In [17]:
def freq_table(dataset, index):
    table = {}
    
    for row in dataset:
        value = row[index]
        if value in table:
            table[value] += 1
        else:
            table[value] = 1
    
    table_pct = {}
                  
    for key in table:
        pct = (table[key] / (len(dataset) - 1)) * 100
        table_pct[key] = pct

    return table_pct

In [18]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [19]:
display_table(android_final, 1) #category

FAMILY : 18.91434375352669
GAME : 9.728021667983297
TOOLS : 8.46405597562352
BUSINESS : 4.593161042771697
LIFESTYLE : 3.9047511567543167
PRODUCTIVITY : 3.8934657487868187
FINANCE : 3.7016138133393524
MEDICAL : 3.532332693826882
SPORTS : 3.3969077982169056
PERSONALIZATION : 3.317909942444419
COMMUNICATION : 3.2389120866719328
HEALTH_AND_FITNESS : 3.080916375126961
PHOTOGRAPHY : 2.9454914795169844
NEWS_AND_MAGAZINES : 2.7987811759395105
SOCIAL : 2.663356280329534
TRAVEL_AND_LOCAL : 2.3360794492720913
SHOPPING : 2.245796185532107
BOOKS_AND_REFERENCE : 2.144227513824625
DATING : 1.8620923146371742
VIDEO_PLAYERS : 1.794379866832186
MAPS_AND_NAVIGATION : 1.3993905879697552
FOOD_AND_DRINK : 1.2413948764247829
EDUCATION : 1.1623970206522967
ENTERTAINMENT : 0.9592596772373322
LIBRARIES_AND_DEMO : 0.9366888613023362
AUTO_AND_VEHICLES : 0.9254034533348381
HOUSE_AND_HOME : 0.8238347816273558
WEATHER : 0.8012639656923597
EVENTS : 0.7109807019523756
PARENTING : 0.6545536621148855
COMICS : 0.62069743

In [20]:
display_table(android_final, -4) #genres

Tools : 8.452770567656021
Entertainment : 6.071549486513937
Education : 5.349283376594064
Business : 4.593161042771697
Productivity : 3.8934657487868187
Lifestyle : 3.8934657487868187
Finance : 3.7016138133393524
Medical : 3.532332693826882
Sports : 3.464620246021894
Personalization : 3.317909942444419
Communication : 3.2389120866719328
Action : 3.103487191061957
Health & Fitness : 3.080916375126961
Photography : 2.9454914795169844
News & Magazines : 2.7987811759395105
Social : 2.663356280329534
Travel & Local : 2.3247940413045933
Shopping : 2.245796185532107
Books & Reference : 2.144227513824625
Simulation : 2.0426588421171425
Dating : 1.8620923146371742
Arcade : 1.8508069066696762
Video Players & Editors : 1.7718090508971898
Casual : 1.760523642929692
Maps & Navigation : 1.3993905879697552
Food & Drink : 1.2413948764247829
Puzzle : 1.1285407967498025
Racing : 0.9931159011398263
Role Playing : 0.9366888613023362
Libraries & Demo : 0.9366888613023362
Auto & Vehicles : 0.925403453334838

In [21]:
display_table(ios_final, -5) #prime_genre

Games : 58.19875776397515
Entertainment : 7.888198757763975
Photo & Video : 4.968944099378882
Education : 3.6645962732919255
Social Networking : 3.260869565217391
Shopping : 2.608695652173913
Utilities : 2.515527950310559
Sports : 2.142857142857143
Music : 2.049689440993789
Health & Fitness : 2.018633540372671
Productivity : 1.7391304347826086
Lifestyle : 1.5838509316770186
News : 1.3354037267080745
Travel : 1.2422360248447204
Finance : 1.1180124223602486
Weather : 0.8695652173913043
Food & Drink : 0.8074534161490683
Reference : 0.5590062111801243
Business : 0.5279503105590062
Book : 0.43478260869565216
Navigation : 0.18633540372670807
Medical : 0.18633540372670807
Catalogs : 0.12422360248447205


In [22]:
ios_genres = freq_table(ios_final, -5) #prime_genre

for genre in ios_genres:
    total = 0
    len_genre = 0
    
    for app in ios_final:
        genre_app = app[-5]
        if genre_app == genre:
            user_ratings = float(app[5])
            total += user_ratings
            len_genre += 1
    avg_ratings = total / len_genre
    print(genre + ': ', avg_ratings)

Business:  7491.117647058823
Finance:  31467.944444444445
Book:  39758.5
Weather:  52279.892857142855
News:  21248.023255813954
Navigation:  86090.33333333333
Reference:  74942.11111111111
Social Networking:  43899.514285714286
Entertainment:  14029.830708661417
Education:  7003.983050847458
Shopping:  26919.690476190477
Health & Fitness:  23298.015384615384
Medical:  612.0
Sports:  23008.898550724636
Food & Drink:  33333.92307692308
Productivity:  21028.410714285714
Travel:  28243.8
Utilities:  18684.456790123455
Lifestyle:  16485.764705882353
Catalogs:  4004.0
Games:  22788.6696905016
Music:  57326.530303030304
Photo & Video:  28441.54375


In [23]:
android_categories = freq_table(android_final, 1) #category

for category in android_categories:
    total = 0
    len_category = 0
    
    for app in android_final:
        category_app = app[1]
        if category_app == category:
            num_install = app[5]
            num_install = num_install.replace('+','')
            num_install = num_install.replace(',','')
            total += float(num_install)
            len_category += 1
    avg_num_install = total / len_category
    print(category + ': ',avg_num_install)

HEALTH_AND_FITNESS:  4188821.9853479853
FAMILY:  3695641.8198090694
HOUSE_AND_HOME:  1331540.5616438356
PHOTOGRAPHY:  17840110.40229885
COMICS:  817657.2727272727
SPORTS:  3638640.1428571427
EDUCATION:  1833495.145631068
DATING:  854028.8303030303
EVENTS:  253542.22222222222
BEAUTY:  513151.88679245283
PARENTING:  542603.6206896552
BOOKS_AND_REFERENCE:  8767811.894736841
BUSINESS:  1712290.1474201474
NEWS_AND_MAGAZINES:  9549178.467741935
LIFESTYLE:  1437816.2687861272
TOOLS:  10801391.298666667
MAPS_AND_NAVIGATION:  4056941.7741935486
FINANCE:  1387692.475609756
AUTO_AND_VEHICLES:  647317.8170731707
ART_AND_DESIGN:  1967474.5454545454
FOOD_AND_DRINK:  1924897.7363636363
SOCIAL:  23253652.127118643
LIBRARIES_AND_DEMO:  638503.734939759
COMMUNICATION:  38456119.167247385
PERSONALIZATION:  5201482.6122448975
MEDICAL:  120550.61980830671
TRAVEL_AND_LOCAL:  13984077.710144928
GAME:  15588015.603248259
SHOPPING:  7036877.311557789
ENTERTAINMENT:  11640705.88235294
WEATHER:  5074486.19718309