# Project analysing what makes apps more likely to attract users

This project analyses App Store and Google Play market data in order to isolate the factors that make apps attractive to users.



In [1]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print the rows ``dataset[start:end]`` and, optionally, the dataset's size.

    Args:
        dataset: A list of rows, where each row is itself a list.
        start: Index of the first row to print.
        end: Index one past the last row to print.
        rows_and_columns: When True, also print the total number of rows and
            the number of columns (taken from the first row).

    An empty dataset is reported as having zero rows and zero columns instead
    of raising ``IndexError`` on the first-row lookup.
    """
    for row in dataset[start:end]:
        print(row)
        print('\n')  # print() appends its own newline, so this leaves two blank lines
    if rows_and_columns:
        print('Number of Rows:', len(dataset))
        # Guard the first-row lookup so an empty dataset does not raise.
        print('Number of Columns', len(dataset[0]) if dataset else 0)

In [2]:
from csv import reader

# Read both CSV files fully into lists of rows (row 0 is the header).
# The `with` blocks close the file handles once the rows are in memory.
# encoding='utf8' keeps the result independent of the platform default
# (assumes the files are UTF-8 encoded; verify against the data source), and
# newline='' is what the csv module documents for files passed to reader().
with open('AppleStore.csv', encoding='utf8', newline='') as apple_opened_file:
    apple_reader = reader(apple_opened_file)
    apple_dataset = list(apple_reader)

with open('googleplaystore.csv', encoding='utf8', newline='') as google_opened_file:
    google_reader = reader(google_opened_file)
    google_dataset = list(google_reader)

In [3]:
# Preview the first ten App Store rows (header included) and report the dataset size.
explore_data(apple_dataset, start=0, end=10, rows_and_columns=True)


['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


['284035177', 'Pandora - Music & Radio', '130242560', 'USD', '0.0', '1126879', '3594', '4.0', '4.5', '8.4.1', '12+', 'Music', '37', '4', '1', '1']


['429047995', 'Pinterest', '74778624', 'USD', '0.0', '1061

In [4]:
# Preview the first three Google Play rows (header included) and report the dataset size.
explore_data(google_dataset, start=0, end=3, rows_and_columns=True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


Number of Rows: 10842
Number of Columns 13


In [5]:
# Print only the Google Play header row, for reference when indexing columns.
explore_data(google_dataset, start=0, end=1)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']




In [6]:
# Inspect the single row at index 10473, which has fewer fields than the header.
explore_data(google_dataset, start=10473, end=10474)

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']




In [7]:
# The row at index 10473 has 12 fields while the header has 13 (the Category
# value is missing, shifting every later column), so it is dropped.
# The length check makes this cell safe to re-run: once the malformed row is
# gone, the row that takes its place has the full column count and is kept.
if len(google_dataset[10473]) != len(google_dataset[0]):
    del google_dataset[10473]

The dataset contains duplicate entries for some apps; the duplicates differ mainly in their review counts. The entry with the highest review count should be the most recent snapshot of the app's data.

In [8]:
# Show every Google Play row whose App column (index 0) is exactly "Instagram".
for row in google_dataset:
    if row[0] != "Instagram":
        continue
    print(row)

['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577446', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66577313', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']
['Instagram', 'SOCIAL', '4.5', '66509917', 'Varies with device', '1,000,000,000+', 'Free', '0', 'Teen', 'Social', 'July 31, 2018', 'Varies with device', 'Varies with device']


In [9]:
# Count how many Google Play rows repeat an app name already seen earlier.
# `uniques` keeps first-seen names in order; `duplicates` collects every
# repeated occurrence. Membership is tested against a set so the scan is
# linear in the number of rows instead of quadratic.
duplicates = []
uniques = []
seen_names = set()
for app in google_dataset[1:]:  # skip the header row
    name = app[0]
    if name in seen_names:
        duplicates.append(name)
    else:
        seen_names.add(name)
        uniques.append(name)
print("Number of duplicate apps: " + str(len(duplicates)))

Number of duplicate apps: 1181


The duplicate apps will not be removed indiscriminately. Instead, for each app we keep the entry with the largest number of reviews, since that entry should be the most recent snapshot of the app's data.


In [10]:
# Map each app name to the highest review count found among its rows.
reviews_max = {}
for app in google_dataset[1:]:  # skip the header row
    name, n_reviews = app[0], float(app[3])
    # Record the count the first time a name is seen, or when it beats the stored one.
    if name not in reviews_max or reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews

print(len(reviews_max))

9659


Now we can begin removing the duplicate entries


In [11]:
# Build the de-duplicated Google Play dataset: for each app name keep the
# first row whose review count equals that name's maximum in `reviews_max`.
android_clean = []  # the cleaned dataset
# Names already copied into android_clean. A set gives constant-time
# membership checks, so the whole pass is linear in the number of rows.
already_added = set()

for app in google_dataset[1:]:  # skip the header row
    name = app[0]
    n_reviews = float(app[3])

    # Several rows can tie on the maximum review count; the name check keeps
    # only the first of them.
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(app)
        already_added.add(name)

print(len(android_clean))

9659


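This project targets English-speaking users, but the datasets also contain apps aimed at other audiences. We will therefore use character codes to decide whether an app name is English: characters outside the ASCII range (code points above 127) are counted, and a name with more than three of them is treated as non-English. Allowing up to three such characters keeps names that merely contain an emoji or a symbol such as ™.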

In [12]:
def check_english(string, max_non_ascii=3):
    """Return True if ``string`` contains at most ``max_non_ascii`` non-ASCII characters.

    A character counts as non-ASCII when its code point is above 127. This is
    a heuristic for "the name is written in English": a few non-ASCII
    characters (emoji, trademark signs, ...) are tolerated.

    Args:
        string: The text to inspect, e.g. an app name.
        max_non_ascii: Largest number of non-ASCII characters still accepted.
            Defaults to 3, the threshold this notebook has always used.

    Returns:
        False as soon as more than ``max_non_ascii`` non-ASCII characters have
        been seen, True otherwise (including for the empty string).
    """
    non_eng = 0
    for char in string:
        if ord(char) > 127:
            non_eng += 1
            # Stop early: the verdict cannot change once the limit is exceeded.
            if non_eng > max_non_ascii:
                return False
    return True

We can now use this function to filter out the apps that have non-English names.


In [13]:
# Keep only the rows whose app name passes check_english.
# The name is column 0 in the Google Play rows and column 1 in the App Store
# rows; the App Store header row is skipped.
eng_android = [app for app in android_clean if check_english(app[0])]
eng_apple = [app for app in apple_dataset[1:] if check_english(app[1])]

print("English Android Apps:" + " " + str(len(eng_android)))
print("English Apple Apps" + " " + str(len(eng_apple)))

English Android Apps: 9614
English Apple Apps 6183


As the purpose of the project is to focus on free to download and install apps, the non-free apps will be removed from the datasets.

In [14]:
# The price is at index 7 in the Google Play rows and at index 4 in the App
# Store rows. Prices are still strings here, so the comparison is against the
# strings '0' and '0.0' respectively.
free_google = [app for app in eng_android if app[7] == '0']
free_apple = [app for app in eng_apple if app[4] == '0.0']

print(len(free_google))
print(len(free_apple))

8864
3222


To minimize risks and overhead, our validation strategy for an app idea has three steps:

1. Build a minimal Android version of the app, and add it to Google Play.
2. If the app has a good response from users, we develop it further.
3. If the app is profitable after six months, we build an iOS version of the app and add it to the App Store.

Because our end goal is to add the app on both Google Play and the App Store, we need to find app profiles that are successful in both markets. For instance, a profile that works well for both markets might be a productivity app that makes use of gamification.

In [15]:
def freq_table(dataset, index):
    """Return the percentage share of each distinct value found in one column.

    Args:
        dataset: An iterable of rows; each row must support ``row[index]``.
        index: Position of the column to tabulate.

    Returns:
        A dict mapping each distinct value of the column to the percentage of
        rows holding it. Keys appear in the order the values were first seen.
        An empty dataset yields an empty dict.
    """
    counts = {}
    n_rows = 0
    for record in dataset:
        n_rows += 1
        value = record[index]
        counts[value] = counts.get(value, 0) + 1

    return {value: (count / n_rows) * 100 for value, count in counts.items()}
    


        
            

In [16]:
def display_table(dataset, index):
    """Print the frequency table of one column, largest percentage first.

    Args:
        dataset: Rows to tabulate (passed straight to ``freq_table``).
        index: Position of the column to tabulate.

    Each line is printed as ``value : percentage``. Entries with equal
    percentages are ordered by value, descending, because the
    (percentage, value) pairs are sorted as tuples in reverse.
    """
    percentages = freq_table(dataset, index)
    ranked = [(share, label) for label, share in percentages.items()]
    ranked.sort(reverse=True)
    for share, label in ranked:
        print(label, ':', share)

# Print one frequency table per column of interest: Google Play Genres
# (index 9) and Category (index 1), then App Store prime_genre (index 11).
for heading, rows, column in (
    ("GOOGLE GENRE", free_google, 9),
    ("GOOGLE CATEGORY", free_google, 1),
    ("APPLE PRIME GENRE", free_apple, 11),
):
    print(heading)
    display_table(rows, column)

GOOGLE GENRE
Tools : 8.449909747292418
Entertainment : 6.069494584837545
Education : 5.347472924187725
Business : 4.591606498194946
Productivity : 3.892148014440433
Lifestyle : 3.892148014440433
Finance : 3.7003610108303246
Medical : 3.531137184115524
Sports : 3.463447653429603
Personalization : 3.3167870036101084
Communication : 3.2378158844765346
Action : 3.1024368231046933
Health & Fitness : 3.0798736462093865
Photography : 2.944494584837545
News & Magazines : 2.7978339350180503
Social : 2.6624548736462095
Travel & Local : 2.3240072202166067
Shopping : 2.2450361010830324
Books & Reference : 2.1435018050541514
Simulation : 2.0419675090252705
Dating : 1.861462093862816
Arcade : 1.8501805054151623
Video Players & Editors : 1.7712093862815883
Casual : 1.7599277978339352
Maps & Navigation : 1.3989169675090252
Food & Drink : 1.2409747292418771
Puzzle : 1.128158844765343
Racing : 0.9927797833935018
Role Playing : 0.9363718411552346
Libraries & Demo : 0.9363718411552346
Auto & Vehicles : 0.

# Apple Store Most Popular Genres

The data generated above shows the genres as well as their percentage proportions out of all apps. We will consider the most popular genres by computing the average installs per app genre - this can be done easily with the Play Store dataset. However, the Apple Store does not have a similar column, therefore the total number of ratings will be used as a proxy for this.

In [17]:
# Average number of user ratings per App Store genre.
# Column 11 is prime_genre and column 5 is rating_count_tot; the rating count
# stands in for installs, which the App Store data does not provide.
unique_apple_genres = freq_table(free_apple, 11)
genre_installs = []
for genre in unique_apple_genres:
    rating_sum = 0
    app_count = 0
    for app in free_apple:
        n_ratings = float(app[5])
        if app[11] != genre:
            continue
        rating_sum += n_ratings
        app_count += 1
    # Every genre comes from free_apple itself, so app_count is at least 1.
    genre_installs.append((rating_sum / app_count, genre))
print(sorted(genre_installs))

[(612.0, 'Medical'), (4004.0, 'Catalogs'), (7003.983050847458, 'Education'), (7491.117647058823, 'Business'), (14029.830708661417, 'Entertainment'), (16485.764705882353, 'Lifestyle'), (18684.456790123455, 'Utilities'), (21028.410714285714, 'Productivity'), (21248.023255813954, 'News'), (22788.6696905016, 'Games'), (23008.898550724636, 'Sports'), (23298.015384615384, 'Health & Fitness'), (26919.690476190477, 'Shopping'), (28243.8, 'Travel'), (28441.54375, 'Photo & Video'), (31467.944444444445, 'Finance'), (33333.92307692308, 'Food & Drink'), (39758.5, 'Book'), (52279.892857142855, 'Weather'), (57326.530303030304, 'Music'), (71548.34905660378, 'Social Networking'), (74942.11111111111, 'Reference'), (86090.33333333333, 'Navigation')]


A potential recommended app profile is a Navigation app; however, that average is most likely skewed by a small number of very popular Navigation apps.

# Play Store Most Popular Genres

In [21]:
# Average number of installs per Google Play category.
# Column 1 is Category and column 5 is Installs, stored as text such as
# '10,000+', so the separators and the plus sign are stripped before conversion.
unique_google = freq_table(free_google, 1)
category_installs = []
for category in unique_google:
    install_sum = 0
    app_count = 0
    for app in free_google:
        if app[1] != category:
            continue
        cleaned = app[5].replace(",", "").replace('+', "")
        install_sum += float(cleaned)
        app_count += 1

    # Every category comes from free_google itself, so app_count is at least 1.
    category_installs.append((install_sum / app_count, category))
print(sorted(category_installs))
    
            

[(120550.61980830671, 'MEDICAL'), (253542.22222222222, 'EVENTS'), (513151.88679245283, 'BEAUTY'), (542603.6206896552, 'PARENTING'), (638503.734939759, 'LIBRARIES_AND_DEMO'), (647317.8170731707, 'AUTO_AND_VEHICLES'), (817657.2727272727, 'COMICS'), (854028.8303030303, 'DATING'), (1331540.5616438356, 'HOUSE_AND_HOME'), (1387692.475609756, 'FINANCE'), (1437816.2687861272, 'LIFESTYLE'), (1712290.1474201474, 'BUSINESS'), (1833495.145631068, 'EDUCATION'), (1924897.7363636363, 'FOOD_AND_DRINK'), (1986335.0877192982, 'ART_AND_DESIGN'), (3638640.1428571427, 'SPORTS'), (3695641.8198090694, 'FAMILY'), (4056941.7741935486, 'MAPS_AND_NAVIGATION'), (4188821.9853479853, 'HEALTH_AND_FITNESS'), (5074486.197183099, 'WEATHER'), (5201482.6122448975, 'PERSONALIZATION'), (7036877.311557789, 'SHOPPING'), (8767811.894736841, 'BOOKS_AND_REFERENCE'), (9549178.467741935, 'NEWS_AND_MAGAZINES'), (10801391.298666667, 'TOOLS'), (11640705.88235294, 'ENTERTAINMENT'), (13984077.710144928, 'TRAVEL_AND_LOCAL'), (15588015.

Ba