# Data analysis in pure Python

- Free iOS and Android apps: analysis of in-app add revenue

In [1]:
# read in datasets

from csv import reader


#ios
open_file = open('AppleStore.csv')
read_file = reader(open_file)
ios = list(read_file)

open_file.close()

#android
open_file = open('googleplaystore.csv')
read_file = reader(open_file)
android = list(read_file)

open_file.close()

In [2]:
# helper function to explore datasets

def explore_data(dataset, start, end, rows_and_columns=False):
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [3]:
print('\niOS:\n')
explore_data(ios,1,5,rows_and_columns=True)
print('\nAndroid:\n')
explore_data(android,1,5,rows_and_columns=True)


iOS:

['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']


['389801252', 'Instagram', '113954816', 'USD', '0.0', '2161558', '1289', '4.5', '4.0', '10.23', '12+', 'Photo & Video', '37', '0', '29', '1']


['529479190', 'Clash of Clans', '116476928', 'USD', '0.0', '2130805', '579', '4.5', '4.5', '9.24.12', '9+', 'Games', '38', '5', '18', '1']


['420009108', 'Temple Run', '65921024', 'USD', '0.0', '1724546', '3842', '4.5', '4.0', '1.6.2', '9+', 'Games', '40', '5', '1', '1']


Number of rows: 7198
Number of columns: 16

Android:

['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher

Android: https://www.kaggle.com/lava18/google-play-store-apps/home
iOS: https://www.kaggle.com/ramamet4/app-store-apple-data-set-10k-apps/home

In [4]:
explore_data(ios,0,1)
explore_data(android,0,1)

['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']




In [5]:
columns = len(android[0])
for row in android:
    if len(row)!= columns:
        print(row)
        print('\n')
        print(android.index(row))
        to_remove = android.index(row)
       


['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


10473


In [6]:
del android[to_remove]

In [7]:
# find duplicates

unique = []
duplicate=[]

for app in android[1:]:
    name = app[0]
    if name in unique:
        duplicate.append(name)
    else:
        unique.append(name)
 
print(str(len(duplicate)) + ' duplicate apps')
print('\n')
print(duplicate[:5])
print('\n')

# print all duplicate rows

# for i in android[1:]:
#    if i[0] in duplicate:
#        print(i)
#        print('\n')    


1181 duplicate apps


['Quick PDF Scanner + OCR FREE', 'Box', 'Google My Business', 'ZOOM Cloud Meetings', 'join.me - Simple Meetings']




In [8]:
# remove duplicate rows

# create dict with {app:max(n_reviews)}
reviews_max = {}

for row in android[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    if name not in reviews_max:
        reviews_max[name] = n_reviews

print(len(reviews_max))

# clean dataset
# keep only rows where, for each app, n_reviews == reviews_max

android_clean = []
already_added = []

for row in android[1:]:
    name = row[0]
    n_reviews = float(row[3])
    
    if n_reviews == reviews_max[name] and name not in already_added:
        android_clean.append(row)
        already_added.append(name)
        
print(len(android_clean))

9659
9659


In [9]:
# Helper function to detect non-english characters

def is_english(s):
    count = 0
    for char in s:
        if ord(char) > 127:
            count +=1
    if count > 3:
        return False
    else:
        return True

print(is_english('Instagram'))
print(is_english('爱奇艺PPS -《欢乐颂2》电视剧热播'))
print(is_english('Docs To Go™ Free Office Suite'))
print(is_english('Instachat 😜'))


True
False
True
True


In [10]:
ios_en = []

for row in ios[1:]:
    app = row[1]
    if(is_english(app) == True):
        ios_en.append(row)
        
android_en = []

for row in android_clean[1:]:
    app = row[1]
    if(is_english(app) == True):
        android_en.append(row)

print('iOS EN: ' + str(len(ios_en)))
print('Android EN: ' + str(len(android_en)))

iOS EN: 6183
Android EN: 9658


In [11]:
# Isolate free apps

android_free = []
iOS_free = []

for row in android_en:
    price = row[6]
    if price == 'Free':
        android_free.append(row)

for row in ios_en:
    price = float(row[4])
    if price == 0:
        iOS_free.append(row)
        
print('iOS free: ' + str(len(iOS_free)))
print('Android free: ' + str(len(android_free)))     
        

iOS free: 3222
Android free: 8903


In [12]:
# Create frequency tables for genres / categories

def freq_table(dataset,index):
    freq={}
    for row in dataset:
        val = row[index]
        if val in freq:
            freq[val] +=1
        else:
            freq[val] = 1

    return freq
            
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])
        
print(display_table(android_free,1))
print('\n')
print(display_table(android_free,9))
print('\n')
print(display_table(iOS_free,11))

FAMILY : 1689
GAME : 864
TOOLS : 751
BUSINESS : 408
LIFESTYLE : 350
PRODUCTIVITY : 346
FINANCE : 328
MEDICAL : 313
SPORTS : 301
PERSONALIZATION : 295
COMMUNICATION : 288
HEALTH_AND_FITNESS : 273
PHOTOGRAPHY : 262
NEWS_AND_MAGAZINES : 252
SOCIAL : 236
TRAVEL_AND_LOCAL : 207
SHOPPING : 200
BOOKS_AND_REFERENCE : 194
DATING : 165
VIDEO_PLAYERS : 160
MAPS_AND_NAVIGATION : 126
FOOD_AND_DRINK : 110
EDUCATION : 104
ENTERTAINMENT : 85
LIBRARIES_AND_DEMO : 83
AUTO_AND_VEHICLES : 82
HOUSE_AND_HOME : 73
WEATHER : 71
EVENTS : 63
PARENTING : 58
ART_AND_DESIGN : 57
COMICS : 56
BEAUTY : 53
None


Tools : 750
Entertainment : 542
Education : 480
Business : 408
Lifestyle : 349
Productivity : 346
Finance : 328
Medical : 313
Sports : 307
Personalization : 295
Communication : 288
Action : 275
Health & Fitness : 273
Photography : 262
News & Magazines : 252
Social : 236
Travel & Local : 206
Shopping : 200
Books & Reference : 194
Simulation : 184
Dating : 165
Arcade : 164
Video Players & Editors : 158
Casual :

In [18]:
# Get avg number of user ratings (proxy for installs) by genre for iOS free apps

unique_genre = freq_table(iOS_free,11)

for genre in unique_genre:
    total = 0
    len_genre = 0
    
    for row in iOS_free:
        if row[11] == genre:
            len_genre +=1
            total += float(row[5])
    
    avg_rating = total / len_genre
    print('App genre: ' + genre + ' ' + str(avg_rating))

App genre: Productivity 21028.410714285714
App genre: Utilities 18684.456790123455
App genre: Weather 52279.892857142855
App genre: Health & Fitness 23298.015384615384
App genre: Education 7003.983050847458
App genre: Reference 74942.11111111111
App genre: Finance 31467.944444444445
App genre: Photo & Video 28441.54375
App genre: Sports 23008.898550724636
App genre: Games 22788.6696905016
App genre: Shopping 26919.690476190477
App genre: Business 7491.117647058823
App genre: Entertainment 14029.830708661417
App genre: Social Networking 71548.34905660378
App genre: Navigation 86090.33333333333
App genre: Catalogs 4004.0
App genre: News 21248.023255813954
App genre: Medical 612.0
App genre: Travel 28243.8
App genre: Food & Drink 33333.92307692308
App genre: Music 57326.530303030304
App genre: Book 39758.5
App genre: Lifestyle 16485.764705882353


In [32]:
# Get avg number of installs by category for free Android apps

android_unique_cat = freq_table(android_free,1)
#print(android_unique_cat.keys())

for category in android_unique_cat:
    total = 0
    len_category = 0
    for row in android_free:
        category_app = row[1]
        if category_app == category:
            installs = row[5]
            installs = installs.replace('+','')
            installs = installs.replace(',','')
            installs = float(installs)
            total += installs
            len_category +=1
    
    avg_installs = total / len_category
    
    print('Category: ' + category)
    print('Category Size: ' + str(len_category))
    print('Category Avg Installs: ' + str(installs))
    print('\n')

Category: PRODUCTIVITY
Category Size: 346
Category Avg Installs: 10.0


Category: SHOPPING
Category Size: 200
Category Avg Installs: 1000000.0


Category: LIFESTYLE
Category Size: 350
Category Avg Installs: 10000000.0


Category: SOCIAL
Category Size: 236
Category Avg Installs: 5000000.0


Category: BOOKS_AND_REFERENCE
Category Size: 194
Category Avg Installs: 1000.0


Category: SPORTS
Category Size: 301
Category Avg Installs: 1.0


Category: PERSONALIZATION
Category Size: 295
Category Avg Installs: 500.0


Category: BUSINESS
Category Size: 408
Category Avg Installs: 10.0


Category: FINANCE
Category Size: 328
Category Avg Installs: 10000.0


Category: VIDEO_PLAYERS
Category Size: 160
Category Avg Installs: 100.0


Category: BEAUTY
Category Size: 53
Category Avg Installs: 100000.0


Category: LIBRARIES_AND_DEMO
Category Size: 83
Category Avg Installs: 10000000.0


Category: MEDICAL
Category Size: 313
Category Avg Installs: 1000.0


Category: EVENTS
Category Size: 63
Category Avg Instal