# Analyzing Mobile App Data
An app building company wants to analyze 'free but in-app ads' apps that
generates more revenue. Revenue generation is tightly linked with the user 
interactions with the ads.
- some google apps have duplicate enteries, so the duplicates should be removed, and the function `unique_duplicate()` is responsible for this action.
- The list `goog_clean` is a list of lists of uniques google apps with maximum reviews
- We need to select high-performing apps in both App stores, and create 
a profile for each of those apps


In [42]:
from csv import reader
def open_file(filename):
    """ open and read a file as a list of lists"""
    opened_file = open(filename,encoding='utf8');
    read_file = reader(opened_file);
    read_file = list(read_file);
    opened_file.close();
    return read_file

         

In [43]:
def explore_data(dataset, start, end, rows_and_columns=False):
    """Print out a few rows for examination
    and also determine the numbers of rows and columns"""
    dataset_slice = dataset[start:end]    
    for row in dataset_slice:
        print(row)
        print('\n') # adds a new (empty) line after each row

    if rows_and_columns:
        print('Number of rows:', len(dataset))
        print('Number of columns:', len(dataset[0]))

In [44]:
apple_data = open_file('AppleStore.csv')
goog_data = open_file('googleplaystore.csv')
apple_header = apple_data[0]
goog_header = goog_data[0]

In [45]:
explore_data(goog_data, 0,5,  True)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']


['Photo Editor & Candy Camera & Grid & ScrapBook', 'ART_AND_DESIGN', '4.1', '159', '19M', '10,000+', 'Free', '0', 'Everyone', 'Art & Design', 'January 7, 2018', '1.0.0', '4.0.3 and up']


['Coloring book moana', 'ART_AND_DESIGN', '3.9', '967', '14M', '500,000+', 'Free', '0', 'Everyone', 'Art & Design;Pretend Play', 'January 15, 2018', '2.0.0', '4.0.3 and up']


['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']


['Sketch - Draw & Paint', 'ART_AND_DESIGN', '4.5', '215644', '25M', '50,000,000+', 'Free', '0', 'Teen', 'Art & Design', 'June 8, 2018', 'Varies with device', '4.2 and up']


Number of rows: 10842
Number of columns: 13


In [46]:
"""Data Cleaning: remove incorrect data"""
n_header = len(goog_data[0])
n_rows = len(goog_data)
print(f"number of rows {n_rows} and number of columns {n_header}") 
for index, row in zip(range(0,n_rows),goog_data):
    if len(row)!= n_header:
        print(index)


number of rows 10842 and number of columns 13
10473


In [47]:
#Remove row 10473
print(goog_data[10473])
del goog_data[10473]

['Life Made WI-Fi Touchscreen Photo Frame', '1.9', '19', '3.0M', '1,000+', 'Free', '0', 'Everyone', '', 'February 11, 2018', '1.0.19', '4.0 and up']


In [48]:
def unique_duplicate(dataset):
    """Determine the unique app names and those duplicates"""
    unique_rows = []
    duplicate_rows = []
    for row in dataset[1:]:
        if row[0] in unique_rows:
            duplicate_rows.append(row[0])
        else:
            unique_rows.append(row[0])
    return unique_rows, duplicate_rows


In [49]:
unique_goog_apps, duplicate_goog_apps = unique_duplicate(goog_data)
print(len(unique_goog_apps))
print(len(duplicate_goog_apps))

9659
1181


In [50]:
# select unique apps with the most reviews
reviews_max = {}
for row in goog_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    if name in reviews_max and reviews_max[name] < n_reviews:
        reviews_max[name] = n_reviews
    if name not in reviews_max:
        reviews_max[name] = n_reviews
# Check reviews_max's length
print(len(reviews_max))    

9659


In [51]:
# remove duplicates from google apps
goog_clean = []
already_added = []
for row in goog_data[1:]:
    name = row[0]
    n_reviews = float(row[3])
    if reviews_max[name] == n_reviews and name not in already_added:
        goog_clean.append(row)
        already_added.append(name)
# check length of goog_clean for completeness
print(len(goog_clean))


9659


In [52]:
# check for apps with non-English names or characters
# if an app name has 3 or more characters that fail english character test, remove the app
def foreign_characters(strings):
    check = 0
    for character in strings:
        if ord(character) > 127:
            check += True
    if check >=3:
        return False
    else:
        return True
        

In [53]:
# test foreign_characters()
str_chk = ['Instagram','Áà±Â•áËâ∫PPS -„ÄäÊ¨¢‰πêÈ¢Ç2„ÄãÁîµËßÜÂâßÁÉ≠Êí≠','Docs To Go‚Ñ¢ Free Office Suite','Instachat üòú']
for entry in str_chk:
    print(foreign_characters(entry))

True
False
True
True


In [54]:
# clean English apps
#goog_clean = goog_clean.insert(0,goog_header)
goog_eng_apps = []
apple_eng_apps = []
for row in goog_clean[1:]:
    if foreign_characters(row[0]):
        goog_eng_apps.append(row)
goog_eng_apps.insert(0,goog_header)        
for row in apple_data[1:]:
    if foreign_characters(row[0]):
        apple_eng_apps.append(row)
apple_eng_apps.insert(0,apple_header)

In [55]:
# check updated data
print(f"Number of Android Apps: {len(goog_eng_apps[1:])}")
print(f"Number of Apple Apps: {len(apple_eng_apps)}")

Number of Android Apps: 9596
Number of Apple Apps: 7198


In [56]:
# Remove priced apps: we need free apps only
print(goog_eng_apps[:2])
print(apple_eng_apps[:2])
# price indices: google [7] and apple [4]
print(goog_eng_apps[1][7])
print(apple_eng_apps[:2])

[['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver'], ['U Launcher Lite ‚Äì FREE Live Cool Themes, Hide Apps', 'ART_AND_DESIGN', '4.7', '87510', '8.7M', '5,000,000+', 'Free', '0', 'Everyone', 'Art & Design', 'August 1, 2018', '1.2.4', '4.0.3 and up']]
[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD', '0.0', '2974676', '212', '3.5', '3.5', '95.0', '4+', 'Social Networking', '37', '1', '29', '1']]
0
[['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic'], ['284882215', 'Facebook', '389879808', 'USD

In [57]:
#sort out the free apps
def get_free_app(dataset, index):
    free_app = []
    for row in dataset[1:]:
        price = row[index].replace('$', '') # remove dollar signs
        price = float(price)
        if price == 0.0:
            free_app.append(row)
    return free_app
# apply function and check
goog_eng_free_apps = get_free_app(goog_eng_apps, 7)
apple_eng_free_apps = get_free_app(apple_eng_apps, 4)

In [58]:
print(len(goog_eng_free_apps))
print(len(apple_eng_free_apps))

8847
4056


In [59]:
print(goog_header)
print(apple_header)

['App', 'Category', 'Rating', 'Reviews', 'Size', 'Installs', 'Type', 'Price', 'Content Rating', 'Genres', 'Last Updated', 'Current Ver', 'Android Ver']
['id', 'track_name', 'size_bytes', 'currency', 'price', 'rating_count_tot', 'rating_count_ver', 'user_rating', 'user_rating_ver', 'ver', 'cont_rating', 'prime_genre', 'sup_devices.num', 'ipadSc_urls.num', 'lang.num', 'vpp_lic']


In [60]:
def freq_table(dataset, index):
    table = {}
    col_total = 0
    for row in dataset[1:]:
        col_total += 1
        if row[index] in table:
            table[row[index]] += 1
        else:
            table[row[index]] = 1
    for key in table:
        table[key] = table[key] * 100 / col_total
    return table

In [61]:
def display_table(dataset, index):
    table = freq_table(dataset, index)
    table_display = []
    for key in table:
        key_val_as_tuple = (table[key], key)
        table_display.append(key_val_as_tuple)

    table_sorted = sorted(table_display, reverse = True)
    for entry in table_sorted:
        print(entry[1], ':', entry[0])

In [62]:
display_table(apple_eng_free_apps,11)

Games : 55.6596794081381
Entertainment : 8.236744759556103
Photo & Video : 4.118372379778052
Social Networking : 3.5018495684340323
Education : 3.255240443896424
Shopping : 2.9839704069050557
Utilities : 2.6880394574599262
Lifestyle : 2.318125770653514
Finance : 2.0715166461159065
Sports : 1.9482120838471024
Health & Fitness : 1.87422934648582
Music : 1.6522811344019728
Book : 1.627620221948212
Productivity : 1.528976572133169
News : 1.4303329223181258
Travel : 1.3810110974106042
Food & Drink : 1.060419235511714
Weather : 0.7644882860665845
Reference : 0.4932182490752158
Navigation : 0.4932182490752158
Business : 0.4932182490752158
Catalogs : 0.2219482120838471
Medical : 0.19728729963008632


# Analysis of frequency table for prime_genre column of App store data set
- Game apps dominate the free-apps in English Language
- The leading apps are fun-related apps, which means that users use the apps to de-stress
- General impression: any app that brings some form of happiness and pastime will get more patronage
- The number of apps per genre is a mere indication of the category of apps that is popular with the developers. A new developer might want to do something amazing the Medical genre, because it is not crowded yet.

In [63]:
display_table(goog_eng_free_apps,1)

FAMILY : 18.946416459416685
GAME : 9.699299118245534
TOOLS : 8.455799231290978
BUSINESS : 4.600949581731856
PRODUCTIVITY : 3.900067827266561
LIFESTYLE : 3.8887632828397014
FINANCE : 3.707890572009948
MEDICAL : 3.538322405607054
SPORTS : 3.391363328057879
PERSONALIZATION : 3.3235360614967218
COMMUNICATION : 3.2330997060818447
HEALTH_AND_FITNESS : 3.0861406285326702
PHOTOGRAPHY : 2.950486095410355
NEWS_AND_MAGAZINES : 2.8035270178611804
SOCIAL : 2.667872484738865
TRAVEL_AND_LOCAL : 2.3400406963599365
SHOPPING : 2.24960434094506
BOOKS_AND_REFERENCE : 2.136558896676464
DATING : 1.8652498304318337
VIDEO_PLAYERS : 1.797422563870676
MAPS_AND_NAVIGATION : 1.3904589645037304
FOOD_AND_DRINK : 1.2434998869545557
EDUCATION : 1.1643680759665385
ENTERTAINMENT : 0.9608862762830658
LIBRARIES_AND_DEMO : 0.9382771874293466
AUTO_AND_VEHICLES : 0.926972643002487
HOUSE_AND_HOME : 0.8026226543070314
WEATHER : 0.7913181098801718
EVENTS : 0.7121862988921547
PARENTING : 0.6556635767578567
ART_AND_DESIGN : 0.62

# Analysis of frequency table for category column of Google App store data set
- The family category top the table and it is followed by the Games.
- The productivity-related apps tends to gain some grounds among google apps in English
- In comparison with App stores, google users do not just use their apps to pastime, they are also productive with them.
- If recommendation must be done, it will be for a developer who is looking to create a niche in a category.

In [75]:
def app_rating_freq(dataset, index):
    rating_per_genre = {}
    total_per_genre = {}
    genre_index = 11
    for row in dataset[1:]:
        if row[genre_index] in rating_per_genre:
            rating_per_genre[row[genre_index]] += float(row[index])
            total_per_genre[row[genre_index]] += 1
        else:
            rating_per_genre[row[genre_index]] = float(row[index])
            total_per_genre[row[genre_index]] = 1
    for entry in rating_per_genre:
        rating_per_genre[entry] = rating_per_genre[entry] / total_per_genre[entry]
    return rating_per_genre

In [76]:
table = app_rating_freq(apple_eng_free_apps, 5)
table_list =sorted([(table[i],i) for i in table], reverse = True)
def display(t):
    for entry in t:
        print(entry[1], ":", entry[0])
display(table_list)

Reference : 67447.9
Music : 56482.02985074627
Weather : 47220.93548387097
Social Networking : 32503.563380281692
Photo & Video : 27249.892215568863
Navigation : 25972.05
Travel : 20216.01785714286
Food & Drink : 20179.093023255813
Sports : 20128.974683544304
Health & Fitness : 19952.315789473683
Productivity : 19053.887096774193
Games : 18924.68896765618
Shopping : 18746.677685950413
News : 15892.724137931034
Utilities : 14010.100917431193
Finance : 13522.261904761905
Entertainment : 10822.961077844311
Lifestyle : 8978.308510638299
Book : 8498.333333333334
Business : 6367.8
Education : 6266.333333333333
Catalogs : 1779.5555555555557
Medical : 459.75


# Apple store app recommendation
-There is not so much apps in the Reference genre and it is highly rated,
therefore, an new app in the genre with extra and attractive features
might be outstanding and have good market share.

In [83]:
def app_rating_freq(dataset, index, genre_index = 1):
    rating_per_genre = {}
    total_per_genre = {}
    #genre_index = 11
    for row in dataset[1:]:
        if row[genre_index] in rating_per_genre:
            row[index]=row[index].replace(',', '')
            row[index]=row[index].replace('+','')
            rating_per_genre[row[genre_index]] += float(row[index])
            total_per_genre[row[genre_index]] += 1
        else:
            row[index] = row[index].replace(',','')
            row[index] = row[index].replace('+','')
            rating_per_genre[row[genre_index]] = float(row[index])
            total_per_genre[row[genre_index]] = 1
    for entry in rating_per_genre:
        rating_per_genre[entry] = rating_per_genre[entry] / total_per_genre[entry]
    return rating_per_genre

In [84]:
 app_rating_freq(goog_eng_free_apps, 5, genre_index = 1)

{'ART_AND_DESIGN': 1967474.5454545454,
 'AUTO_AND_VEHICLES': 647317.8170731707,
 'BEAUTY': 513151.88679245283,
 'BOOKS_AND_REFERENCE': 8814199.78835979,
 'BUSINESS': 1712290.1474201474,
 'COMICS': 832613.8888888889,
 'COMMUNICATION': 38590581.08741259,
 'DATING': 854028.8303030303,
 'EDUCATION': 1833495.145631068,
 'ENTERTAINMENT': 11640705.88235294,
 'EVENTS': 253542.22222222222,
 'FINANCE': 1387692.475609756,
 'FOOD_AND_DRINK': 1924897.7363636363,
 'HEALTH_AND_FITNESS': 4188821.9853479853,
 'HOUSE_AND_HOME': 1360598.042253521,
 'LIBRARIES_AND_DEMO': 638503.734939759,
 'LIFESTYLE': 1446158.2238372094,
 'GAME': 15544014.51048951,
 'FAMILY': 3695641.8198090694,
 'MEDICAL': 120550.61980830671,
 'SOCIAL': 23253652.127118643,
 'SHOPPING': 7036877.311557789,
 'PHOTOGRAPHY': 17840110.40229885,
 'SPORTS': 3650602.276666667,
 'TRAVEL_AND_LOCAL': 13984077.710144928,
 'TOOLS': 10830251.970588235,
 'PERSONALIZATION': 5201482.6122448975,
 'PRODUCTIVITY': 16787331.344927534,
 'PARENTING': 542603.62

# Google App St
- I will be recommending an app in the art & design category because it has low number of apps in the category but it is highly downloaded.