# The purpose of this project is to refactor the code cells

In [1]:
import csv

### The Google Play data set ###
# newline='' is what the csv module documents for files handed to csv.reader:
# the reader then handles line endings itself, so newlines embedded inside
# quoted fields are not rewritten by universal-newline translation.
with open('googleplaystore.csv', encoding='utf-8', newline='') as file:
    android = list(csv.reader(file))
android_header = android[0]  # first row is kept separately as the header
android = android[1:]        # every remaining row is an app record

### The App Store data set ###
with open('AppleStore.csv', encoding='utf-8', newline='') as file:
    ios = list(csv.reader(file))
ios_header = ios[0]  # first row is kept separately as the header
ios = ios[1:]        # every remaining row is an app record

In [None]:
def explore_data(dataset, start, end, show_rows_and_columns=False):
    """Print a slice of ``dataset`` and, optionally, its dimensions.

    Args:
        dataset: Sequence of rows (each row itself a sequence), without
            a header row.
        start: Slice start index of the first row to print.
        end: Slice end index (exclusive) of the rows to print.
        show_rows_and_columns: When true, also print the total number of
            rows and the number of columns. The column count is taken
            from the first row; an empty dataset reports 0 columns.
    """
    for row in dataset[start:end]:
        print(row)
        print()  # adds a new (empty) line between rows

    if show_rows_and_columns:
        num_rows = len(dataset)
        # An empty dataset has no first row to measure, so report 0 columns
        # instead of failing with IndexError on dataset[0].
        num_cols = len(dataset[0]) if num_rows else 0
        print(f'Number of rows: {num_rows}')
        print(f'Number of columns: {num_cols}')

# Show the android column names, followed by one blank line
print(android_header, end='\n\n')

# Preview the first three android rows together with the dataset dimensions
explore_data(android, start=0, end=3, show_rows_and_columns=True)


In [None]:
# Show the iOS column names, a blank line, then the first three rows
# together with the dataset dimensions.
print(ios_header, end='\n\n')
explore_data(ios, start=0, end=3, show_rows_and_columns=True)

In [None]:
# Print, separated by blank lines: the incorrect row, the header, and a
# correct row for comparison.
print(android[10472], android_header, android[0], sep='\n\n')

In [None]:
# Report the row count before and after dropping the incorrect row.
print(len(android))
android.pop(10472)  # removes one row on every execution: don't run this more than once
print(len(android))

In [None]:
# Print every android row whose first column (the app name) is 'Instagram'
instagram_rows = (row for row in android if row[0] == 'Instagram')
for row in instagram_rows:
    print(row)

In [None]:
unique_apps = set()     # every app name seen at least once
duplicate_apps = set()  # every app name seen more than once

for app in android:
    name = app[0]
    if name not in unique_apps:
        # First sighting of this name
        unique_apps.add(name)
        continue
    duplicate_apps.add(name)

# duplicate_apps is a set, so each duplicated name is counted once no matter
# how many extra rows it has.
print('Number of duplicate apps:', len(duplicate_apps))
print('\n')
print('Examples of duplicate apps:', list(duplicate_apps)[:15])


In [8]:
# Map each app name to the highest value found in column 3 across all of
# its rows.
reviews_max = {}

for app in android:
    name = app[0]
    n_reviews = float(app[3])
    # A name seen for the first time falls back to its own value, so the
    # entry is created; otherwise the larger of the two values is kept.
    reviews_max[name] = max(reviews_max.get(name, n_reviews), n_reviews)

In [None]:
# NOTE(review): 1181 is a hard-coded constant, presumably the number of
# surplus duplicate rows found during exploration. Confirm against the data.
expected_length = len(android) - 1181
actual_length = len(reviews_max)

print('Expected length:', expected_length)
print('Actual length:', actual_length)

In [10]:
# Keep one row per app name: the first row whose column-3 value equals the
# maximum recorded for that name in reviews_max.
android_clean = []
already_added = set()

for app in android:
    name = app[0]
    n_reviews = float(app[3])

    # Skip rows that do not carry the maximum, and rows for names that have
    # already been kept (several rows can tie on the maximum).
    if reviews_max[name] != n_reviews or name in already_added:
        continue

    android_clean.append(app)
    already_added.add(name)

In [None]:
# Preview the first three de-duplicated rows plus the dataset dimensions
explore_data(android_clean, start=0, end=3, show_rows_and_columns=True)

In [None]:
# Print column 1 of two hand-picked iOS rows
for row_index in (813, 6731):
    print(ios[row_index][1])

# Print column 0 of two hand-picked android rows
for row_index in (4412, 7940):
    print(android_clean[row_index][0])

In [None]:
def is_english(string):
    """Return True when every character of ``string`` is ASCII.

    A character counts as ASCII when its code point is at most 127. A
    single character above that range makes the result False; an empty
    string yields True.
    """
    return all(ord(character) <= 127 for character in string)

# Try the check on a plain ASCII name and on a name containing non-ASCII text
for sample_name in ('Instagram', '爱奇艺PPS -《欢乐颂2》电视剧热播'):
    print(is_english(sample_name))

In [None]:
# Names that are otherwise ASCII but contain one symbol or emoji
for sample_name in ('Docs To Go™ Free Office Suite', 'Instachat 😜'):
    print(is_english(sample_name))

# Code points of the two offending characters
for symbol in ('™', '😜'):
    print(ord(symbol))

In [None]:
import unicodedata

def is_english(string):
    """Return True unless ``string`` contains a non-ASCII letter.

    ASCII characters of any kind (letters, digits, spaces, punctuation)
    are always accepted. Non-ASCII characters are accepted only when they
    are not letters, so symbols such as the trademark sign or emoji do not
    disqualify a name. A non-ASCII character whose Unicode general category
    starts with 'L' (a letter) makes the result False.

    Note that accented Latin letters are non-ASCII letters too, so a name
    containing one is reported as non-English.

    Args:
        string: The text to check.

    Returns:
        bool: False if any character is a non-ASCII letter, True otherwise
        (including for the empty string).
    """
    for character in string:
        # Both conditions must hold to reject. Using `or` / `!= 'L'` here
        # rejected every space, digit and punctuation mark, and therefore
        # nearly every multi-word name.
        if ord(character) > 127 and unicodedata.category(character)[0] == 'L':
            return False

    return True

# Re-run the check on the two names that contain a symbol or an emoji
for sample_name in ('Docs To Go™ Free Office Suite', 'Instachat 😜'):
    print(is_english(sample_name))

In [None]:
# Keep only the rows whose name passes is_english. The name is read from
# column 0 of the android rows and from column 1 of the iOS rows.
android_english = list(filter(lambda row: is_english(row[0]), android_clean))
ios_english = list(filter(lambda row: is_english(row[1]), ios))

# Preview both filtered datasets, separated by blank lines
explore_data(android_english, start=0, end=3, show_rows_and_columns=True)
print('\n')
explore_data(ios_english, start=0, end=3, show_rows_and_columns=True)


In [None]:
# Keep the rows whose price field matches the exact string used for "free".
# NOTE(review): columns 7 (android) and 4 (iOS) are presumably the price
# columns. Confirm against the headers.
android_final = [
    row
    for row in android_english
    if row[7] == '0'
]
ios_final = [
    row
    for row in ios_english
    if row[4] == '0.0'
]

# Number of remaining rows: android first, then iOS
for remaining_rows in (android_final, ios_final):
    print(len(remaining_rows))

In [18]:
def freq_table(dataset, index):
    """Build a percentage frequency table for one column of ``dataset``.

    Args:
        dataset: Iterable of rows; each row must support ``row[index]``.
        index: Position of the column to tabulate.

    Returns:
        dict: Maps each distinct value found in the column to the
        percentage (0-100) of rows holding it, in order of first
        appearance. Empty when ``dataset`` has no rows.
    """
    counts = {}
    n_rows = 0

    for row in dataset:
        n_rows += 1
        value = row[index]
        counts[value] = counts.get(value, 0) + 1

    return {value: (count / n_rows) * 100 for value, count in counts.items()}


def display_table(dataset, index):
    """Print the frequency table of one column, highest percentage first.

    The percentages come from ``freq_table(dataset, index)``. Entries are
    sorted in descending order as ``(percentage, value)`` pairs, so equal
    percentages are ordered by value, also descending. Each entry is
    printed as ``value : percentage``.
    """
    percentages = freq_table(dataset, index)
    ranked = sorted(
        ((percentage, value) for value, percentage in percentages.items()),
        reverse=True,
    )
    for percentage, value in ranked:
        print(value, ':', percentage)

In [None]:
# Column -5 (fifth from the end) is the one this notebook treats as the
# iOS genre.
display_table(dataset=ios_final, index=-5)

In [None]:
# Frequency table of the Category column
display_table(dataset=android_final, index=1)

In [None]:
# NOTE(review): column -4 (fourth from the end) is presumably the Genres
# column. Confirm against android_header.
display_table(dataset=android_final, index=-4)

In [None]:
genres_ios = freq_table(ios_final, -5)

# Accumulate, per genre, the sum of column 5 (number of ratings) and the
# number of apps, walking the dataset a single time.
ratings_sum = {genre: 0 for genre in genres_ios}
apps_count = {genre: 0 for genre in genres_ios}

for app in ios_final:
    app_genre = app[-5]
    ratings_sum[app_genre] += float(app[5])
    apps_count[app_genre] += 1

# Report the average per genre, in the order the genres first appear
for genre in genres_ios:
    avg_n_ratings = ratings_sum[genre] / apps_count[genre]
    print(f"{genre}: {avg_n_ratings:.2f}")

In [None]:
# Print name and number of ratings for every app in the 'Navigation' genre
for app in ios_final:
    if app[-5] != 'Navigation':
        continue
    print(app[1], ':', app[5])

In [None]:
# Print columns 1 and 5 for every app in the 'Reference' genre
for app in ios_final:
    if app[-5] != 'Reference':
        continue
    print(app[1], ':', app[5])

In [None]:
# Frequency table of the Installs column
display_table(dataset=android_final, index=5)

In [None]:
categories_android = freq_table(android_final, 1)

# Accumulate, per category, the sum of installs and the number of apps,
# walking the dataset a single time.
installs_sum = {category: 0 for category in categories_android}
apps_count = {category: 0 for category in categories_android}

for app in android_final:
    category_app = app[1]
    # Install counts are strings such as '1,000,000+': strip the thousands
    # separators and the trailing plus sign before converting.
    n_installs = app[5].replace(',', '').replace('+', '')
    installs_sum[category_app] += float(n_installs)
    apps_count[category_app] += 1

# Report the average per category, in the order the categories first appear
for category in categories_android:
    avg_n_installs = installs_sum[category] / apps_count[category]
    print(category, ':', avg_n_installs)

In [None]:
# COMMUNICATION apps that fall in one of the three install buckets below
top_install_buckets = ('1,000,000,000+', '500,000,000+', '100,000,000+')

for app in android_final:
    if app[1] == 'COMMUNICATION' and app[5] in top_install_buckets:
        print(app[0], ':', app[5])

In [None]:
# Install counts of the COMMUNICATION apps with fewer than 100 million installs
under_100_m = []

for app in android_final:
    if app[1] != 'COMMUNICATION':
        continue
    # Strip the thousands separators and the trailing plus sign
    installs = float(app[5].replace(',', '').replace('+', ''))
    if installs < 100_000_000:
        under_100_m.append(installs)

# Average installs among those apps (a bare expression, shown as cell output)
sum(under_100_m) / len(under_100_m)

In [None]:
# Print columns 0 and 5 for every app in the BOOKS_AND_REFERENCE category
books_and_reference = (row for row in android_final if row[1] == 'BOOKS_AND_REFERENCE')
for row in books_and_reference:
    print(row[0], ':', row[5])

In [None]:
# BOOKS_AND_REFERENCE apps that fall in one of the three install buckets below
top_install_buckets = ('1,000,000,000+', '500,000,000+', '100,000,000+')

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and app[5] in top_install_buckets:
        print(app[0], ':', app[5])

In [None]:
# BOOKS_AND_REFERENCE apps that fall in one of the four install buckets below
mid_install_buckets = ('1,000,000+', '5,000,000+', '10,000,000+', '50,000,000+')

for app in android_final:
    if app[1] == 'BOOKS_AND_REFERENCE' and app[5] in mid_install_buckets:
        print(app[0], ':', app[5])