In [None]:
import pandas as pd
import numpy as np
import re
from string import *

# Fetch the two project CSV files straight from their raw GitHub URLs.
play_store = pd.read_csv("https://raw.githubusercontent.com/gdv/foundationsCS-2018/master/ex-data/project/googleplaystore.csv")
play_store_reviews = pd.read_csv("https://raw.githubusercontent.com/gdv/foundationsCS-2018/master/ex-data/project/googleplaystore_user_reviews.csv")

# Remove every row whose Category is the literal string '1.9'.
bad_category_rows = play_store.index[play_store.Category == '1.9']
play_store = play_store.drop(bad_category_rows)

# 1. Convert the app sizes to a number

In [None]:
# A digit immediately followed by the unit letter marks a megabyte ('M')
# or kilobyte ('k') size.
m = re.compile('[0-9]M')
k = re.compile('[0-9]k')

sizes = list(play_store['Size'])
size_len = len(sizes)

for pos, raw_size in enumerate(sizes):
    if m.search(raw_size) is not None:
        # 'xxM' pattern found: the text before 'M' is a count of millions.
        sizes[pos] = float(raw_size.split('M')[0]) * 1000000
    elif k.search(raw_size) is not None:
        # 'xxk' pattern found: the text before 'k' is a count of thousands.
        sizes[pos] = float(raw_size.split('k')[0]) * 1000
    # Any other value is left exactly as it was.

play_store['Size'] = sizes

# 2. Convert the number of installs to a number

In [None]:
# Working copy of the Installs column as a plain list, plus its length.
inst = play_store['Installs'].tolist()
inst_len = len(inst)

In [None]:
# Phase 1: keep the text before the first '+' and remove the thousands commas.
cleaned_installs = [raw.split('+')[0].replace(',', '') for raw in inst[:inst_len]]

# Phase 2: parse the cleaned text as integers, updating the list in place.
inst[:inst_len] = [int(text) for text in cleaned_installs]

play_store["Installs"] = inst

# 3. Transform “Varies with device” into a missing value

In [None]:
# Use a real missing value (np.nan) rather than the text 'NaN': only np.nan is
# recognised by isna()/dropna() and skipped by numeric aggregations.
play_store = play_store.replace('Varies with device', np.nan)

# 4. Convert Current Ver and Android Ver into a dotted number (e.g. 4.0.3 or 4.2)

In [None]:
# A dotted number: one or more digits followed by at least one ".digits" group.
# Allowing several digits per component keeps versions such as "10.2.3" or
# "2.3.15" intact instead of cutting them down to "0.2.3" / "2.3.1".
version_pattern = re.compile(r'[0-9]+(?:[.][0-9]+)+')


def _dotted_version(value):
    """Return the first dotted number found in ``value``.

    ``value`` is returned unchanged when it contains no dotted number
    (for example a missing value or free text without a version).
    """
    found = version_pattern.search(str(value))
    return found.group(0) if found is not None else value


cur_ver = [_dotted_version(value) for value in play_store['Current Ver']]
andr_ver = [_dotted_version(value) for value in play_store['Android Ver']]

play_store['Current Ver'] = cur_ver
play_store['Android Ver'] = andr_ver

# 5. Remove the duplicates

In [None]:
# Number of rows that are exact copies of an earlier row. It is printed
# explicitly because a bare expression in the middle of a cell shows nothing.
n_duplicates = play_store.duplicated().sum()
print("Fully duplicated rows:", n_duplicates)

# Drop the rows that are identical in every column, keeping the first occurrence.
play_store = play_store.drop_duplicates()

In [None]:
# Keep one row per app: the one with the highest review count.
# Reviews is not converted to a number anywhere above, so if it was read as
# text a direct sort would compare strings ('99' > '1000'). Ranking on a
# numeric copy makes the comparison numeric whatever the column dtype is;
# values that cannot be parsed become NaN and are sorted last.
review_counts = pd.to_numeric(play_store['Reviews'], errors='coerce')
play_store = (
    play_store
    .assign(_review_count=review_counts)
    .sort_values(by=['App', '_review_count'], ascending=False)
    .drop_duplicates('App', keep='first')
    .drop(columns='_review_count')  # helper column is only needed for ranking
    .sort_index()                   # restore the original row order
    .reset_index(drop=True)
)

# 6. For each category, compute the number of apps

In [None]:
# Count the non-missing App values within each Category.
play_store.groupby("Category")[["App"]].count()

# 7. For each category, compute the average rating

In [None]:
# Mean Rating within each Category.
play_store.groupby("Category")[["Rating"]].mean()

# 8. Create two dataframes: one for the genres and one bridging apps and genres. So that, for instance, the app Pixel Draw - Number Art Coloring Book appears twice in the bridging table, once for Art & Design, once for Creativity

In [None]:
# Build the genres dataframe.
# Every ';'-separated part of a Genres entry is a genre of its own, however
# many parts there are. Missing entries are skipped, and the result is sorted
# so that the genre order is the same on every run.
genres = sorted({
    genre_name
    for genre_entry in play_store['Genres'].dropna()
    for genre_name in genre_entry.split(';')
})
genres_len = len(genres)
genres_df = pd.DataFrame(genres, columns = ['Genre'])


# Build the bridging dataframe between apps and genres: one row per
# (app, genre) pair, so an app with several genres appears several times.
# All rows are collected first and the frame is built once; growing it with
# DataFrame.append is quadratic and no longer exists in pandas 2.x.
# The columns are read by name; the original positional lookups (0 and 9)
# are assumed to refer to App and Genres.
bridge_rows = [
    (app_name, genre_name)
    for app_name, genre_entry in zip(play_store['App'], play_store['Genres'])
    for genre_name in genre_entry.split(";")
]
apps_genres_df = pd.DataFrame(bridge_rows, columns=["App", "Genre"])


# 9. For each genre, create a new column of the original dataframe. The new columns must have boolean values (True if the app has a given genre)

In [None]:
# One boolean column per genre, initialised to False for every app.
# The frame is created in a single call (instead of one concat per genre) and
# shares play_store's index, so the join below lines up row for row.
bool_genres = pd.DataFrame(
    False,
    index=play_store.index,
    columns=[str(genre_name) for genre_name in genres],
)

play_store = play_store.join(bool_genres, sort = False)


In [None]:
# Kept so that warning behaviour for the rest of the notebook is unchanged;
# the assignments in this cell no longer rely on it.
pd.options.mode.chained_assignment = None  # default='warn'

# Split each Genres entry once, then fill every genre column with a whole-column
# assignment. Chained indexing (frame[col][row] = value) may write to a
# temporary copy and is therefore avoided.
genre_parts = play_store['Genres'].str.split(';')
for genre_name in genres:
    # True where the app lists this genre; a missing Genres entry yields False.
    play_store[str(genre_name)] = genre_parts.apply(
        lambda parts, wanted=genre_name: isinstance(parts, list) and wanted in parts
    )

# 10. For each genre, compute the average rating. What is the genre with highest average?

In [None]:
# Attach each app's rating to its (app, genre) rows by matching on the app
# name. The bridge table has its own row numbering (apps with several genres
# take several rows), so joining on the index would pair rows with the
# ratings of unrelated apps.
# The result is sorted so that the genre with the highest average comes first.
(
    apps_genres_df
    .merge(play_store[["App", "Rating"]], on="App", how="left")
    .groupby("Genre")[["Rating"]]
    .mean()
    .sort_values("Rating", ascending=False)
)

# 11. For each app, compute the approximate income, obtained as the product of the number of installs and the price

In [None]:
# Price is text such as '$4.99': remove the '$' signs and parse the rest as a float.
price_value = play_store["Price"].map(lambda price: float(price.replace('$', '')))

# Approximate income = number of installs * price, computed for the whole
# column at once instead of filling an empty Series through chained
# assignment, which is not guaranteed to write into the frame.
play_store["Income"] = play_store["Installs"] * price_value

# 12. For each app, compute its minimum and maximum Sentiment_Polarity

In [None]:
# Lowest Sentiment_Polarity recorded for each app.
play_store_reviews.groupby('App')[["Sentiment_Polarity"]].min()

In [None]:
# Highest Sentiment_Polarity recorded for each app.
play_store_reviews.groupby('App')[["Sentiment_Polarity"]].max()