# Pandas Deep-Dive


In [46]:
import pandas as pd
import numpy as np

In [47]:
# Load the App Store dataset; the path is relative to the notebook's working directory.
apps = pd.read_csv("../data/appleStore.csv")

In [48]:
# Peek at the first five records to check how the file was parsed.
apps.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,prime_genre
0,0,281656475,PAC-MAN Premium,100788224,3.99,21292,26,4.0,Games
1,1,281796108,Evernote - stay organized,158578688,0.0,161065,26,4.0,Productivity
2,2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.0,188583,2822,3.5,Weather
3,3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,0.0,262241,649,4.0,Shopping
4,4,282935706,Bible,92774400,0.0,985920,5320,4.5,Reference


In [49]:
# (number of rows, number of columns) of the loaded frame.
apps.shape

(7197, 9)

In [50]:
# Per-column dtypes and non-null counts, plus the frame's memory usage.
apps.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7197 entries, 0 to 7196
Data columns (total 9 columns):
Unnamed: 0          7197 non-null int64
id                  7197 non-null int64
track_name          7197 non-null object
size_bytes          7197 non-null int64
price               7197 non-null float64
rating_count_tot    7197 non-null int64
rating_count_ver    7197 non-null int64
user_rating         7197 non-null float64
prime_genre         7197 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 506.1+ KB


In [51]:
# Number of rows that exactly repeat an earlier row across every column.
apps.duplicated(keep="first").sum()

0

In [52]:
# Distinct app names; a value below the row count means some names repeat.
apps["track_name"].nunique()

7195

In [53]:
# keep=False flags every occurrence of a repeated name, not only the later ones,
# so all rows sharing a name are listed together.
apps.loc[apps["track_name"].duplicated(keep=False)]

Unnamed: 0.1,Unnamed: 0,id,track_name,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,prime_genre
3319,3319,952877179,VR Roller Coaster,169523200,0.0,107,102,3.5,Games
5603,5603,1089824278,VR Roller Coaster,240964608,0.0,67,44,3.5,Games
7092,7092,1173990889,Mannequin Challenge,109705216,0.0,668,87,3.0,Games
7128,7128,1178454060,Mannequin Challenge,59572224,0.0,105,58,4.0,Games


In [54]:
# Remove the leftover "Unnamed: 0" column that came in with the CSV.
# Rebinding the result (instead of inplace=True) together with errors="ignore"
# keeps this cell safe to re-run: a second run no longer raises KeyError.
apps = apps.drop(columns=["Unnamed: 0"], errors="ignore")

In [55]:
# Number of rows whose id repeats an earlier id.
apps["id"].duplicated().sum()

0

In [56]:
# Re-check the first rows after the "Unnamed: 0" column was dropped.
apps.head()

Unnamed: 0,id,track_name,size_bytes,price,rating_count_tot,rating_count_ver,user_rating,prime_genre
0,281656475,PAC-MAN Premium,100788224,3.99,21292,26,4.0,Games
1,281796108,Evernote - stay organized,158578688,0.0,161065,26,4.0,Productivity
2,281940292,"WeatherBug - Local Weather, Radar, Maps, Alerts",100524032,0.0,188583,2822,3.5,Weather
3,282614216,"eBay: Best App to Buy, Sell, Save! Online Shop...",128512000,0.0,262241,649,4.0,Shopping
4,282935706,Bible,92774400,0.0,985920,5320,4.5,Reference


### What is the average rating of all apps? 


In [57]:
# Mean of the user_rating column over every app in the frame.
apps["user_rating"].mean()

3.526955675976101

### How many apps have an average rating no less than 4?

First approach: filter the rows, then count them.

In [58]:
# Rows rated 4 or higher (the comparison is inclusive despite the variable name).
user_rating_above_4 = apps.loc[apps["user_rating"] >= 4]

In [59]:
# Row count of the filtered frame.
user_rating_above_4.shape[0]

4781

Second, more direct approach: filter and count in a single expression.

In [60]:
# Select the id column for apps rated 4 or higher and count its non-null entries.
apps.loc[apps["user_rating"] >= 4, "id"].count()

4781

### How many genres are there in total for all the apps?

In [61]:
# Number of distinct values in prime_genre.
apps["prime_genre"].nunique()

23

### Which 3 genres have the most apps?

In [62]:
# Apps per genre, keeping the three largest counts.
apps["prime_genre"].value_counts().nlargest(n=3)

Games            3862
Entertainment     535
Education         453
Name: prime_genre, dtype: int64

### Which genre is most likely to contain free apps?

Absolute figures

In [35]:
# Subset of apps whose price is exactly zero.
free_apps = apps[apps["price"] == 0]

In [100]:
# Free-app count per genre, shown as a one-column frame.
free_apps["prime_genre"].value_counts().to_frame(name="Free apps")

Unnamed: 0,Free apps
Games,2257
Entertainment,334
Photo & Video,167
Social Networking,143
Education,132
Shopping,121
Utilities,109
Lifestyle,94
Finance,84
Sports,79


### Now you can calculate the proportion of the free apps in each genre based on the value counts you obtained in the previous two steps.

Relative figures. The absolute counts above do not tell us which genres have the highest concentration of free apps, because they ignore how many apps each genre contains.

In [63]:
# Label each app "free" when its price is zero and "paid" otherwise.
is_free_mask = apps["price"] == 0
apps["status"] = np.where(is_free_mask, "free", "paid")

In [89]:
# Count apps per (genre, status) and pivot status into columns.
# fill_value=0 makes a genre with no free (or no paid) apps show 0 instead of NaN,
# so sums and ratios computed from these columns stay defined for every genre.
apps_pivot = (
    apps.groupby(["prime_genre", "status"])["id"]
    .count()
    .to_frame()
    .unstack(fill_value=0)
)

In [90]:
# Keep only the innermost column level (the status labels). Unlike droplevel(0),
# this is a no-op when the columns are already flat, so the cell can be re-run
# without raising.
apps_pivot.columns = apps_pivot.columns.get_level_values(-1)

In [96]:
# Preview the pivot. NOTE(review): the saved output below already contains
# total_apps and free_proportion, which are only created in later cells, so it
# was captured out of order; a fresh top-to-bottom run shows only free/paid here.
apps_pivot.head()

status,free,paid,total_apps,free_proportion
prime_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Book,66,46,112,0.589286
Business,20,37,57,0.350877
Catalogs,9,1,10,0.9
Education,132,321,453,0.291391
Entertainment,334,201,535,0.624299


In [92]:
# Apps per genre = free count + paid count.
apps_pivot["total_apps"] = apps_pivot["free"] + apps_pivot["paid"]

In [94]:
# Share of each genre's apps that are free.
apps_pivot["free_proportion"] = apps_pivot["free"] / apps_pivot["total_apps"]

In [95]:
# Genres ordered from the highest to the lowest share of free apps.
apps_pivot.sort_values("free_proportion", ascending=False)

status,free,paid,total_apps,free_proportion
prime_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Shopping,121,1,122,0.991803
Catalogs,9,1,10,0.9
Social Networking,143,24,167,0.856287
Finance,84,20,104,0.807692
News,58,17,75,0.773333
Sports,79,35,114,0.692982
Travel,56,25,81,0.691358
Food & Drink,43,20,63,0.68254
Lifestyle,94,50,144,0.652778
Entertainment,334,201,535,0.624299


### If a developer tries to make money by developing and selling Apple Store apps, in which genre should s/he develop the apps? Please assume all apps cost the same amount of time and expense to develop.

In [107]:
# Per-genre roll-up: number of apps, mean price, total rating count, mean user rating.
apps_summary = apps.groupby("prime_genre").agg(
    {
        "id": "count",
        "price": "mean",
        "rating_count_tot": "sum",
        "user_rating": "mean",
    }
)

In [109]:
# Average number of ratings per app in the genre (total ratings / app count).
apps_summary["user_response"] = apps_summary["rating_count_tot"] / apps_summary["id"]

In [112]:
# Profit proxy = average ratings per app x average price, both at genre level.
# NOTE(review): multiplying two genre averages lets heavily-rated free apps inflate
# the figure; consider averaging price * rating_count_tot per app instead.
apps_summary["profit_proxy"] = apps_summary["user_response"] * apps_summary["price"]

In [113]:
# Genres ordered from the highest to the lowest profit proxy.
apps_summary.sort_values("profit_proxy", ascending=False)

Unnamed: 0_level_0,id,price,rating_count_tot,user_rating,user_response,profit_proxy
prime_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Music,138,4.835435,3980199,3.978261,28842.021739,139463.715118
Reference,64,4.836875,1434294,3.453125,22410.84375,108398.449863
Navigation,46,4.124783,545282,2.684783,11853.956522,48894.993705
Weather,72,1.605417,1597034,3.597222,22181.027778,35609.791678
Productivity,178,4.330562,1433136,4.005618,8051.325843,34866.764116
Business,57,5.116316,272921,3.745614,4788.087719,24497.3688
Food & Drink,63,1.552381,878133,3.18254,13938.619048,21638.046712
Photo & Video,349,1.473295,5008946,3.80086,14352.280802,21145.145395
Games,3862,1.432923,52878491,3.685008,13691.996634,19619.581764
Health & Fitness,180,1.916444,1784371,3.7,9913.172222,18998.043832


In [122]:
# Adjustment factor = 1 / (genre rating / lowest genre rating). Same arithmetic as
# the previous row-wise lambda, but vectorised and with the minimum computed once
# instead of once per row.
# NOTE(review): the factor shrinks as the rating grows, so better-rated genres are
# penalised; confirm that this is the intended direction of the adjustment.
min_genre_rating = apps_summary["user_rating"].min()
apps_summary["user_rating_adjustment_factor"] = 1 / (apps_summary["user_rating"] / min_genre_rating)

In [123]:
# Scale the profit proxy by the rating adjustment factor.
apps_summary["profit_proxy_adjusted"] = (
    apps_summary["profit_proxy"] * apps_summary["user_rating_adjustment_factor"]
)

In [124]:
# Genres ordered from the highest to the lowest adjusted profit proxy.
apps_summary.sort_values("profit_proxy_adjusted", ascending=False)

Unnamed: 0_level_0,id,price,rating_count_tot,user_rating,user_response,profit_proxy,user_rating_adjustment_factor,profit_proxy_adjusted
prime_genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Music,138,4.835435,3980199,3.978261,28842.021739,139463.715118,0.527869,73618.551259
Reference,64,4.836875,1434294,3.453125,22410.84375,108398.449863,0.608145,65921.95322
Navigation,46,4.124783,545282,2.684783,11853.956522,48894.993705,0.782186,38244.991028
Weather,72,1.605417,1597034,3.597222,22181.027778,35609.791678,0.583784,20788.418926
Productivity,178,4.330562,1433136,4.005618,8051.325843,34866.764116,0.524264,18279.377877
Food & Drink,63,1.552381,878133,3.18254,13938.619048,21638.046712,0.65985,14277.873217
Business,57,5.116316,272921,3.745614,4788.087719,24497.3688,0.560656,13734.590376
Photo & Video,349,1.473295,5008946,3.80086,14352.280802,21145.145395,0.552507,11682.832311
Games,3862,1.432923,52878491,3.685008,13691.996634,19619.581764,0.569877,11180.742158
Social Networking,167,0.33988,7598316,2.98503,45498.898204,15464.176419,0.703511,10879.210973
