In [43]:
# Dependencies
import pandas as pd
import numpy as np

In [44]:
# Relative path of the input CSV file (resolved against the current working directory)
csv_path = 'googleplaystore.csv'

In [45]:
# Read the CSV file into a pandas DataFrame and preview the first rows
googlestore = pd.read_csv(csv_path)
googlestore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,7-Jan-18,1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,15-Jan-18,2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,1-Aug-18,1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,8-Jun-18,Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,20-Jun-18,1.1,4.4 and up


In [46]:
# Remove the "Current Ver", "Android Ver" and "Genres" columns in a single call
unused_columns = ["Current Ver", "Android Ver", "Genres"]
googlestore = googlestore.drop(columns=unused_columns)
googlestore.head()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Last Updated
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,7-Jan-18
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,15-Jan-18
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,1-Aug-18
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,8-Jun-18
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,20-Jun-18


In [47]:
# Count the non-null values in each column; a column whose count is below the
# others contains missing data
googlestore.count()

App               10841
Category          10841
Rating             9367
Reviews           10841
Size              10841
Installs          10841
Type              10840
Price             10841
Content Rating    10840
Last Updated      10841
dtype: int64

In [48]:
# Drop every row that has a missing value in at least one column
googlestore = googlestore.dropna(how='any')

In [49]:
# Verify the drop: every column should now report the same non-null count
googlestore.count()

App               9366
Category          9366
Rating            9366
Reviews           9366
Size              9366
Installs          9366
Type              9366
Price             9366
Content Rating    9366
Last Updated      9366
dtype: int64

In [50]:
# Inspect the column dtypes: "Reviews" and "Price" were read as object,
# but they should be numeric
googlestore.dtypes

App                object
Category           object
Rating            float64
Reviews            object
Size               object
Installs           object
Type               object
Price              object
Content Rating     object
Last Updated       object
dtype: object

In [51]:
# Convert the "Reviews" column from object to a numeric dtype
googlestore['Reviews'] = pd.to_numeric(googlestore['Reviews'])
# Remove the dollar sign from "Price", then convert the column to a numeric dtype.
# regex=False is passed explicitly so that '$' is always treated as a literal
# character: as a regular expression it is the end-of-string anchor, and the
# default value of `regex` is not the same across pandas versions.
googlestore['Price'] = pd.to_numeric(
    googlestore['Price'].str.replace('$', '', regex=False)
)

In [52]:
# Verify the conversion: "Reviews" and "Price" should now have numeric dtypes
googlestore.dtypes

App                object
Category           object
Rating            float64
Reviews             int64
Size               object
Installs           object
Type               object
Price             float64
Content Rating     object
Last Updated       object
dtype: object

In [None]:
# Write the cleaned dataset to a new CSV file (header row kept, index omitted)
clean_csv_path = "googlestore_clean.csv"
googlestore.to_csv(clean_csv_path, header=True, index=False, encoding="utf-8")

In [53]:
# Number of rows per "Category" value, most frequent first
googlestore['Category'].value_counts()

FAMILY                 1747
GAME                   1097
TOOLS                   734
PRODUCTIVITY            351
MEDICAL                 350
COMMUNICATION           328
FINANCE                 323
SPORTS                  319
PHOTOGRAPHY             317
LIFESTYLE               314
PERSONALIZATION         314
BUSINESS                303
HEALTH_AND_FITNESS      297
SOCIAL                  259
SHOPPING                238
NEWS_AND_MAGAZINES      233
TRAVEL_AND_LOCAL        226
DATING                  195
BOOKS_AND_REFERENCE     178
VIDEO_PLAYERS           160
EDUCATION               155
ENTERTAINMENT           149
MAPS_AND_NAVIGATION     124
FOOD_AND_DRINK          109
HOUSE_AND_HOME           76
WEATHER                  75
AUTO_AND_VEHICLES        73
LIBRARIES_AND_DEMO       65
ART_AND_DESIGN           62
COMICS                   58
PARENTING                50
EVENTS                   45
BEAUTY                   42
Name: Category, dtype: int64

In [54]:
# Number of rows per "Rating" value, most frequent first
googlestore['Rating'].value_counts()

4.4    1109
4.3    1076
4.5    1038
4.2     952
4.6     823
4.1     708
4.0     568
4.7     499
3.9     386
3.8     303
5.0     274
3.7     239
4.8     234
3.6     174
3.5     163
3.4     128
3.3     102
4.9      87
3.0      83
3.1      69
3.2      64
2.9      45
2.8      42
2.6      25
2.7      25
2.5      21
2.3      20
2.4      19
1.0      16
2.2      14
1.9      13
2.0      12
1.7       8
2.1       8
1.8       8
1.6       4
1.4       3
1.5       3
1.2       1
Name: Rating, dtype: int64

In [55]:
# Number of rows per "Size" value, most frequent first ("Size" is still an object column)
googlestore['Size'].value_counts()

Varies with device    1637
14M                    166
12M                    161
11M                    160
15M                    159
                      ... 
27k                      1
982k                     1
608k                     1
378k                     1
540k                     1
Name: Size, Length: 414, dtype: int64

In [56]:
# Number of rows per "Installs" bucket, most frequent first ("Installs" is still an object column)
googlestore['Installs'].value_counts()

1,000,000+        1577
10,000,000+       1252
100,000+          1150
10,000+           1010
5,000,000+         752
1,000+             713
500,000+           538
50,000+            467
5,000+             432
100,000,000+       409
100+               309
50,000,000+        289
500+               201
500,000,000+        72
10+                 69
1,000,000,000+      58
50+                 56
5+                   9
1+                   3
Name: Installs, dtype: int64

In [57]:
# Number of rows per "Type" value, most frequent first
googlestore['Type'].value_counts()

Free    8719
Paid     647
Name: Type, dtype: int64

In [58]:
# Number of rows per "Price" value (numeric after the conversion above), most frequent first
googlestore['Price'].value_counts()

0.00      8719
2.99       114
0.99       107
4.99        70
1.99        59
          ... 
299.99       1
1.59         1
1.61         1
3.90         1
2.90         1
Name: Price, Length: 73, dtype: int64

In [59]:
# Number of rows per "Content Rating" value, most frequent first
googlestore['Content Rating'].value_counts()

Everyone           7420
Teen               1084
Mature 17+          461
Everyone 10+        397
Adults only 18+       3
Unrated               1
Name: Content Rating, dtype: int64

In [60]:
# Number of rows per "Last Updated" value, most frequent first ("Last Updated" is still an object column)
googlestore['Last Updated'].value_counts()

3-Aug-18     319
2-Aug-18     284
31-Jul-18    279
1-Aug-18     275
30-Jul-18    199
            ... 
13-Sep-15      1
15-Feb-16      1
25-Jun-16      1
1-Dec-15       1
6-Oct-14       1
Name: Last Updated, Length: 1300, dtype: int64

In [61]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
googlestore.describe()

Unnamed: 0,Rating,Reviews,Price
count,9366.0,9366.0,9366.0
mean,4.191757,514049.8,0.960928
std,0.515219,3144042.0,15.816585
min,1.0,1.0,0.0
25%,4.0,186.25,0.0
50%,4.3,5930.5,0.0
75%,4.5,81532.75,0.0
max,5.0,78158310.0,400.0
