In [1]:
# Import Libraries 
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.io as pio
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import webbrowser
import os

In [2]:
# Load the raw Play Store app listing from CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that file exists.
app_data = pd.read_csv("D:/Google Play Store Intern/Play Store Data.csv")

In [38]:
# Preview the first two rows to check the column layout.
app_data.head(2)

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up


In [3]:
# Dimensions of the raw frame as (rows, columns).
app_data.shape

(10841, 13)

In [4]:
# Per-column non-null counts and dtypes.
app_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10841 entries, 0 to 10840
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   App             10841 non-null  object 
 1   Category        10841 non-null  object 
 2   Rating          9367 non-null   float64
 3   Reviews         10841 non-null  object 
 4   Size            10841 non-null  object 
 5   Installs        10841 non-null  object 
 6   Type            10840 non-null  object 
 7   Price           10841 non-null  object 
 8   Content Rating  10840 non-null  object 
 9   Genres          10841 non-null  object 
 10  Last Updated    10841 non-null  object 
 11  Current Ver     10833 non-null  object 
 12  Android Ver     10838 non-null  object 
dtypes: float64(1), object(12)
memory usage: 1.1+ MB


In [5]:
# Summary statistics for the numeric columns.
app_data.describe()

Unnamed: 0,Rating
count,9367.0
mean,4.193338
std,0.537431
min,1.0
25%,4.0
50%,4.3
75%,4.5
max,19.0


In [6]:
# Summary (count / unique / top / freq) for the object-dtype columns.
app_data.describe(include = ['object'])

Unnamed: 0,App,Category,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
count,10841,10841,10841,10841,10841,10840,10841,10840,10841,10841,10833,10838
unique,9660,34,6002,462,22,3,93,6,120,1378,2832,33
top,ROBLOX,FAMILY,0,Varies with device,"1,000,000+",Free,0,Everyone,Tools,"August 3, 2018",Varies with device,4.1 and up
freq,9,1972,596,1695,1579,10039,10040,8714,842,326,1459,2451


In [7]:
# Missing-value audit. The null counts are computed once, and the derived
# percentages are kept in named variables so they can be inspected afterwards
# (as bare, non-final expressions they were evaluated and thrown away).
missing_counts = app_data.isnull().sum()                           # nulls per column
overall_missing_pct = missing_counts.sum() / app_data.size * 100   # share of all cells that are null
missing_pct_by_column = missing_counts / app_data.shape[0] * 100   # share of rows that are null, per column
missing_counts  # last expression: shown as the cell output

App                  0
Category             0
Rating            1474
Reviews              0
Size                 0
Installs             0
Type                 1
Price                0
Content Rating       1
Genres               0
Last Updated         0
Current Ver          8
Android Ver          3
dtype: int64

In [None]:
# Fill missing ratings with the previous row's rating (forward fill).
# The result is assigned back rather than calling ffill(inplace=True) on the
# column selection: the in-place form operates on an intermediate object,
# emits a FutureWarning on pandas 2.x and leaves app_data unchanged under
# Copy-on-Write.
app_data['Rating'] = app_data['Rating'].ffill()

In [9]:
# Ratings above 5 are clamped to 5.
rating_above_cap = app_data['Rating'] > 5
app_data.loc[rating_above_cap, 'Rating'] = 5

In [10]:
# List the distinct values present in Type.
app_data['Type'].unique()

array(['Free', 'Paid', nan, '0'], dtype=object)

In [11]:
# Relabel the stray '0' entry in Type as 'Free' (literal match, no regex).
# NOTE(review): this is a substring replace — it would also rewrite a '0' inside a longer value.
app_data['Type'] = app_data['Type'].str.replace('0','Free',regex=False)

In [12]:
# List the distinct install-count labels before cleaning.
app_data['Installs'].unique()

array(['10,000+', '500,000+', '5,000,000+', '50,000,000+', '100,000+',
       '50,000+', '1,000,000+', '10,000,000+', '5,000+', '100,000,000+',
       '1,000,000,000+', '1,000+', '500,000,000+', '50+', '100+', '500+',
       '10+', '1+', '5+', '0+', '0', 'Free'], dtype=object)

In [13]:
# Clean the Installs column: strip the thousands separators and the '+' sign,
# drop every row whose value is still not purely numeric (e.g. 'Free'), then
# convert what remains to integers.
app_data['Installs'] = (
    app_data['Installs']
    .str.replace(",", "", regex=False)
    .str.replace("+", "", regex=False)
)
# .copy() makes the filtered frame independent of the original, so the column
# assignments that follow do not trigger SettingWithCopyWarning.
app_data = app_data[app_data['Installs'].str.isnumeric()].copy()
app_data['Installs'] = pd.to_numeric(app_data['Installs'], errors='coerce')

In [14]:
def convert_size(size):
    """Convert a size label to megabytes.

    Args:
        size: Size label such as '19M' (megabytes) or '201k' (kilobytes).

    Returns:
        float: The size in MB; kilobyte labels are divided by 1024. NaN is
        returned for anything that is not a recognised label (for example
        'Varies with device' or a non-string value), so the result is always
        numeric.
    """
    if not isinstance(size, str):
        return float('nan')
    label = size.strip()
    try:
        if label.endswith('M'):
            return float(label[:-1])
        # Accept both cases of the kilobyte suffix. The previous check looked
        # for 'K' but stripped 'k', so lowercase labels were never converted
        # and an uppercase one would have failed inside float().
        if label.endswith(('k', 'K')):
            return float(label[:-1]) / 1024
    except ValueError:
        # Suffix present but the remainder is not a number.
        return float('nan')
    return float('nan')
# Run the converter over every Size entry and overwrite the column with the results.
app_data['Size'] = app_data['Size'].apply(convert_size)

In [15]:
# Rename Size to SizeInMB, then force the values to numeric; entries that
# cannot be parsed become NaN.
app_data = app_data.rename(columns={'Size': "SizeInMB"})
app_data['SizeInMB'] = pd.to_numeric(app_data['SizeInMB'], errors='coerce')

In [16]:
# List the columns that are still stored with object dtype.
app_data.select_dtypes(include = 'object').columns

Index(['App', 'Category', 'Reviews', 'Type', 'Price', 'Content Rating',
       'Genres', 'Last Updated', 'Current Ver', 'Android Ver'],
      dtype='object')

In [None]:
# Impute missing values in every object-dtype column with that column's most
# frequent value. The filled column is assigned back to the frame: calling
# fillna(inplace=True) on a column selection works on an intermediate object,
# emits a FutureWarning on pandas 2.x and does not update app_data under
# Copy-on-Write.
for column in app_data.select_dtypes(include='object').columns:
    most_common = app_data[column].mode()
    if most_common.empty:
        continue  # column is entirely null, so there is no mode to fill with
    app_data[column] = app_data[column].fillna(most_common.iloc[0])

In [18]:
# Reviews: parse as numbers; entries that cannot be parsed become NaN.
app_data['Reviews'] = pd.to_numeric(app_data['Reviews'], errors='coerce')
# Price: strip the '$' symbol and convert the strings to float64.
# regex=False is passed explicitly so '$' is always treated as a literal
# character (as a regex it is the end-of-string anchor) instead of relying on
# the pandas version's default for `regex`.
app_data['Price'] = app_data['Price'].str.replace('$', "", regex=False).astype('float64')
app_data = app_data.rename(columns={'Price': 'PriceInDollar'})

In [None]:
# Log-transform the count columns to compress large values into a small range.
# log1p (log(1 + x)) is used instead of log because both Installs and Reviews
# contain zeros, and log(0) evaluates to -inf (with a RuntimeWarning), which
# would then carry through any mean taken over these columns.
app_data['Log_Installs'] = np.log1p(app_data['Installs'])
app_data['Log_Reviews'] = np.log1p(app_data['Reviews'])

In [57]:
def rating_group(rating):
    """Bucket a numeric rating into a descriptive label.

    Args:
        rating: Numeric rating value.

    Returns:
        str: 'Top Rated Apps' for ratings >= 4, 'Above Average Rated Apps'
        for >= 3, 'Average Rated Apps' for >= 2, otherwise
        'Below Average Rated Apps'. NaN fails every comparison and therefore
        also lands in the last bucket.
    """
    if rating >= 4:
        return 'Top Rated Apps'  # label was previously misspelled 'Top Rated Appa'
    elif rating >= 3:
        return 'Above Average Rated Apps'
    elif rating >= 2:
        return 'Average Rated Apps'
    else:
        return 'Below Average Rated Apps'
# Label every row with the bucket its Rating falls into.
app_data['Rating_group'] = app_data['Rating'].apply(rating_group)

In [58]:
# Parse the 'Last Updated' strings into timestamps (values that cannot be
# parsed become NaT) and keep the calendar year as its own column.
last_updated = pd.to_datetime(app_data['Last Updated'], errors='coerce')
app_data['Last Updated'] = last_updated
app_data['Update_Year'] = last_updated.dt.year

In [59]:
# Revenue is computed per row as price multiplied by the install count.
app_data['Revenue'] = app_data['PriceInDollar'] * app_data['Installs']

In [60]:
# Collapse the data to one row per app name. Each output column is mapped to
# the reducer applied within an app's group: 'first' keeps the first non-null
# value, 'mean' averages the group's values.
per_app_aggregations = {
    'Category': 'first',
    'Rating': 'mean',
    'Reviews': 'mean',
    'SizeInMB': 'first',
    'Installs': 'first',
    'Type': 'first',
    'PriceInDollar': 'first',
    'Content Rating': 'first',
    'Genres': 'first',
    'Last Updated': 'first',
    'Update_Year': 'first',
    'Revenue': 'mean',
    'Log_Installs': 'mean',
    'Log_Reviews': 'mean',
    'Rating_group': 'first',
}
grouped_by_app = app_data.groupby('App')
# reset_index() turns the 'App' group key back into an ordinary column.
app_data_cleaned = grouped_by_app.agg(per_app_aggregations).reset_index()

In [61]:
# Remove rows that are exact duplicates across every column.
app_data_cleaned = app_data_cleaned.drop_duplicates()

In [62]:
# Write the per-app table to CSV, leaving out the DataFrame index.
output_file_path = "D:/Google Play Store Intern/Clean Play Store Data.csv"
app_data_cleaned.to_csv(path_or_buf=output_file_path, index=False)