In [4]:
# importing libraries and reading dataset
import numpy as np
import pandas as pd
df = pd.read_csv('Part1.csv')


In [5]:
# removed Unwanted Columns
df.drop(columns=['Developer Id','Developer Website','Developer Email',
                 'Privacy Policy','Minimum Installs','Installs'], inplace=True)

In [6]:
# Renamed Columns
df.rename(columns={
    'App Name': 'app_name',
    'App Id': 'app_id',
    'Category': 'category',
    'Rating': 'rating',
    'Rating Count': 'rating_count',
    'Maximum Installs': 'maximum_installs',
    'Free': 'is_free',
    'Price': 'price',
    'Currency': 'currency',
    'Size': 'app_size',
    'Minimum Android': 'min_android_version',
    'Released': 'release_date',
    'Last Updated': 'last_updated_date',
    'Content Rating': 'content_rating',
    'Ad Supported': 'ad_supported',
    'In App Purchases': 'in_app_purchases',
    'Editors Choice': 'editors_choice',
    'Scraped Time': 'scraped_time'
}, inplace=True)


In [7]:
# Counting duplicates
df.duplicated().sum()

0

In [8]:
# Counting Missing Values
df.isnull().sum()

app_name                   1
app_id                     0
category                   0
rating                  7913
rating_count            7913
maximum_installs           0
is_free                    0
price                      0
currency                  52
app_size                  67
min_android_version     2216
release_date           24440
last_updated_date          0
content_rating             0
ad_supported               0
in_app_purchases           0
editors_choice             0
scraped_time               0
dtype: int64

In [9]:
# Handling Missing Values accourding to the columns

# app_name column
df[df['app_name'].isnull()].index # getting index no. = Index([45686], dtype='int64')
df.drop(45686,inplace=True) # deleted [45686] indexed row

# rating column
df['rating'].fillna(df['rating'].mean(),inplace=True) #filling with mean of the rating

# rating_count column
df['rating_count'].fillna(0,inplace=True) #filling with zero

# currency column
df['currency'].fillna(df['currency'].mode()[0],inplace= True) # filling with most frequent currency type

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(df['rating'].mean(),inplace=True) #filling with mean of the rating
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating_count'].fillna(0,inplace=True) #filling with zero
The behavior will change in pandas 3.0. This inplace method will never work because 

In [10]:
# app_size column
df['app_size'].unique() # checking unique sizes(MB,GB,Kb)

array(['10M', '2.9M', '3.7M', ..., '344M', '232M', '418M'], dtype=object)

In [11]:
def convert_size(x):
    try:
        if 'M' in x :
            return float(x.replace('M',''))
        elif 'G' in x:
            return float(x.replace('G',''))*1024 # GB to MB
        else:
            return 0
    except:
        return 0
    
df['app_size_mb'] = df['app_size'].apply(convert_size)

In [13]:
# release_date column
df = df.dropna(subset=['release_date'])

In [14]:
df.isnull().sum()

app_name                  0
app_id                    0
category                  0
rating                    0
rating_count              0
maximum_installs          0
is_free                   0
price                     0
currency                  0
app_size                  0
min_android_version    2144
release_date              0
last_updated_date         0
content_rating            0
ad_supported              0
in_app_purchases          0
editors_choice            0
scraped_time              0
app_size_mb               0
dtype: int64

In [16]:
df.describe()

Unnamed: 0,rating,rating_count,maximum_installs,price,app_size_mb
count,775558.0,775558.0,775558.0,775558.0,775558.0
mean,2.204687,2877.704,287370.8,0.103628,18.71926
std,2.108583,260146.6,15124460.0,2.609632,23.917583
min,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,85.0,0.0,4.7
50%,3.0,6.0,703.0,0.0,9.8
75%,4.3,42.0,7393.0,0.0,24.0
max,5.0,138557600.0,6265638000.0,400.0,1536.0


In [None]:
# Converting columns to appropriate data types
df['release_date']= pd.to_datetime(df['release_date'])
df['scraped_time']= pd.to_datetime(df['scraped_time'])

In [34]:
# creating month and year columns
df['released_year'] = df['release_date'].dt.year
df['released_month'] = df['release_date'].dt.month

In [None]:
df.groupby('released_year').size().reset_index(name= 'app_count')

Unnamed: 0,released_year,app_count
0,2010,1554
1,2011,4936
2,2012,8798
3,2013,14571
4,2014,24558
5,2015,39997
6,2016,58107
7,2017,90084
8,2018,115563
9,2019,166149
