# Import libraries

In [None]:
import numpy as np
import pandas as pd

# Load data

In [None]:
# Read the Google Play Store CSV (path is relative to the working directory).
google_play_apps_df = pd.read_csv(
    filepath_or_buffer="data/google-play-store-data.csv"
)

In [None]:
# Preview the first rows of the loaded frame.
google_play_apps_df.head()

# Clean the data

## Initial cleaning

In [None]:
# Keep one row per app name (the first occurrence is retained).
google_play_apps_df = google_play_apps_df.drop_duplicates(subset="App")

# Rows with a usable `Android Ver`. Missing values must be detected with
# `notna()`: NaN never compares equal to anything, so `!= np.nan` is True for
# every row and filters nothing. The literal string "NaN" is excluded as well,
# in case the file was read without pandas' default NA parsing.
has_android_ver = google_play_apps_df["Android Ver"].notna() & (
    google_play_apps_df["Android Ver"] != "NaN"
)

# Rows whose `Installs` holds "Free" or "Paid" are discarded.
# NOTE(review): presumably these are rows whose columns are shifted in the
# CSV -- confirm against the data file.
has_valid_installs = ~google_play_apps_df["Installs"].isin(["Free", "Paid"])

# `.copy()` gives an independent frame, so column assignments made on it
# afterwards do not act on a slice of the pre-filter frame.
google_play_apps_df = google_play_apps_df[
    has_android_ver & has_valid_installs
].copy()

In [None]:
# (rows, columns) of the frame at this point.
google_play_apps_df.shape

## `Rating`

In [None]:
# Number of missing values in `Rating`.
google_play_apps_df["Rating"].isna().sum()

In [None]:
# dtype currently held by `Rating`.
google_play_apps_df["Rating"].dtype

## `Reviews`

In [None]:
# dtype of `Reviews` before it is passed through `pd.to_numeric`.
google_play_apps_df["Reviews"].dtype

In [None]:
# Number of missing values in `Reviews`.
google_play_apps_df["Reviews"].isna().sum()

In [None]:
# Parse `Reviews` as numbers; `errors="raise"` (the default) makes any
# unparseable value fail loudly instead of being coerced.
google_play_apps_df["Reviews"] = pd.to_numeric(
    google_play_apps_df["Reviews"], errors="raise"
)

In [None]:
# dtype of `Reviews` after the `pd.to_numeric` conversion.
google_play_apps_df["Reviews"].dtype

## Convert `Size` column to numeric:

In [None]:
# Inspect the `Size` column before it is converted to numbers.
google_play_apps_df["Size"]

### Replace `Varies with device` with `NaN`

In [None]:
# Sizes reported as "Varies with device" carry no number; mark them missing.
size_varies_with_device = google_play_apps_df["Size"].eq("Varies with device")
google_play_apps_df.loc[size_varies_with_device, "Size"] = np.nan

In [None]:
# Number of missing values in `Size` (includes the former
# "Varies with device" entries).
google_play_apps_df["Size"].isna().sum()

### Remove the `M` character from values (`M` stands for Mega) and other special characters

In [None]:
# Strip the "M" unit suffix together with "," and "+" characters.
# The pattern is a raw string: "\+" is not a valid Python string escape
# (SyntaxWarning on Python 3.12+), and inside a character class "+" needs no
# escaping, so r"[M,+]" matches exactly the same characters.
google_play_apps_df["Size"] = google_play_apps_df["Size"].str.replace(
    pat=r"[M,+]", repl="", regex=True
)

### Remove the `k` suffix (`k` stands for kilo) and divide those values by 1000 so they are on the same scale as the `M` (Mega) values

In [None]:
# Boolean mask of rows whose `Size` text contains "k"; missing sizes are
# treated as False (`na=False`).
size_contains_k_mask = google_play_apps_df["Size"].str.contains("k", na=False)

In [None]:
# For the rows flagged by `size_contains_k_mask`: drop the "k" suffix, parse
# the remainder as a number and divide by 1000.
# `regex=False` states the literal match explicitly; the default of `regex`
# differs between pandas 1.x and 2.x, so relying on it is version-dependent.
sizes_in_kilo = google_play_apps_df.loc[size_contains_k_mask, "Size"].str.replace(
    pat="k", repl="", regex=False
)
google_play_apps_df.loc[size_contains_k_mask, "Size"] = (
    pd.to_numeric(sizes_in_kilo) / 1000
)

In [None]:
# Parse the whole `Size` column as numbers; `errors="raise"` (the default)
# makes any leftover non-numeric text fail loudly.
google_play_apps_df["Size"] = pd.to_numeric(
    google_play_apps_df["Size"], errors="raise"
)

In [None]:
# Missing-value count for every column.
google_play_apps_df.isna().sum()

In [None]:
# Compare the current shape with the shape that remains if every row
# containing a missing value were dropped (the frame itself is not modified).
print(
    f"Shape before: {google_play_apps_df.shape}",
    f"Shape after: {google_play_apps_df.dropna().shape}",
    sep="\n",
)

## Convert `Installs` column to integer:

In [None]:
# Remove "+" and "," from `Installs`, then parse it as a number.
# The pattern is a raw string: "\+" is not a valid Python string escape
# (SyntaxWarning on Python 3.12+), and inside a character class "+" needs no
# escaping, so r"[+,]" matches exactly the same characters.
google_play_apps_df["Installs"] = pd.to_numeric(
    google_play_apps_df["Installs"].str.replace(pat=r"[+,]", repl="", regex=True)
)

## Convert `Price` column to numeric:

In [None]:
# Remove the literal "$" (plain-text match, not a regex), then parse the
# remaining text as a number.
price_without_symbol = google_play_apps_df["Price"].str.replace(
    "$", "", regex=False
)
google_play_apps_df["Price"] = pd.to_numeric(price_without_symbol)

# Check final `dtypes`

In [None]:
# dtypes of the five columns touched by the cleaning steps.
google_play_apps_df[["Rating", "Reviews", "Size", "Installs", "Price"]].dtypes

In [None]:
# Write the cleaned frame to disk; `index=False` leaves the row index out of
# the file.
google_play_apps_df.to_csv(
    path_or_buf="data/google-play-store-data-cleaned.csv",
    index=False,
)