# Import libraries

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud

# Load data

In [None]:
# App-level dataset, read from the project's relative data/ directory.
google_play_apps_df = pd.read_csv(
    filepath_or_buffer="data/google-play-store-data-cleaned.csv"
)

In [None]:
# User-review dataset, read from the project's relative data/ directory.
google_play_apps_reviews_df = pd.read_csv(
    filepath_or_buffer="data/google-play-store-user-reviews.csv"
)

In [None]:
# Preview the first five rows of the apps frame.
google_play_apps_df.head(n=5)

In [None]:
# Preview the first five rows of the reviews frame.
google_play_apps_reviews_df.head(n=5)

In [None]:
# (number of rows, number of columns) of the apps frame.
(len(google_play_apps_df.index), len(google_play_apps_df.columns))

In [None]:
# (number of rows, number of columns) of the reviews frame.
(len(google_play_apps_reviews_df.index), len(google_play_apps_reviews_df.columns))

# Data description:

1. `App`: Application name
2. `Category`: Category the app belongs to
3. `Rating`: Overall user rating of the app (as when scraped)
4. `Reviews`: Number of user reviews for the app (as when scraped)
5. `Size`: Size of the app (as when scraped)
6. `Installs`: Number of user downloads/installs for the app (as when scraped)
7. `Type`: Paid or Free
8. `Price`: Price of the app (as when scraped)
9. `Content Rating`: Age group the app is targeted at - Children / Mature 21+ / Adult
10. `Genres`: An app can belong to multiple genres (apart from its main category). For example, a musical family game will belong to the Music, Game, and Family genres.
11. `Last Updated`: Date when the app was last updated on Play Store (as when scraped)
12. `Current Ver`: Current version of the app available on Play Store (as when scraped)
13. `Android Ver`: Min required Android version (as when scraped)

1. `Translated_Review`: User review (Preprocessed and translated to English)
2. `Sentiment`: Positive/Negative/Neutral (Preprocessed)
3. `Sentiment_Polarity`: Sentiment polarity score
4. `Sentiment_Subjectivity`: Sentiment subjectivity score

# Simple questions

## How many different categories are there?

In [None]:
# Number of distinct categories; missing values are not counted.
google_play_apps_df.loc[:, "Category"].nunique(dropna=True)

## Which apps have the highest/lowest rating?

In [None]:
# Every app whose rating equals the lowest rating in the dataset (ties included).
google_play_apps_df.loc[
    lambda apps: apps["Rating"].eq(apps["Rating"].min())
]

In [None]:
# Every app whose rating equals the highest rating in the dataset (ties included).
google_play_apps_df.loc[
    lambda apps: apps["Rating"].eq(apps["Rating"].max())
]

# Android market breakdown:

The share of apps in each category, followed by the number of apps per category:

In [None]:
# Fraction of all apps that falls into each category, largest share first.
google_play_apps_df.loc[:, "Category"].value_counts(normalize=True)

In [None]:
# Count every app row in each category. "size" counts rows regardless of
# missing values, whereas "count" on the "Rating" column would silently skip
# apps that have no rating and under-report the category.
category_stats_df = google_play_apps_df.groupby(
    by="Category", as_index=False
).aggregate(number_of_apps=pd.NamedAgg(column="App", aggfunc="size"))

In [None]:
# Preview the first five per-category rows.
category_stats_df.head(n=5)

In [None]:
# Order categories from most to fewest apps so the bar chart reads left to right.
category_stats_df = category_stats_df.sort_values(
    by=["number_of_apps"],
    ascending=[False],
)

In [None]:
# Bar chart of app counts per category. The title and a readable y-axis label
# let the figure stand on its own when the notebook is skimmed.
fig = px.bar(
    data_frame=category_stats_df,
    x="Category",
    y="number_of_apps",
    color="Category",
    title="Number of apps per category",
    labels={"number_of_apps": "Number of apps"},
)

fig.show()

# How are apps rated in general?

In [None]:
# Average rating across all apps.
google_play_apps_df.loc[:, "Rating"].mean()

In [None]:
# Highest rating across all apps.
google_play_apps_df.loc[:, "Rating"].max()

In [None]:
# Lowest rating across all apps.
google_play_apps_df.loc[:, "Rating"].min()

In [None]:
# Histogram of ratings over all apps; titled so the figure is self-explanatory.
fig = px.histogram(
    data_frame=google_play_apps_df,
    x="Rating",
    title="Distribution of app ratings",
)

fig.show()

# How do app ratings vary by category?

In [None]:
# The two columns used for the per-category rating analysis.
google_play_apps_df.loc[:, ["Category", "Rating"]]

In [None]:
# Per-category rating summary: minimum, maximum and mean of "Rating".
# Each keyword names an output column; the tuple is (source column, aggregation).
category_rating_stats_df = google_play_apps_df.groupby(
    "Category", as_index=False
).agg(
    category_min_rating=("Rating", "min"),
    category_max_rating=("Rating", "max"),
    category_average_rating=("Rating", "mean"),
)

In [None]:
# Preview the first ten per-category rating summaries.
category_rating_stats_df.head(n=10)

Now we have, for each _category_ (`ART_AND_DESIGN`, `AUTO_AND_VEHICLES`, `BEAUTY`, etc.), three measurements:

1. `category_min_rating`: minimum rating value for the category
2. `category_max_rating`: maximum rating value for the category
3. `category_average_rating`: average rating for the category

In [None]:
# Distinct categories (missing values excluded) - too many to facet all at once.
len(google_play_apps_df["Category"].dropna().unique())

In [None]:
# Subset of categories compared side by side in the faceted histogram.
focus_categories = ["BOOKS_AND_REFERENCE", "BUSINESS", "COMMUNICATION"]
focus_categories += ["EDUCATION", "ENTERTAINMENT"]

In [None]:
# One rating histogram per focus category, stacked as facet rows. The title
# makes the figure self-explanatory when the notebook is skimmed.
fig = px.histogram(
    data_frame=google_play_apps_df.loc[
        google_play_apps_df["Category"].isin(focus_categories)
    ],
    x="Rating",
    color="Category",
    facet_row="Category",
    height=1000,
    title="Rating distribution for selected categories",
)

fig.show()

# What is the relation between app size and rating?

Is there a relation between app size and its rating?

For example, can we infer that bigger apps have higher ratings?

In [None]:
# The two columns compared in the scatter plot.
google_play_apps_df.loc[:, ["Rating", "Size"]]

In [None]:
# Scatter of rating against app size, one point per app; titled so the figure
# is self-explanatory.
fig = px.scatter(
    data_frame=google_play_apps_df,
    x="Size",
    y="Rating",
    title="App rating vs. app size",
)

fig.show()

Most top-rated apps are sized between ~2 MB and ~40 MB - neither too light nor too heavy.