# AppStore Dataset
Tutorial: https://www.makeschool.com/academy/track/app-store-dataset-tutorial


In [None]:
# Pandas is a library for basic data analysis
import pandas as pd

# NumPy is a library for advanced mathematical computation
import numpy as np

# MatPlotLib is a library for basic data visualization
import matplotlib.pyplot as plt

# SeaBorn is a library for advanced data visualization
import seaborn as sns

In [None]:
# Custom hex colours for this notebook's plots
COLOR_COLUMNS = [
    "#66C2FF",
    "#5CD6D6",
    "#00CC99",
    "#85E085",
    "#FFD966",
    "#FFB366",
    "#FFB3B3",
    "#DAB3FF",
    "#C2C2D6",
]

# Global seaborn defaults: "white" style, "notebook" context, "deep" palette
sns.set(style="white", context="notebook", palette="deep")

# Then switch the active palette to one built from COLOR_COLUMNS, limited to 4 colours
sns.set_palette(palette=COLOR_COLUMNS, n_colors=4)

In [None]:
# IPython shortcut for the shell command `ls`: list the files in the current working directory
ls

In [None]:
# Check that our CSV file is there by listing the datasets directory.
# The note sits on its own line because IPython hands everything after a line magic's
# name to the magic as its argument, so a trailing "# ..." ends up inside the path.
# Listing instead of cd-ing also leaves the working directory untouched.
%ls ../datasets/

In [None]:
# Move into the classwork folder.
# NOTE(review): this path is relative to wherever the kernel currently is, so the result
# depends on the starting directory and on earlier cells -- confirm before Run All.
%cd MakeSchool/Term4/DS1.1/classwork/

In [None]:
# Path to the App Store CSV, relative to the current working directory
FILEPATH = "../datasets/AppleStore.csv"

# Read the CSV, using its "Unnamed: 0" column as the DataFrame index
df = pd.read_csv(filepath_or_buffer=FILEPATH, index_col="Unnamed: 0")

# Show the loaded frame
df

## Some good questions to keep in mind:
- Any convoluted data?
- Any repeated data?
- Any redundant data?
- Any data that's difficult to understand?
---
- Remove **unclean data**
    - irregularities in data types
    - inconsistencies in how data was recorded
    - inappropriate data values (e.g. ```null``` values)

## .iloc[] and .loc[]
- ```.iloc[]``` and ```.loc[]``` are selector tools in Pandas for viewing one or more rows, columns, and/or cells in a dataset
- `.iloc[]` is useful for selecting data by integer **position** (0-based index)
- `.loc[]` is useful for selecting data by **label**

In [None]:
# Selector examples -- uncomment one at a time to try it:
# df.iloc[0]       # row at integer position 0 (PACMAN)
# df.loc[1]        # row whose index label is 1 (also PACMAN)
# df.iloc[:3]      # first 3 rows by position (the end position is excluded)
# df.loc[:3]       # rows up to and including label 3 (.loc slices include the end label)
# df.head()        # peek at the first 5 rows
# df.tail()        # peek at the last 5 rows
# df.price.mean()  # attribute-style column access; same result as the line below
df["price"].mean()

In [None]:
# df.describe()  # summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

In [None]:
# Summarise only the object-dtype (text) columns
df.describe(include=[object])

In [None]:
#gab size_bytes data and divide to give us our new size_Mb column
def _byte_resizer(data):
    return np.around(data / 1000000, decimals=2)

# Add the size_Mb column (size in megabytes) and retire the raw size_bytes column.
# The guard keeps this cell safe to re-run: once size_bytes has been dropped, running
# the conversion again would raise KeyError on df["size_bytes"].
if "size_bytes" in df.columns:
    # Convert the whole column in one vectorized call, then drop the source column;
    # assign/drop return a new frame instead of mutating the loaded one in place.
    df = df.assign(size_Mb=_byte_resizer(df["size_bytes"])).drop(columns="size_bytes")
df

## Page 3: Creating Basic Visualizations
##### Part 2: Exploratory Data Science
https://www.makeschool.com/academy/track/standalone/app-store-dataset-tutorial/creating-basic-visualizations

In [None]:
# Bucket edges (in megabytes) and the display label for each bucket
BINS = [0.00, 10.00, 20.00, 50.00, 100.00, 200.00, 500.00, 1000.00, 2000.00, np.inf]
LABELS = ["<10m", "10-20m", "20-50m", "50-100m", "100-200m", "200-500m", "500-1000m", "1-2G", ">2G"]

# Put every app into a size bucket, then count the apps per bucket once
freqs = pd.cut(df["size_Mb"], bins=BINS, labels=LABELS, include_lowest=True)
size_counts = freqs.value_counts()

# Bar chart of the number of apps in each size bucket
size_fig, size_ax = plt.subplots(figsize=(10, 8))
sns.barplot(x=size_counts.index, y=size_counts.values, ax=size_ax)


In [None]:
# Two price buckets: prices up to and including 0 are FREE, anything above 0 is PAID
BINS = [-np.inf, 0.00, np.inf]
LABELS = ["FREE", "PAID"]
colors = ['lightcoral', 'yellowgreen']
df["price_categories"] = pd.cut(df["price"], BINS, include_lowest=True, labels=LABELS)

# value_counts() orders its rows by frequency, not by category. Put the counts back in
# LABELS order so every wedge is always paired with its own label and colour, whichever
# category happens to be larger.
price_df = df["price_categories"].value_counts().reindex(LABELS)

fig, axs = plt.subplots(figsize=(10, 5))  # initialize our plotting space in Matplotlib

# Now create a doughnut plot in Matplotlib: draw the pie, then cover its centre with a
# white-filled circle.
axs.pie(price_df.values, labels=list(price_df.index), colors=colors, autopct='%1.1f%%', shadow=True)
centre_circle = plt.Circle((0, 0), 0.75, color='black', fc='white', linewidth=1.25)
axs.add_artist(centre_circle)
axs.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

### Let's keep moving with this idea and check out the free and paid apps with the most user ratings.

In [None]:
# Split the catalogue by price category
free_apps = df[df["price_categories"].eq("FREE")]
paid_apps = df[df["price_categories"].eq("PAID")]

# Rank each group by total number of user ratings, largest first
free_apps_rated = free_apps.sort_values("rating_count_tot", ascending=False)
paid_apps_rated = paid_apps.sort_values("rating_count_tot", ascending=False)

# Plot the 10 free apps with the most ratings
top_free = free_apps_rated.head(10)
sns.barplot(x=top_free["rating_count_tot"], y=top_free["track_name"])

In [None]:
# Plot the 10 paid apps with the most ratings
top_paid = paid_apps_rated.head(10)
sns.barplot(x=top_paid["rating_count_tot"], y=top_paid["track_name"])

In [None]:
# Rank every app, free or paid, by total rating count in descending order
ratings = df.sort_values("rating_count_tot", ascending=False)
top_rated = ratings.head(30)

# Tall figure so that all 30 app names fit on the y axis
ratings_fig, ratings_ax = plt.subplots(figsize=(20, 20))
sns.barplot(x=top_rated["rating_count_tot"], y=top_rated["track_name"], ax=ratings_ax)

## Page 4: See Games with Us
##### Part 3: Exploratory Data Science (finale)
https://www.makeschool.com/academy/track/standalone/app-store-dataset-tutorial/see-games-with-us