# Interactive EDA using Matplotlib and DABL on Windows Store Data!

In [None]:
from IPython.display import HTML
# Read the stylesheet inside a context manager so the file handle is closed
# as soon as its contents have been read (the original left it open).
with open("../input/notebookassets/light_green.css") as css_file:
    f = css_file.read()
# Last expression of the cell: wrap the CSS in a <style> tag for display.
HTML(f"<style>{f}</style>")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import datetime
from colorama import Fore, Style
from math import floor, ceil

import plotly.express as px
import plotly.graph_objs as go

import warnings
warnings.simplefilter("ignore")

try:
    import dabl
except:
    ! pip -q install dabl
    import dabl
plt.style.use('classic')

A little utility function to print a string in the provided color.
Feel free to use 😁

In [None]:
def cout(string: str, color: str) -> None:
    """
    Print a string in the required color.

    Args:
        string: Text to print.
        color: Prefix placed in front of the text; callers in this notebook
            pass colorama ``Fore`` constants such as ``Fore.CYAN``.

    Returns:
        None. The text is only printed; nothing is returned.
    """
    # Append Style.RESET_ALL after the text so the styling is reset for later output.
    print(color + string + Style.RESET_ALL)

In [None]:
# Load the Windows Store CSV into the DataFrame used by the rest of the notebook.
data = pd.read_csv("../input/windows-store/msft.csv")
# Last expression of the cell: preview the first rows.
data.head()

In [None]:
# Show pandas' descriptive statistics for the DataFrame.
data.describe()

In [None]:
# Report the DataFrame's shape tuple in cyan.
cout(f"The Shape of the data is: {data.shape}", Fore.CYAN)

In [None]:
# isna().sum() gives the missing count per column; summing again gives the
# total number of missing cells. The previous `.any()` collapsed this to a
# boolean, so the message could only ever report 0 or 1.
cout(f"There are: {int(data.isna().sum().sum())} Nan Values in the Data", Fore.GREEN)

# EDA

In [None]:
# Drop rows with missing values before the analysis below.
data = data.dropna()

## Prices
I am converting the prices from numbers into 4 categories:
- F - Free Apps
- C - Costly Apps (Apps costing up to 250 INR)
- VC - Very Costly (Apps costing between 250 and 600 INR)
- VVC  - Very Very Costly (Apps costing more than 600 INR)

This is my division of prices; you can subcategorize it further if you want.

In [None]:
def clean_prices(string):
    """
    Map a raw price string to one of the price category codes.

    Args:
        string: Either the literal "Free" or a price string. The numeric part
            is taken after removing commas, dropping the first 2 characters
            and dropping the last 3 characters (presumably a currency prefix
            and a ".00"-style suffix; confirm against the dataset).

    Returns:
        "F" for free apps, "C" for prices up to 250, "VC" for prices above
        250 and up to 600, and "VVC" for anything higher.

    Raises:
        ValueError: If the remaining text cannot be parsed as an integer.
    """
    if string == "Free":
        return "F"

    # Remove thousands separators, then slice away the prefix and suffix.
    amount = int(string.replace(',', '')[2:-3])

    if amount <= 250:
        return "C"
    # Reaching this point already implies amount > 250.
    if amount <= 600:
        return "VC"
    return "VVC"
# Replace each raw price string with its category code (F / C / VC / VVC).
data['Price'] = data['Price'].apply(clean_prices)

### Let's Visualize the Price Distribution of apps

In [None]:
# Count the apps in each price category once and reuse the result.
price_counts = data['Price'].value_counts()
targets = price_counts.tolist()
values = list(price_counts.index)

# Derive the slice names from the category codes actually present, in the same
# order as the counts. A hard-coded name list silently mislabels slices
# whenever value_counts() returns the categories in a different order.
price_labels = {
    "F": "Free",
    "C": "Costly",
    "VC": "Very Costly",
    "VVC": "Very Very Costly",
}
names = [price_labels.get(code, code) for code in values]

fig = px.pie(
    values=targets,
    names=names,
    title='App Prices Distribution'
)
fig.show()

## Ratings
There are **9 unique app rating values**;
- 3.5
- 3.0
- 2.0
- 4.5
- 4.0
- 1.0
- 2.5
- 5.0
- 1.5

In [None]:
# Pie chart of how the apps are distributed over the rating values
rating_counts = data['Rating'].value_counts()
targets = rating_counts.tolist()      # slice sizes
values = list(rating_counts.index)    # slice names, in the same order as the sizes

fig = px.pie(values=targets, names=values, title='App Ratings Distribution')
fig.show()

In [None]:
# The same rating counts, drawn as a bar chart
rating_counts = data['Rating'].value_counts()
targets = rating_counts.tolist()      # bar heights
values = list(rating_counts.index)    # bar positions, also used for coloring

axis_labels = {'x':'Ratings', 'y':'Count'}
fig = px.bar(
    x=values,
    y=targets,
    color=values,
    labels=axis_labels,
    title="Ratings Count Distribution",
)
fig.show()

## Category
There are 13 Categories of apps in the dataset:

In [None]:
# List the distinct values found in the Category column.
data['Category'].unique()

In [None]:
# Pie chart of how the apps are distributed over the categories
category_counts = data['Category'].value_counts()
targets = category_counts.tolist()      # slice sizes
values = list(category_counts.index)    # slice names, in the same order as the sizes

fig = px.pie(values=targets, names=values, title='Categories Distribution')
fig.show()

In [None]:
# The same category counts, drawn as a bar chart
category_counts = data['Category'].value_counts()
targets = category_counts.tolist()      # bar heights
values = list(category_counts.index)    # bar positions, also used for coloring

axis_labels = {'x':'Categories', 'y':'Count'}
fig = px.bar(
    x=values,
    y=targets,
    color=values,
    labels=axis_labels,
    title="Category Count Distribution",
)
fig.show()

## Number of People Rated

Generally speaking, if an application has more reviews, it is more likely to have more downloads; if that logic is correct, the applications in our dataset with more reviews will tend to be more popular.

In [None]:
# Work on the review-count column once instead of re-selecting it each time.
review_counts = data['No of people Rated']

mean_reviews = floor(review_counts.mean())
max_rev = review_counts.max()
min_rev = review_counts.min()

# Several apps can share the extreme value, so collect every matching name.
max_reviews_apps = data[review_counts == max_rev]['Name'].tolist()
min_reviews_apps = data[review_counts == min_rev]['Name'].tolist()

cout(f"Average App reviews are: {mean_reviews}", Fore.CYAN)
cout(f"Apps with most reviews are: {max_reviews_apps} having: {max_rev} reviews.", Fore.BLUE)
# This line reports the minimum; the message previously said "most" here as well.
cout(f"Apps with least reviews are: {min_reviews_apps} having: {min_rev} reviews.", Fore.MAGENTA)

## Let's Now DABL

The thing is, DABL won't work really well since the data has both datetime objects and a lot of text data, which dabl can't handle very well for the time being.

In [None]:
# Set an 18x6 default figure size and the 'fivethirtyeight' style, then
# let dabl plot the data with Price as the target column.
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col = 'Price')

In [None]:
# Same figure size and style setup, this time with Category as the target column.
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col = 'Category')

In [None]:
# Same figure size and style setup, this time with Rating as the target column.
plt.rcParams['figure.figsize'] = (18, 6)
plt.style.use('fivethirtyeight')
dabl.plot(data, target_col = 'Rating')