# This notebook includes 5 sections (plus Section 0 for imports)

- Section 0 : Import Libraries
- Section 1 : Read & Check Data
- Section 2 : Data Cleaning & Preprocessing
- Section 3 : EDA
- Section 4 : Prophet & ARIMA Forecasting
- Section 5 : Prophet & ARIMA Forecasting without free apps (price = 0)

# Columns: 

- Name: Name of the app.
- Rating: Rating for the app.
- No of People Rated : Number of people who rated the app.
- Category : Category of the app.
- Date : Date when the app was posted.
- Price : Price of the app.
- Price_usd : Price converted to US dollars (1 Indian rupee = 0.013 USD).
- year
- month
- day
- day_of_week

# Import Libraries

In [None]:
pip install chart_studio

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
plt.style.use("ggplot")
import plotly 
import warnings
warnings.filterwarnings("ignore")
import plotly.graph_objs as go 
import chart_studio.plotly as py
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from matplotlib import pyplot
from statsmodels.tsa.arima_model import ARIMA
import fbprophet
from pandas.plotting import autocorrelation_plot

# 1) Read & Check Data

In [None]:
import os

# Walk the Kaggle input directory and print the full path of every file found,
# so the dataset location can be checked before it is loaded.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
# Load the Windows Store apps dataset from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/windows-store/msft.csv')

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() also printed a stray "None".
data.info()

# Preview the first rows (displayed as the cell output).
data.head()

# 2) Data Clean & Preprocessing

- Drop NaN rows
- Change Price type.

In [None]:
# Count the missing values in each column before any cleaning is done.
missing_per_column = data.isna().sum()
print(missing_per_column)

In [None]:
# Discard every row that has at least one missing value, then re-count the
# missing values per column to confirm that none are left.
data = data.dropna(axis='index')
print(data.isna().sum())

# Price

In [None]:
# List the distinct raw values of the Price column (displayed as the cell output).
data['Price'].unique()

In [None]:
# Convert the raw price strings into floats:
#   1. rewrite 'Free' as '₹ 0' (the split below needs a space in every value),
#   2. keep only the text after the first space, i.e. drop the currency symbol,
#   3. remove the commas and cast the result to float64.
price_text = data['Price'].replace('Free', '₹ 0')
amount_text = price_text.apply(lambda text: text.split(' ', 1)[1])
data['Price'] = amount_text.str.replace(',', '').astype('float64')

In [None]:
# Conversion rate used by this notebook: 1 Indian rupee = 0.013 USD.
INR_TO_USD = 0.013
data['Price_usd'] = data['Price'] * INR_TO_USD

# Date

In [None]:
# Parse the Date column into datetimes, then derive calendar features from it.
data['Date'] = pd.to_datetime(data['Date'])
date_parts = data['Date'].dt
data['year'] = date_parts.year
data['month'] = date_parts.month
data['day'] = date_parts.day
data['day_of_week'] = date_parts.dayofweek  # 0 = Monday ... 6 = Sunday

`day_of_week` values: 0 = Monday ... 6 = Sunday

# ----------------------------------------------------------------------------------------

# 3) EDA

In [None]:
# Bar chart of the mean USD price for each day of the week.
mean_price_by_weekday = data.groupby('day_of_week')['Price_usd'].mean()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=mean_price_by_weekday.index,
    y=mean_price_by_weekday.values,
    data=data,
)
ax.set_xlabel('day_of_week')
ax.set_title('Mean Price_usd - day_of_week')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the summed USD price for each day of the week.
total_price_by_weekday = data.groupby('day_of_week')['Price_usd'].sum()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=total_price_by_weekday.index,
    y=total_price_by_weekday.values,
    data=data,
)
ax.set_xlabel('day_of_week')
ax.set_title('Sum Price_usd - day_of_week')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the mean USD price for each day of the month.
mean_price_by_day = data.groupby('day')['Price_usd'].mean()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=mean_price_by_day.index,
    y=mean_price_by_day.values,
    data=data,
)
ax.set_xlabel('day')
ax.set_title('Mean Price_usd - day')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the summed USD price for each day of the month.
total_price_by_day = data.groupby('day')['Price_usd'].sum()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=total_price_by_day.index,
    y=total_price_by_day.values,
    data=data,
)
ax.set_xlabel('day')
ax.set_title('Sum Price_usd - day')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the mean USD price for each calendar month.
mean_price_by_month = data.groupby('month')['Price_usd'].mean()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=mean_price_by_month.index,
    y=mean_price_by_month.values,
    data=data,
)
ax.set_xlabel('month')
ax.set_title('Mean Price_usd - month')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the summed USD price for each calendar month.
total_price_by_month = data.groupby('month')['Price_usd'].sum()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=total_price_by_month.index,
    y=total_price_by_month.values,
    data=data,
)
ax.set_xlabel('month')
ax.set_title('Sum Price_usd - month')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the mean USD price for each year.
mean_price_by_year = data.groupby('year')['Price_usd'].mean()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=mean_price_by_year.index,
    y=mean_price_by_year.values,
    data=data,
)
ax.set_xlabel('year')
ax.set_title('Mean Price_usd - year')
ax.set_ylabel('Price_usd')

In [None]:
# Bar chart of the summed USD price for each year.
total_price_by_year = data.groupby('year')['Price_usd'].sum()

plt.figure(figsize=(12, 8))
ax = sns.barplot(
    x=total_price_by_year.index,
    y=total_price_by_year.values,
    data=data,
)
ax.set_xlabel('year')
ax.set_title('Sum Price_usd - year')
ax.set_ylabel('Price_usd')

In [None]:
# Number of apps per category; the x tick labels are rotated by 90 degrees.
chart = sns.countplot(x=data['Category'])
category_labels = chart.get_xticklabels()
chart.set_xticklabels(category_labels, rotation=90)

In [None]:
# Number of apps per rating value, drawn as a Plotly bar chart with the count
# written outside each bar.
data_rating = data['Rating'].value_counts()
rating_trace = go.Bar(name='Rating', x=data_rating.index, y=data_rating)

fig = go.Figure(data=[rating_trace])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.update_layout(barmode='group', hovermode='x', title_text='Rating')
fig.show()

In [None]:
# Number of apps per rating value, split by category.
sns.countplot(data=data, x='Rating', hue='Category')

In [None]:
# Per category, compare how many apps are free (Price_usd == 0) and how many
# are paid, as grouped Plotly bars.
is_free = data['Price_usd'] == 0
data_free = data.loc[is_free, 'Category'].value_counts()
data_pay = data.loc[~is_free, 'Category'].value_counts()

free_trace = go.Bar(name='Free', x=data_free.index, y=data_free)
paid_trace = go.Bar(name='Paid', x=data_pay.index, y=data_pay)

fig = go.Figure(data=[free_trace, paid_trace])
fig.update_traces(texttemplate='%{value}', textposition='outside')
fig.update_layout(barmode='group', hovermode='x', title_text='Free/Paid Apps')
fig.show()

# Auto Correlation

In [None]:
# Autocorrelation of the USD price, taken in the DataFrame's current row order.
price_usd = data['Price_usd']
autocorrelation_plot(price_usd)
pyplot.show()

# 4) Forecasting Models 

# Forecasting with Prophet

In [None]:
# Prophet reads the timestamps from a column named 'ds' and the target from a
# column named 'y', so the relevant columns are renamed before fitting.
prophet_columns = {'Date': 'ds', 'Price_usd': 'y'}
prophet_data = data.rename(columns=prophet_columns)

fbp = fbprophet.Prophet()
fbp.fit(prophet_data)

In [None]:
# Extend the training dates by 24 month-end periods and predict over the whole
# range (history plus future).
future_dates = fbp.make_future_dataframe(periods=24, freq='M')
data_forecast = fbp.predict(future_dates)

fbp.plot(data_forecast, xlabel='Date', ylabel='Price_usd')
# The title used to read 'Energy Consumption of Turkey (MWh)', which does not
# describe the plotted quantity (app price in USD).
plt.title('Prophet Forecast of App Price (USD)')

In [None]:
# Re-draw the forecast and mark every changepoint of the fitted Prophet model.
forecast_fig = fbp.plot(data_forecast, xlabel='Date', ylabel='Price_usd')
forecast_ax = forecast_fig.gca()

# axvline spans the full height of the axes, so the markers no longer depend on
# a hard-coded y-range (previously ymin=0, ymax=82) unrelated to the price scale.
for changepoint in fbp.changepoints:
    forecast_ax.axvline(changepoint, color='r')

# The title used to read 'Energy Consumption of Turkey (MWh)', which does not
# describe the plotted quantity (app price in USD).
forecast_ax.set_title('Prophet Forecast of App Price (USD) with Changepoints')

print('Change points:')
print(fbp.changepoints)

In [None]:
# Plot the individual components of the Prophet forecast computed from `data`.
fbp.plot_components(data_forecast)

# Daily - Weekly - Monthly

In [None]:
# Mean USD price per month, per week and per day, one panel per frequency.
f, ax = plt.subplots(3, 1, figsize=(16, 9))

# Resample only the price column: averaging the whole frame also tries to
# average the text columns (e.g. Name, Category), which raises a TypeError on
# pandas 2.x. data1/data2/data3 therefore contain just the Price_usd column.
price_by_date = data[['Date', 'Price_usd']]
data1 = price_by_date.resample('M', on='Date').mean()
data2 = price_by_date.resample('W', on='Date').mean()
data3 = price_by_date.resample('D', on='Date').mean()

# Titles and y-labels used to read 'Mean Delay (...)' / 'minute', which did not
# match the plotted quantity (mean Price_usd).
panels = (
    (data1, 'Mean Price_usd (monthly)', 'cyan'),
    (data2, 'Mean Price_usd (weekly)', 'lime'),
    (data3, 'Mean Price_usd (daily)', 'grey'),
)
for axis, (resampled, title, color) in zip(ax, panels):
    resampled['Price_usd'].plot(ax=axis, color=color)
    axis.set_title(title)
    axis.set_xlabel('')
    axis.set_ylabel('Price_usd')

f.subplots_adjust(hspace=0.6)
plt.show()

# ARIMA

In [None]:
# Fit an ARIMA model of order (p=5, d=1, q=0) to the USD price series and print
# the fit summary.
# NOTE(review): `statsmodels.tsa.arima_model.ARIMA` and its `fit(disp=...)`
# argument belong to the legacy statsmodels API - confirm the installed
# statsmodels version still provides them.
arima_order = (5, 1, 0)
model = ARIMA(data['Price_usd'], order=arima_order)
model_fit = model.fit(disp=0)
print(model_fit.summary())

In [None]:
# Inspect the residuals of the fitted ARIMA model: a line plot, a kernel
# density estimate, and summary statistics.
residuals = pd.DataFrame(model_fit.resid)
for plot_kind in ('line', 'kde'):
    residuals.plot(kind=plot_kind)
    pyplot.show()
print(residuals.describe())

In [None]:
# Compare the price series with its first and second differences: each row
# shows the series (left) and its autocorrelation function (right).
plt.rcParams.update({'figure.figsize': (9, 7), 'figure.dpi': 120})

price = data['Price_usd']
first_diff = price.diff()
second_diff = first_diff.diff()

fig, axes = plt.subplots(3, 2, sharex=True)
panels = (
    # (series drawn on the left, series passed to plot_acf, left-panel title)
    (price, price, 'Original Series'),
    (first_diff, first_diff.dropna(), '1st Order Differencing'),
    (second_diff, second_diff.dropna(), '2nd Order Differencing'),
)
for row, (series, acf_input, title) in enumerate(panels):
    axes[row, 0].plot(series)
    axes[row, 0].set_title(title)
    plot_acf(acf_input, ax=axes[row, 1])

plt.show()

# 5) Forecasting Models without Free Price 

There are many 0 values in data['Price_usd'] (free apps), so I drop those rows and run the forecasting models again.

# Prophet_without 0

In [None]:
# Build a new DataFrame without the free apps (rows whose Price_usd is 0).
zero_price_mask = data["Price_usd"] == 0
drop_0_list = data.index[zero_price_mask].tolist()
drop_0 = data.drop(index=drop_0_list)
drop_0.head()

In [None]:
# Fit a second Prophet model on the data without zero-priced apps (drop_0).
# Prophet reads the timestamps from 'ds' and the target from 'y'.
renamed_columns = {'Date': 'ds', 'Price_usd': 'y'}
prophet_data_2 = drop_0.rename(columns=renamed_columns)

fbp_2 = fbprophet.Prophet()
fbp_2.fit(prophet_data_2)

In [None]:
# Extend the training dates of the second model by 24 month-end periods and
# predict over the whole range (history plus future).
future_dates_2 = fbp_2.make_future_dataframe(periods=24, freq='M')
data_forecast_2 = fbp_2.predict(future_dates_2)

# Plot with fbp_2, the model that produced this forecast. The cell used to call
# fbp.plot(), which draws the training points of the first model (fitted on
# data that still contained the free apps) over the second model's forecast.
fbp_2.plot(data_forecast_2, xlabel='Date', ylabel='Price_usd')
# The title used to read 'Energy Consumption of Turkey (MWh)', which does not
# describe the plotted quantity (app price in USD).
plt.title('Prophet Forecast of App Price (USD), Free Apps Excluded')

In [None]:
# Re-draw the forecast of the second model (free apps excluded) and mark every
# changepoint of that model.
forecast_fig_2 = fbp_2.plot(data_forecast_2, xlabel='Date', ylabel='Price_usd')
forecast_ax_2 = forecast_fig_2.gca()

# axvline spans the full height of the axes, so the markers no longer depend on
# a hard-coded y-range (previously ymin=0, ymax=82) unrelated to the price scale.
for changepoint in fbp_2.changepoints:
    forecast_ax_2.axvline(changepoint, color='r')

# The title used to read 'Energy Consumption of Turkey (MWh)', which does not
# describe the plotted quantity (app price in USD).
forecast_ax_2.set_title(
    'Prophet Forecast of App Price (USD) with Changepoints, Free Apps Excluded'
)

print('Change points:')
print(fbp_2.changepoints)

In [None]:
# Plot the individual components of the second Prophet forecast (the model
# fitted on drop_0, i.e. without zero-priced apps).
fbp_2.plot_components(data_forecast_2)

# Daily - Weekly - Monthly

In [None]:
# Mean USD price of the paid apps (drop_0) per month, per week and per day,
# one panel per frequency.
f, ax = plt.subplots(3, 1, figsize=(16, 9))

# Resample only the price column: averaging the whole frame also tries to
# average the text columns (e.g. Name, Category), which raises a TypeError on
# pandas 2.x. data1_1/data2_1/data3_1 therefore contain just Price_usd.
paid_price_by_date = drop_0[['Date', 'Price_usd']]
data1_1 = paid_price_by_date.resample('M', on='Date').mean()
data2_1 = paid_price_by_date.resample('W', on='Date').mean()
data3_1 = paid_price_by_date.resample('D', on='Date').mean()

# Titles and y-labels used to read 'Mean Delay (...)' / 'minute', which did not
# match the plotted quantity (mean Price_usd).
panels = (
    (data1_1, 'Mean Price_usd (monthly)', 'cyan'),
    (data2_1, 'Mean Price_usd (weekly)', 'lime'),
    (data3_1, 'Mean Price_usd (daily)', 'grey'),
)
for axis, (resampled, title, color) in zip(ax, panels):
    resampled['Price_usd'].plot(ax=axis, color=color)
    axis.set_title(title)
    axis.set_xlabel('')
    axis.set_ylabel('Price_usd')

f.subplots_adjust(hspace=0.6)
plt.show()

# ARIMA_without 0

In [None]:
# Fit an ARIMA model of order (p=5, d=1, q=0) to the USD prices of the paid
# apps (drop_0) and print the fit summary.
# NOTE(review): `statsmodels.tsa.arima_model.ARIMA` and its `fit(disp=...)`
# argument belong to the legacy statsmodels API - confirm the installed
# statsmodels version still provides them.
arima_order_2 = (5, 1, 0)
model_2 = ARIMA(drop_0['Price_usd'], order=arima_order_2)
model_fit_2 = model_2.fit(disp=0)
print(model_fit_2.summary())

In [None]:
# Inspect the residuals of the second ARIMA fit (free apps excluded): a line
# plot, a kernel density estimate, and summary statistics.
residuals_2 = pd.DataFrame(model_fit_2.resid)
for plot_kind in ('line', 'kde'):
    residuals_2.plot(kind=plot_kind)
    pyplot.show()
print(residuals_2.describe())

In [None]:
# Compare the paid-app price series (drop_0) with its first and second
# differences: each row shows the series (left) and its autocorrelation
# function (right).
plt.rcParams.update({'figure.figsize': (9, 7), 'figure.dpi': 120})

paid_price = drop_0['Price_usd']
paid_first_diff = paid_price.diff()
paid_second_diff = paid_first_diff.diff()

fig, axes = plt.subplots(3, 2, sharex=True)
panels = (
    # (series drawn on the left, series passed to plot_acf, left-panel title)
    (paid_price, paid_price, 'Original Series'),
    (paid_first_diff, paid_first_diff.dropna(), '1st Order Differencing'),
    (paid_second_diff, paid_second_diff.dropna(), '2nd Order Differencing'),
)
for row, (series, acf_input, title) in enumerate(panels):
    axes[row, 0].plot(series)
    axes[row, 0].set_title(title)
    plot_acf(acf_input, ax=axes[row, 1])

plt.show()