#### Importing the necessary libraries. 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import plotly.express as px
import plotly as py
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.tsa.arima_model import ARIMA
from progressbar import *

from sklearn.preprocessing import LabelEncoder

from math import sqrt

# Function for splitting training and test set
from sklearn.model_selection import train_test_split

# Function to perform data standardization 
from sklearn.preprocessing import StandardScaler

# Import classes for ML Models
from sklearn.linear_model import Ridge  ## Linear Regression + L2 regularization
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

# Evaluation Metrics
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae


from keras.models import Sequential, Model
from keras import optimizers
from keras.layers import Dense
import tensorflow as tf

from tensorflow.keras import backend as K
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import LSTM, Dense, RepeatVector, Masking, TimeDistributed
from tensorflow. keras.utils import plot_model

from tensorflow.keras.optimizers import Adam

import warnings
warnings.filterwarnings("ignore")

## Exploratory Data Analysis
#### Reading CSV file data.


In [None]:
# Load the raw Iowa liquor sales export into a DataFrame.
csv_path = "../input/iowa-liquor-sales/Iowa_Liquor_Sales.csv"
df = pd.read_csv(csv_path)

Displaying first 5 rows.

In [None]:
# Preview the first five rows of the frame.
df.head(n=5)

In [None]:
# Preview the final five rows of the frame.
df.tail(n=5)

Printing the shape of the data.

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

We have around 12 million rows and 24 columns in the dataset.

#### Columns of the dataset

In [None]:
# Column labels of the DataFrame.
df.columns

In [None]:
# Checking the dtype of each column
df.dtypes

In [None]:
# Print a concise summary of the DataFrame.
df.info()

In [None]:
# Print how many distinct cities appear in the data, then display the
# distinct city names themselves.
city_col = df['City']
print(city_col.nunique())
city_col.unique()

> So there are 793 unique cities in our data.

In [None]:
# Count how many rows belong to each city.
df['City'].value_counts()

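Des Moines appears in the largest number of rows, i.e. it has the most recorded sales entries. Note that this counts rows in the dataset, not distinct shops.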

In [None]:
# Per-column count of missing values (isna is the canonical spelling of isnull).
df.isna().sum()

There are null values, so we simply drop every row that contains one; this also reduces the size of the dataset.

In [None]:
# Remove, in place, every row that has a missing value in any column,
# then report the resulting (rows, columns) of the frame.
df.dropna(axis=0, how='any', inplace=True)
print(df.shape)

In [None]:
# Remove exact duplicate rows.
# drop_duplicates() returns a new frame and leaves the original untouched, so
# the result has to be assigned back; calling it without assignment (as this
# cell previously did) silently keeps every duplicate.
rows_before_dedup = len(df)
df = df.drop_duplicates()
print(f"Removed {rows_before_dedup - len(df)} duplicate rows")
df.shape

#### Converting the date into separate month, day and year columns.

In [None]:
# Parse the date column once, then fan the parsed values out into separate
# calendar-component columns (Year, Month, Day - created in that order).
df['Date'] = pd.to_datetime(df['Date'])
for part_column, dt_attribute in (('Year', 'year'), ('Month', 'month'), ('Day', 'day')):
    df[part_column] = getattr(df['Date'].dt, dt_attribute)

In [None]:
# Preview the first five rows, now including the Year / Month / Day columns.
df.head(n=5)

In [None]:
# Print the earliest and then the latest date present in the dataset.
date_col = df['Date']
for date_bound in (date_col.min(), date_col.max()):
    print(date_bound)

* So the dataset covers the period from 3rd January 2012 to 31st October 2017.

In [None]:
# Print how many distinct liquor categories exist, then display their names.
category_col = df['Category Name']
print(category_col.nunique())
category_col.unique()

As we can see, there are 130 unique liquor categories.

In [None]:
# Strip the dollar symbol from the sales column and convert it to float.
#
# * regex=False states explicitly that '$' is a literal character; as a regular
#   expression '$' is the end-of-string anchor, and how a bare '$' is treated
#   has depended on the pandas version.
# * The dtype guard makes the cell safe to re-run: once the column is numeric
#   the .str accessor is unavailable and the unguarded version raised
#   AttributeError.
sale_col = df['Sale (Dollars)']
if not pd.api.types.is_numeric_dtype(sale_col):
    sale_col = sale_col.str.replace('$', '', regex=False)
df['Sale (Dollars)'] = sale_col.astype('float')

#### Top 10 liquor categories with the highest sales.

In [None]:
# Rank liquor categories by sales and keep the ten largest for plotting.
#
# Aggregation steps, in order:
#   1. total sales per (category, pack, date)
#   2. the largest of those daily totals per (category, pack)
#   3. the sum of those per-pack maxima per category, sorted descending
#
# NOTE(review): step 2 takes the max over dates, so the ranking is based on
# summed peak-day sales rather than total sales. If total sales per category
# is the intended metric, df.groupby('Category Name')['Sale (Dollars)'].sum()
# would give it - confirm intent before changing, since it alters the chart.
#
# fillna() is applied only to the four columns that are actually used; filling
# the whole frame copied every column of a very large table for no benefit.
df_plot = (
    df[['Category Name', 'Pack', 'Date', 'Sale (Dollars)']]
    .fillna('NA')
    .groupby(['Category Name', 'Pack', 'Date'])['Sale (Dollars)'].sum()
    .groupby(['Category Name', 'Pack']).max().sort_values()
    .groupby(['Category Name']).sum().sort_values(ascending=False)
)
top_count = pd.DataFrame(df_plot)
top_count1 = pd.DataFrame(df_plot.head(10))

In [None]:
import plotly.graph_objects as go

# Bar chart of the selected categories; bar height and colour both encode
# the 'Sale (Dollars)' value.
category_layout = dict(
    title="Sales of liquor per category",
    xaxis_title=" Category Name",
    yaxis_title="Sales in dollars",
)
fig_reg = px.bar(
    data_frame=top_count1,
    x=top_count1.index,
    y='Sale (Dollars)',
    color='Sale (Dollars)',
)
fig_reg.update_layout(**category_layout)
fig_reg.show()

As we can see, Canadian whiskies have the highest sales, perhaps because people simply like to drink them more, or because the category includes very well-known liquor brands.

#### Sales of liquor by city name (top 20).

In [None]:
# Rank cities by sales and plot the top 20.
#
# Aggregation steps: total sales per (city, pack, date) -> largest daily total
# per (city, pack) -> sum of those maxima per city, sorted descending.
#
# NOTE(review): because of the max over dates, the bars show summed peak-day
# sales, not total sales. If totals are intended,
# df.groupby('City')['Sale (Dollars)'].sum() would give them - confirm intent.
#
# fillna() is restricted to the columns in use to avoid copying the whole
# frame, and the redundant first assignment of top_count1 (immediately
# overwritten) has been removed.
df_plot = (
    df[['City', 'Pack', 'Date', 'Sale (Dollars)']]
    .fillna('NA')
    .groupby(['City', 'Pack', 'Date'])['Sale (Dollars)'].sum()
    .groupby(['City', 'Pack']).max().sort_values()
    .groupby(['City']).sum().sort_values(ascending=False)
)
top_count1 = pd.DataFrame(df_plot.head(20))

fig_reg = px.bar(top_count1, x=top_count1.index, y='Sale (Dollars)', color='Sale (Dollars)')
fig_reg.update_layout(
    title="Sales of liquor per city",
    xaxis_title=" City Name",
    yaxis_title="Sales in dollars",
)
fig_reg.show()

The city of CEDAR RAPIDS has the highest liquor sales.

In [None]:
# Rank calendar months by sales and plot them.
#
# Aggregation steps: total sales per (month, pack, date) -> largest daily total
# per (month, pack) -> sum of those maxima per month, sorted descending.
#
# NOTE(review): because of the max over dates, the bars show summed peak-day
# sales, not total sales. If totals are intended,
# df.groupby('Month')['Sale (Dollars)'].sum() would give them - confirm intent.
#
# fillna() is restricted to the columns in use to avoid copying the whole
# frame, and the redundant first assignment of top_count1 (immediately
# overwritten) has been removed.
df_plot = (
    df[['Month', 'Pack', 'Date', 'Sale (Dollars)']]
    .fillna('NA')
    .groupby(['Month', 'Pack', 'Date'])['Sale (Dollars)'].sum()
    .groupby(['Month', 'Pack']).max().sort_values()
    .groupby(['Month']).sum().sort_values(ascending=False)
)
# head(50) is kept from the original cell; it caps the number of rows plotted.
top_count1 = pd.DataFrame(df_plot.head(50))

fig_reg = px.bar(top_count1, x=top_count1.index, y='Sale (Dollars)', color='Sale (Dollars)')
fig_reg.update_layout(
    title="Sales of liquor per Month",
    xaxis_title=" Month Number",
    yaxis_title="Sales in dollars",
)
fig_reg.show()

From the above plot it is clear that October has the highest liquor sales.

In [None]:
# Total sales for each date, returned as a two-column frame
# ('Date', 'Sale (Dollars)') with a fresh integer index.
daily_sales = df.groupby('Date')['Sale (Dollars)'].sum().reset_index()

In [None]:
# Line chart of the summed sales for every date.
daily_sales_sc = go.Scatter(
    x=daily_sales['Date'],
    y=daily_sales['Sale (Dollars)'],
)
layout = go.Layout(
    title='Daily sales',
    xaxis={'title': 'Date'},
    yaxis={'title': 'Sales'},
)
fig = go.Figure(data=[daily_sales_sc], layout=layout)
iplot(fig)

> There is a large spike in liquor sales in Sep 2013.

#### Sales of liquors with category and the volume of liquor sold.

In [None]:
# Build two per-category tables with the same aggregation recipe: one for
# sales in dollars (top_count1) and one for volume sold in litres (top_count2).
#
# Recipe: total per (category, pack, date) -> largest daily total per
# (category, pack) -> sum of those maxima per category, sorted descending.
#
# NOTE(review): because of the max over dates, both tables hold summed
# peak-day figures, not overall totals. If totals are intended, a plain
# df.groupby('Category Name')[<column>].sum() would give them - confirm intent.
#
# fillna() is restricted to the columns each chain uses, instead of copying
# the whole frame twice; the commented-out head(10) lines were removed.
df_plot = (
    df[['Category Name', 'Pack', 'Date', 'Sale (Dollars)']]
    .fillna('NA')
    .groupby(['Category Name', 'Pack', 'Date'])['Sale (Dollars)'].sum()
    .groupby(['Category Name', 'Pack']).max().sort_values()
    .groupby(['Category Name']).sum().sort_values(ascending=False)
)
top_count1 = pd.DataFrame(df_plot)

df_plot = (
    df[['Category Name', 'Pack', 'Date', 'Volume Sold (Liters)']]
    .fillna('NA')
    .groupby(['Category Name', 'Pack', 'Date'])['Volume Sold (Liters)'].sum()
    .groupby(['Category Name', 'Pack']).max().sort_values()
    .groupby(['Category Name']).sum().sort_values(ascending=False)
)
top_count2 = pd.DataFrame(df_plot)

In [None]:
# Put the sales and volume figures side by side per category, order the
# categories by their sales figure (largest first) and keep the first ten.
top_count = pd.concat([top_count1, top_count2], axis='columns')
top_count = top_count.sort_values(by='Sale (Dollars)', ascending=False).head(10)
top_count

In [None]:
# Grouped bar chart: for each category, one bar for the sales figure and one
# for the volume figure, drawn next to each other.
bar_specs = (
    ('sale in dollars', 'Sale (Dollars)'),
    ('Volume in litres', 'Volume Sold (Liters)'),
)
fig = go.Figure(data=[
    go.Bar(name=trace_name, x=top_count.index, y=top_count[value_column])
    for trace_name, value_column in bar_specs
])
# barmode='group' places the two traces side by side rather than stacked.
fig.update_layout(
    barmode='group',
    title="Sales of liquors with category and the volume of liquor sold.",
    xaxis_title=" category",
    yaxis_title="Sale and the amount of liquor sold in litre.",
)
fig.show()


Canadian Whiskies has the highest sales, with about 72K litres sold.

#### Volume sold in litres by vendor name.

In [None]:
# Volume sold per vendor, plotted for every vendor in the data.
#
# Aggregation steps: total volume per (vendor, pack, date) -> largest daily
# total per (vendor, pack) -> sum of those maxima per vendor, sorted descending.
#
# NOTE(review): because of the max over dates, the bars show summed peak-day
# volume, not total volume. If totals are intended,
# df.groupby('Vendor Name')['Volume Sold (Liters)'].sum() would give them -
# confirm intent.
#
# fillna() is restricted to the columns in use to avoid copying the whole
# frame; the commented-out head(50) line was removed.
df_plot = (
    df[['Vendor Name', 'Pack', 'Date', 'Volume Sold (Liters)']]
    .fillna('NA')
    .groupby(['Vendor Name', 'Pack', 'Date'])['Volume Sold (Liters)'].sum()
    .groupby(['Vendor Name', 'Pack']).max().sort_values()
    .groupby(['Vendor Name']).sum().sort_values(ascending=False)
)
top_count1 = pd.DataFrame(df_plot)

fig_reg = px.bar(top_count1, x=top_count1.index, y='Volume Sold (Liters)', color='Volume Sold (Liters)')
fig_reg.update_layout(
    title="Volume sold by vendor name",
    xaxis_title=" Vendor Name",
    yaxis_title="Liquor sold in Litres",
)
fig_reg.show()