In [112]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rcParams
import warnings

Reading Training File

In [113]:
# Load the training set from its CSV file and preview the first rows.
train_csv_path = '../input/store-sales-time-series-forecasting/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data.head()

Reading the other supplementary files

In [114]:
# Load the four supplementary tables; the order of the paths below matches
# the order of the names on the left-hand side of the assignment.
stores, holidays, oil, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/store-sales-time-series-forecasting/stores.csv',
        '../input/store-sales-time-series-forecasting/holidays_events.csv',
        '../input/store-sales-time-series-forecasting/oil.csv',
        '../input/store-sales-time-series-forecasting/transactions.csv',
    )
)

In [115]:
# Convert the 'date' column of the training data to pandas datetimes.
parsed_dates = pd.to_datetime(train_data['date'])
train_data['date'] = parsed_dates

Merging the store metadata into the training (sales) data

In [116]:
# Left-join the store attributes onto every training row, keyed on 'store_nbr',
# then preview the combined frame.
store_data = train_data.merge(stores, how='left', on='store_nbr')
store_data.head()

Total unit sales for each state

In [117]:
# Sum sales within each state and order the states from highest to lowest total.
state_sales = (
    store_data
    .groupby('state', as_index=False)['sales']
    .sum()
    .sort_values('sales', ascending=False)
)

In [118]:
# Preview the first rows of the per-state totals (sorted by sales, descending).
state_sales.head()

In [119]:
# Bar chart of total sales per state on an 18x5 inch figure.
sns.set(rc={'figure.figsize': (18, 5)})
ax = sns.barplot(x="state", y="sales", data=state_sales)
plt.title('Total sales per state')
plt.xlabel('State')
plt.ylabel('Total Sales')
# The tick rotation must be a number: current matplotlib accepts only a number
# or the keywords 'vertical'/'horizontal', so the string '60' raises ValueError.
plt.xticks(rotation=60)
plt.show()

From the above data, we can infer that the state of Pichincha contributes a very large share of total sales

City-wise sales analysis

In [120]:
# Sum sales within each city and order the cities from highest to lowest total.
city_sales = (
    store_data
    .groupby('city', as_index=False)['sales']
    .sum()
    .sort_values('sales', ascending=False)
)

In [121]:
# Preview the first rows of the per-city totals (sorted by sales, descending).
city_sales.head()

In [122]:
# Bar chart of total sales per city on an 18x5 inch figure.
sns.set(rc={'figure.figsize': (18, 5)})
ax = sns.barplot(x="city", y="sales", data=city_sales)
plt.title('Total sales per City')
plt.xlabel('City')
plt.ylabel('Total Sales')
# The tick rotation must be a number: current matplotlib accepts only a number
# or the keywords 'vertical'/'horizontal', so the string '90' raises ValueError.
plt.xticks(rotation=90)
plt.show()

In [123]:
# Display the rows whose city is Quito.
quito_mask = store_data['city'].eq('Quito')
store_data[quito_mask]

From the above chart, most products are sold by stores located in Quito

Sales per Store

In [124]:
# Total unit sales per store number, ordered from highest to lowest total.
store_sales = (
    store_data
    .groupby('store_nbr', as_index=False)['sales']
    .sum()
    .sort_values('sales', ascending=False)
)

In [125]:
# Preview the first rows of the per-store totals (sorted by sales, descending).
store_sales.head()

In [126]:
# Bar chart of total sales per store number on an 18x5 inch figure.
sns.set(rc={'figure.figsize': (18, 5)})
ax = sns.barplot(x="store_nbr", y="sales", data=store_sales)
# This chart is broken down by store, so the title names stores (the previous
# 'per City' title was carried over from the city chart).
plt.title('Total sales per Store')
plt.xlabel('Store Number')
plt.ylabel('Total Sales')
# The tick rotation must be a number: current matplotlib accepts only a number
# or the keywords 'vertical'/'horizontal', so the string '90' raises ValueError.
plt.xticks(rotation=90)
plt.show()

In [127]:
# Display the rows for store numbers from 44 (inclusive) up to 50 (exclusive),
# to check which state they belong to.
store_numbers = store_data['store_nbr']
store_data[store_numbers.ge(44) & store_numbers.lt(50)]

In [128]:
# Keep only the rows whose state is Pichincha and report the subset's shape.
is_pichincha = store_data['state'].eq('Pichincha')
pichincha_data = store_data[is_pichincha]
pichincha_data.shape

In [129]:
# Number of rows per store number within the Pichincha subset.
pichincha_data['store_nbr'].value_counts()

In [130]:
# Percentage of training rows that belong to Pichincha. This is a share of the
# row count, not of the sales value.
pichincha_rows = pichincha_data.shape[0]
total_rows = train_data.shape[0]
pichincha_data_percent = (pichincha_rows / total_rows) * 100
pichincha_data_percent

From the above, about 35% of the data comes from the high-sales-volume state of Pichincha, so this state's data is dropped

In [131]:
# Exclude every Pichincha row from the working frame, then report its new shape.
not_pichincha = store_data['state'].ne('Pichincha')
store_data = store_data[not_pichincha]
store_data.shape

Unit sales per product family

In [132]:
# Sum sales within each product family, ordered from highest to lowest total.
family_sales = store_data.groupby(['family'], as_index=False)['sales'].sum()
family_sales = family_sales.sort_values(by='sales', ascending=False)

# Bar chart of total sales per product family on an 18x5 inch figure.
sns.set(rc={'figure.figsize': (18, 5)})
ax = sns.barplot(x="family", y="sales", data=family_sales)
plt.title('Total sales per Family of Products')
plt.xlabel('Family')
plt.ylabel('Total Sales')
# The tick rotation must be a number: current matplotlib accepts only a number
# or the keywords 'vertical'/'horizontal', so the string '90' raises ValueError.
plt.xticks(rotation=90)
plt.show()

In [133]:
# List the family names in the current row order of family_sales.
family_sales['family']

Replacing the long tail of product families with the family 'OTHERS'

In [134]:
# Product families that are collapsed into the single 'OTHERS' label.
tail_families = [
    'HOME AND KITCHEN I',
    'GROCERY II',
    'SEAFOOD',
    'HOME AND KITCHEN II',
    'CELEBRATION',
    'LINGERIE',
    'AUTOMOTIVE',
    'LAWN AND GARDEN',
    'PLAYERS AND ELECTRONICS',
    'LADIESWEAR',
    'BEAUTY',
    'PET SUPPLIES',
    'MAGAZINES',
    'SCHOOL AND OFFICE SUPPLIES',
    'HARDWARE',
    'HOME APPLIANCES',
    'BABY CARE',
    'BOOKS',
]
store_data['family'] = store_data['family'].replace(tail_families, 'OTHERS')

In [135]:
# Recompute total sales per product family from the current 'family' column,
# ordered from highest to lowest total.
family_sales = store_data.groupby(['family'], as_index=False)['sales'].sum()
family_sales = family_sales.sort_values(by='sales', ascending=False)

# Bar chart of total sales per product family on a 12x4 inch figure.
sns.set(rc={'figure.figsize': (12, 4)})
ax = sns.barplot(x="family", y="sales", data=family_sales)
plt.title('Total sales per Family of Products')
plt.xlabel('Product Family')
plt.ylabel('Total Sales')
# The tick rotation must be a number: current matplotlib accepts only a number
# or the keywords 'vertical'/'horizontal', so the string '90' raises ValueError.
plt.xticks(rotation=90)
plt.show()

In [136]:
# Dimensions (rows, columns) of the working frame at this point.
store_data.shape

Sales per Cluster

In [137]:
# Sum sales within each store cluster, ordered from highest to lowest total.
cluster_sales = (
    store_data
    .groupby('cluster', as_index=False)['sales']
    .sum()
    .sort_values('sales', ascending=False)
)

# Bar chart of the per-cluster totals on a 10x4 inch figure; labels are set
# directly on the Axes returned by seaborn.
sns.set(rc={'figure.figsize': (10, 4)})
ax = sns.barplot(data=cluster_sales, x="cluster", y="sales")
ax.set_title('Total sales per Cluster')
ax.set_xlabel('Cluster')
ax.set_ylabel('Total Sales')
plt.show()

In [138]:
# Sum sales within each store type, ordered from highest to lowest total.
type_sales = (
    store_data
    .groupby('type', as_index=False)['sales']
    .sum()
    .sort_values('sales', ascending=False)
)

# Bar chart of the per-type totals on a 10x4 inch figure; labels are set
# directly on the Axes returned by seaborn.
sns.set(rc={'figure.figsize': (10, 4)})
ax = sns.barplot(data=type_sales, x='type', y="sales")
ax.set_title('Total sales per Type')
ax.set_xlabel('Type')
ax.set_ylabel('Total Sales')
plt.show()

In [139]:
# The 'city' column is removed because 'state' is already kept as a feature.
store_data = store_data.drop(columns=['city'])

In [140]:
# Preview the first rows of the working frame.
store_data.head()