In [1]:
# Dataframes.
import pandas as pd

# Numerical arrays.
import numpy as np

# Stationarity
from statsmodels.tsa.stattools import adfuller

# Predictions
from pmdarima.arima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf
from statsmodels.graphics.tsaplots import plot_pacf
from sklearn.metrics import mean_squared_error
from pmdarima.arima.utils import ndiffs
from pmdarima.utils import diff_inv
import warnings
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.seasonal import STL

# Plotting.
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns

%matplotlib inline

In [2]:
# Apply the ggplot theme, then set the default figure size and
# figure-title font size for subsequent plots.
plt.style.use("ggplot")
plt.rcParams.update({
    "figure.figsize": (12, 8),
    "figure.titlesize": 15,
})

### Stores
1. Loading the Files

In [3]:
# Read stores.csv; the first row supplies the column names and no CSV column
# is used as the index, so the frame gets the default integer index.
stores = pd.read_csv(filepath_or_buffer="Walmart Data Set/stores.csv", header=0)

# Preview the first five rows of the dataframe.
stores.head(n=5)

Unnamed: 0,Store,Type,Size,Store A,Store B,Store C
0,1,A,151315,219622.0,140167.0,42988.0
1,2,A,202307,39690.0,34875.0,39690.0
2,4,A,205863,177247.7273,101190.7059,40541.66667
3,6,A,202505,,,
4,8,A,155078,,,


In [4]:
# Preview the last five rows of the dataframe.
stores.tail(n=5)

Unnamed: 0,Store,Type,Size,Store A,Store B,Store C
40,37,C,39910,,,
41,38,C,39690,,,
42,42,C,39690,,,
43,43,C,41062,,,
44,44,C,39910,,,


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
stores.describe()

Unnamed: 0,Store,Size,Store A,Store B,Store C
count,45.0,45.0,3.0,3.0,3.0
mean,23.0,130287.6,145519.9091,92077.568633,41073.222223
std,13.133926,63825.271991,94068.443124,53234.277201,1712.049789
min,1.0,34875.0,39690.0,34875.0,39690.0
25%,12.0,70713.0,108468.86365,68032.85295,40115.833335
50%,23.0,126512.0,177247.7273,101190.7059,40541.66667
75%,34.0,202307.0,198434.86365,120678.85295,41764.833335
max,45.0,219622.0,219622.0,140167.0,42988.0


The stores file consists of information about 45 stores, including the type and size of each. The columns Store A, Store B and Store C are almost entirely empty: only 3 of the 45 rows contain a value in each of them.



<br>

_2. Data cleaning._

The first step in cleaning the data is to check it for empty values.

In [None]:
# Count the missing values in each column.
stores.isna().sum()

The above confirms that only the last three columns contain empty values. Since they do not provide enough information to be valuable for sales forecasting, they will be removed.

In [None]:
# Drop the three sparsely populated columns into a new frame; `stores` itself
# is not modified.
cleaned_stores = stores.drop(columns=['Store A', 'Store B', 'Store C'])

# Re-count the missing values per column on the cleaned frame.
cleaned_stores.isna().sum()

In [None]:
# Largest store size within each store type. The aggregation runs once and
# only over the Size column; the resulting Series is indexed by Type, so the
# wedge sizes and their labels are guaranteed to be in the same order.
max_size_by_type = cleaned_stores.groupby('Type')['Size'].max()
sizes = max_size_by_type.values
labels = max_size_by_type.index

# Create a figure and axis, set a title.
fig, ax = plt.subplots(figsize=(8, 8))
fig.suptitle("Store Size per Type")

# Build a pie plot; each wedge's percentage is its share of the summed maxima.
ax.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True, startangle=90)
# Equal aspect ratio ensures that pie is drawn as a circle.
ax.axis('equal')

# Show the plot.
plt.tight_layout()
plt.show()