## Importing Required Packages

In [1]:
import numpy as np
import pandas as pd

## Loading dataset file

In [2]:
data = pd.read_csv('./dataset/HistoricalProductDemand.csv')

In [3]:
data.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand
0,Product_0993,Whse_J,Category_028,2012/7/27,100
1,Product_0979,Whse_J,Category_028,2012/1/19,500
2,Product_0979,Whse_J,Category_028,2012/2/3,500
3,Product_0979,Whse_J,Category_028,2012/2/9,500
4,Product_0979,Whse_J,Category_028,2012/3/2,500


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 5 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   Product_Code      1048575 non-null  object
 1   Warehouse         1048575 non-null  object
 2   Product_Category  1048575 non-null  object
 3   Date              1037336 non-null  object
 4   Order_Demand      1048575 non-null  object
dtypes: object(5)
memory usage: 40.0+ MB


In [5]:
data['Product_Code'].nunique()

2160

In [6]:
data.nunique()

Product_Code        2160
Warehouse              4
Product_Category      33
Date                1729
Order_Demand        3828
dtype: int64

In [7]:
data['Warehouse'].unique()

array(['Whse_J', 'Whse_S', 'Whse_C', 'Whse_A'], dtype=object)

## Spliting data column into year, month and day columns

In [8]:
data[['Year', 'Month', 'Day']] = data['Date'].str.split('/', expand=True)


In [9]:
data

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Year,Month,Day
0,Product_0993,Whse_J,Category_028,2012/7/27,100,2012,7,27
1,Product_0979,Whse_J,Category_028,2012/1/19,500,2012,1,19
2,Product_0979,Whse_J,Category_028,2012/2/3,500,2012,2,3
3,Product_0979,Whse_J,Category_028,2012/2/9,500,2012,2,9
4,Product_0979,Whse_J,Category_028,2012/3/2,500,2012,3,2
...,...,...,...,...,...,...,...,...
1048570,Product_1791,Whse_J,Category_006,2016/4/27,1000,2016,4,27
1048571,Product_1974,Whse_J,Category_006,2016/4/27,1,2016,4,27
1048572,Product_1787,Whse_J,Category_006,2016/4/28,2500,2016,4,28
1048573,Product_0901,Whse_J,Category_023,2016/10/7,50,2016,10,7


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   Product_Code      1048575 non-null  object
 1   Warehouse         1048575 non-null  object
 2   Product_Category  1048575 non-null  object
 3   Date              1037336 non-null  object
 4   Order_Demand      1048575 non-null  object
 5   Year              1037336 non-null  object
 6   Month             1037336 non-null  object
 7   Day               1037336 non-null  object
dtypes: object(8)
memory usage: 64.0+ MB


In [11]:
data.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Year,Month,Day
0,Product_0993,Whse_J,Category_028,2012/7/27,100,2012,7,27
1,Product_0979,Whse_J,Category_028,2012/1/19,500,2012,1,19
2,Product_0979,Whse_J,Category_028,2012/2/3,500,2012,2,3
3,Product_0979,Whse_J,Category_028,2012/2/9,500,2012,2,9
4,Product_0979,Whse_J,Category_028,2012/3/2,500,2012,3,2


## Removing unwanted formats of data

In [12]:
data['Order_Demand'] = data['Order_Demand'].str.replace(r'\D', '', regex=True).astype(int)

In [13]:
data['Year'] = data['Year'].astype(float)

In [14]:
data['Year'] = pd.to_numeric(data['Year'], errors='coerce')

In [15]:
data['Year'] = data['Year'].fillna(0).astype(int)

In [16]:
data['Order_Demand'].sum()

5145333321

In [17]:
data.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Date,Order_Demand,Year,Month,Day
0,Product_0993,Whse_J,Category_028,2012/7/27,100,2012,7,27
1,Product_0979,Whse_J,Category_028,2012/1/19,500,2012,1,19
2,Product_0979,Whse_J,Category_028,2012/2/3,500,2012,2,3
3,Product_0979,Whse_J,Category_028,2012/2/9,500,2012,2,9
4,Product_0979,Whse_J,Category_028,2012/3/2,500,2012,3,2


In [18]:
data['Month'] = data['Month'].astype(float)
data['Month'] = pd.to_numeric(data['Year'], errors='coerce')
data['Month'] = data['Year'].fillna(0).astype(int)

In [19]:
data['Day'] = data['Month'].astype(float)
data['Day'] = pd.to_numeric(data['Year'], errors='coerce')
data['Day'] = data['Year'].fillna(0).astype(int)

## Creating a copy of main data and dropping the date column

In [20]:
df = pd.DataFrame()
df = data

In [21]:
df = df.drop(columns = ['Date'])

In [22]:
df.head()

Unnamed: 0,Product_Code,Warehouse,Product_Category,Order_Demand,Year,Month,Day
0,Product_0993,Whse_J,Category_028,100,2012,2012,2012
1,Product_0979,Whse_J,Category_028,500,2012,2012,2012
2,Product_0979,Whse_J,Category_028,500,2012,2012,2012
3,Product_0979,Whse_J,Category_028,500,2012,2012,2012
4,Product_0979,Whse_J,Category_028,500,2012,2012,2012


In [23]:
df.corr()

Unnamed: 0,Order_Demand,Year,Month,Day
Order_Demand,1.0,0.014139,0.014139,0.014139
Year,0.014139,1.0,1.0,1.0
Month,0.014139,1.0,1.0,1.0
Day,0.014139,1.0,1.0,1.0


## Performing data visualization

In [2]:
import matplotlib 
import seaborn as sns

RuntimeError: module compiled against API version 0xf but this version of numpy is 0xe

ImportError: numpy.core.multiarray failed to import