In [2]:
import pandas as pd
from pathlib import Path

### **Data Source**

In [3]:
# Project root is taken to be the parent of the notebook's working directory.
BASE_DIR = Path.cwd().resolve().parents[0]

# Load the three gold-layer tables plus the date dimension.
sales = pd.read_csv(BASE_DIR / "data" / "gold" / "sales.csv")
customer = pd.read_csv(BASE_DIR / "data" / "gold" / "customer.csv")
product = pd.read_csv(BASE_DIR / "data" / "gold" / "product.csv")
dim_date = pd.read_csv(BASE_DIR / "data" / "dim" / "dim_date.csv")

### **Sales Table**
---

#### **Size**

In [11]:
# Dimensions of the sales table as a (rows, columns) tuple.
sales.shape

(60398, 11)

In [12]:
# Column labels available in the sales table.
sales.columns

Index(['sales_key', 'order_number', 'product_key', 'customer_key', 'date_key',
       'order_date', 'ship_date', 'delivery_date', 'price', 'quantity',
       'amount'],
      dtype='object')

#### **Data Types**

In [13]:
# Per-column dtypes as inferred by read_csv; the date columns come back as
# object dtype, i.e. they have not been parsed into datetimes.
sales.dtypes

sales_key          int64
order_number      object
product_key       object
customer_key       int64
date_key           int64
order_date        object
ship_date         object
delivery_date     object
price            float64
quantity           int64
amount           float64
dtype: object

#### **Null Values**

In [14]:
# Number of missing values in each column of the sales table.
sales.isna().sum()

sales_key        0
order_number     0
product_key      0
customer_key     0
date_key         0
order_date       0
ship_date        0
delivery_date    0
price            7
quantity         0
amount           7
dtype: int64

#### **Statistics Summary** 

In [15]:
# Summary statistics for the numeric measure columns only.
sales.loc[:, ['price', 'quantity', 'amount']].describe()

Unnamed: 0,price,quantity,amount
count,60391.0,60398.0,60391.0
mean,486.092348,1.000414,486.098674
std,928.494294,0.044011,928.491742
min,2.0,1.0,2.0
25%,8.0,1.0,8.0
50%,30.0,1.0,30.0
75%,540.0,1.0,540.0
max,3578.0,10.0,3578.0


In [33]:
# Total sales revenue: sum of the `amount` column. Series.sum() skips NaN by
# default, so rows with a missing amount do not contribute to the total.
sales['amount'].sum()

np.float64(29355985.0)

In [64]:
# Preview the sales table; for a frame this large pandas renders only the
# first and last rows with an ellipsis row in between.
sales

Unnamed: 0,sales_key,order_number,product_key,customer_key,date_key,order_date,ship_date,delivery_date,price,quantity,amount
0,1,SO43697,BK-R93R-62,21768,20101229,2010-12-29,2011-01-05,2011-01-10,3578.0,1,3578.0
1,2,SO43698,BK-M82S-44,28389,20101229,2010-12-29,2011-01-05,2011-01-10,3400.0,1,3400.0
2,3,SO43699,BK-M82S-44,25863,20101229,2010-12-29,2011-01-05,2011-01-10,3400.0,1,3400.0
3,4,SO43700,BK-R50B-62,14501,20101229,2010-12-29,2011-01-05,2011-01-10,699.0,1,699.0
4,5,SO43701,BK-M82S-44,11003,20101229,2010-12-29,2011-01-05,2011-01-10,3400.0,1,3400.0
...,...,...,...,...,...,...,...,...,...,...,...
60393,60394,SO75122,FE-6654,15868,20140128,2014-01-28,2014-02-04,2014-02-09,22.0,1,22.0
60394,60395,SO75122,CA-1098,15868,20140128,2014-01-28,2014-02-04,2014-02-09,9.0,1,9.0
60395,60396,SO75123,FE-6654,18759,20140128,2014-01-28,2014-02-04,2014-02-09,22.0,1,22.0
60396,60397,SO75123,ST-1401,18759,20140128,2014-01-28,2014-02-04,2014-02-09,159.0,1,159.0


### **Product Table**
---

#### **Size**

In [17]:
# Dimensions of the product table as a (rows, columns) tuple.
product.shape

(295, 9)

In [18]:
# Column labels available in the product table.
product.columns

Index(['product_key', 'product_name', 'category', 'subcategory',
       'product_line', 'maintenance', 'cost', 'launch_date',
       'last_order_date'],
      dtype='object')

#### **Data Types**

In [61]:
# Per-column dtypes as inferred by read_csv; launch_date and last_order_date
# come back as object dtype, i.e. they have not been parsed into datetimes.
product.dtypes

product_key         object
product_name        object
category            object
subcategory         object
product_line        object
maintenance         object
cost               float64
launch_date         object
last_order_date     object
dtype: object

#### **Null Values**

In [19]:
# Number of missing values in each column of the product table.
product.isna().sum()

product_key          0
product_name         0
category             7
subcategory          7
product_line        17
maintenance          7
cost                 0
launch_date          0
last_order_date    165
dtype: int64

#### **Statistics Summary** 

In [22]:
# Range of product cost (largest value minus smallest value).
product['cost'].pipe(lambda cost: cost.max() - cost.min())

np.float64(2171.0)

In [21]:
# Summary statistics for the cost column, kept as a one-column DataFrame.
product.loc[:, ['cost']].describe()

Unnamed: 0,cost
count,295.0
mean,418.830508
std,520.211742
min,0.0
25%,31.5
50%,200.0
75%,602.0
max,2171.0


### **Customer Table**
---

#### **Size**

In [29]:
# Dimensions of the customer table as a (rows, columns) tuple.
customer.shape

(18484, 7)

In [30]:
# Column labels available in the customer table.
customer.columns

Index(['customer_key', 'first_name', 'last_name', 'gender', 'marital_status',
       'country', 'birthdate'],
      dtype='object')

#### **Data Types**

In [62]:
# Per-column dtypes as inferred by read_csv; birthdate comes back as object
# dtype, i.e. it has not been parsed into a datetime.
customer.dtypes

customer_key       int64
first_name        object
last_name         object
gender            object
marital_status    object
country           object
birthdate         object
dtype: object

#### **Null Values**

In [38]:
# Number of missing values in each column of the customer table.
customer.isna().sum()

customer_key         0
first_name           3
last_name            2
gender             109
marital_status       2
country            337
birthdate         7456
dtype: int64