## Lei Data Munging

In [1]:
# General Dependencies

import numpy as np
import pandas as pd
import datetime as dt


In [2]:
# Read the data
df_original = pd.read_csv('data/Sales_Data.csv')
print('Dataframe dimensions:', df_original.shape)

Dataframe dimensions: (1884, 24)


In [3]:
df_original.head(10)

Unnamed: 0,Stock ID,Order Priority,Cost of Goods,Unit Price,Shipping Cost,Customer ID,Customer Name,Vendor,Order Status (Backorder?),Product Category,...,Region,State or Province,City,Postal Code,Order Date,Ship Date,Profit,Quantity ordered new,Sales,Order ID
0,171001,,720.0,900,0.0,L1784,,V1011,Backorder,Clothing,...,,,,,1/17/2017,,180.0,1.0,900,2017011
1,171002,,165.6,207,0.0,VIP1023,,DIRECT,Backorder,Shoes,...,,,,,1/17/2017,,41.4,1.0,207,2017012
2,171003,,76.0,99,15.0,VVIP1017,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,8.0,1.0,99,2017013
3,171004,,87.0,120,22.5,L1631,,DIRECT,finished,Shoes,...,,,,,1/17/2017,,10.5,1.0,120,2017014
4,171005,,87.0,120,22.5,L1631,,DIRECT,finished,Shoes,...,,,,,1/17/2017,,10.5,1.0,120,2017015
5,171006,,113.0,193,15.0,H1270,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,65.0,1.0,193,2017016
6,171007,,33.0,107,15.0,L1609,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,59.0,1.0,107,2017017
7,171008,,33.0,107,15.0,L1367,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,59.0,1.0,107,2017018
8,171009,,33.0,107,15.0,VVIP1017,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,59.0,1.0,107,2017019
9,171010,,36.0,64,7.5,L1126,,DIRECT,finished,Clothing,...,,,,,1/17/2017,,20.5,1.0,64,20170110


In [4]:
def num_missing(x):
  return sum(x.isnull())

In [5]:
# How many NAN values per column with have.

# Applying per column:
print (f"Missing values per column in df_customers :\n{df_original.apply(num_missing, axis=0)}")

Missing values per column in df_customers :
Stock ID                           1
Order Priority                  1884
Cost of Goods                      0
Unit Price                         0
Shipping Cost                      0
Customer ID                        0
Customer Name                   1882
Vendor                             0
Order Status (Backorder?)          0
Product Category                   0
Product Sub-Category / Brand       0
Product Container               1884
Product_Name_EN                    0
Product Base Margin               80
Region                          1884
State or Province               1884
City                            1883
Postal Code                     1884
Order Date                         0
Ship Date                       1884
Profit                             0
Quantity ordered new               4
Sales                              0
Order ID                           1
dtype: int64


In [6]:
# Removing columns not necessary for Customer Segmentation by Age
list_cols_drop = ['Order Priority', 'Customer Name', 'Product Container', 'Region', 'State or Province', 'City',
                  'Postal Code', 'Ship Date']
df_data= df_original.drop(list_cols_drop, axis=1)

In [7]:
#remove rows where StockID, "Quantity ordered new" , OrderID are NA
df_data.dropna(subset=['Stock ID'],how='all',inplace=True)
df_data.dropna(subset=['Order ID'],how='all',inplace=True)
df_data.dropna(subset=['Quantity ordered new'],how='all',inplace=True)
df_data.shape

(1878, 16)

In [8]:
df_data.columns

Index(['Stock ID', 'Cost of Goods', 'Unit Price', 'Shipping Cost',
       'Customer ID', 'Vendor', 'Order Status (Backorder?)',
       'Product Category', 'Product Sub-Category / Brand', 'Product_Name_EN',
       'Product Base Margin', 'Order Date', 'Profit', 'Quantity ordered new',
       'Sales', 'Order ID'],
      dtype='object')

In [9]:
# Rename columns

df_data.rename(columns={"Product Sub-Category / Brand": 'Brand','Product_Name_EN': 'Product Description', 
                                  'Quantity ordered new': 'Quantity'}, inplace=True)

df_data.head()

Unnamed: 0,Stock ID,Cost of Goods,Unit Price,Shipping Cost,Customer ID,Vendor,Order Status (Backorder?),Product Category,Brand,Product Description,Product Base Margin,Order Date,Profit,Quantity,Sales,Order ID
0,171001,720.0,900,0.0,L1784,V1011,Backorder,Clothing,Canada Goose,Canada goose trillium black s,20%,1/17/2017,180.0,1.0,900,2017011
1,171002,165.6,207,0.0,VIP1023,DIRECT,Backorder,Shoes,Vince,Vince polette suede high heel black 6,20%,1/17/2017,41.4,1.0,207,2017012
2,171003,76.0,99,15.0,VVIP1017,DIRECT,finished,Clothing,Madewell,Madewell Trevi Drapey Blazer Suit Black 2 Last...,8%,1/17/2017,8.0,1.0,99,2017013
3,171004,87.0,120,22.5,L1631,DIRECT,finished,Shoes,UGG,UGG Australia – Joey Leather & Genuine Shearli...,9%,1/17/2017,10.5,1.0,120,2017014
4,171005,87.0,120,22.5,L1631,DIRECT,finished,Shoes,UGG,Ugg red short paragraph 7,9%,1/17/2017,10.5,1.0,120,2017015


In [10]:
# If Profit is negative, change to NaN. 
df_data[df_data['Profit'] < 0] = np.nan

# If Product Base Margin is Nan, change to 0.

df_data['Product Base Margin'].fillna(0)

0       20%
1       20%
2        8%
3        9%
4        9%
5       34%
6       55%
7       55%
8       55%
9       32%
10      55%
11      52%
12      22%
13       7%
14      43%
15      55%
16      46%
17      43%
18      43%
19      32%
20      22%
21      54%
22      54%
23        0
24      66%
25      54%
26      54%
27      39%
28      45%
29      21%
       ... 
1854    31%
1855     5%
1856     4%
1857    19%
1858    23%
1859    19%
1860    34%
1861    32%
1862    30%
1863    32%
1864    11%
1865    28%
1866    15%
1867    11%
1868    26%
1869    11%
1870    11%
1871    21%
1872    33%
1873    13%
1874    23%
1875    15%
1876      0
1877    20%
1878    19%
1879    19%
1880    19%
1881    45%
1882    39%
1883    46%
Name: Product Base Margin, Length: 1878, dtype: object

In [11]:
# Total of orders, customers, products and brands

pd.DataFrame([{'orders': len(df_data['Order ID'].value_counts()),    
               'customers': len(df_data['Customer ID'].value_counts()),
               'products': len(df_data['Product Description'].value_counts()), 
               'total of brands' : len(df_data.Brand.unique())
              }], columns = ['orders', 'customers', 'products','total of brands'], index = ['quantity'])

Unnamed: 0,orders,customers,products,total of brands
quantity,1843,594,1564,142


In [12]:
df_data.describe()

Unnamed: 0,Cost of Goods,Unit Price,Shipping Cost,Profit,Quantity,Sales
count,1849.0,1849.0,1849.0,1849.0,1849.0,1849.0
mean,198.729638,275.190373,11.065928,104.28053,0.976203,365.299081
std,648.244509,995.698451,12.189854,1531.320663,0.321449,3864.958628
min,1.0,21.0,0.0,0.0,0.0,0.0
25%,52.0,86.0,5.0,20.0,1.0,79.0
50%,101.0,154.0,7.5,35.5,1.0,146.0
75%,188.0,254.0,15.0,57.5,1.0,241.0
max,15816.0,26360.0,300.0,50410.0,10.0,126650.0


In [13]:
# df_data['Order Date'] = pd.to_datetime(pd.Series(['05/23/2005']), format="%d/%d/%Y")
df_data['Order Date'] = pd.to_datetime(df_data['Order Date'])

# Get month and year
df_data['Day'] = df_data['Order Date'].dt.day
df_data['Month'] = df_data['Order Date'].dt.month
df_data['Year'] = df_data['Order Date'].dt.year

df_data.count()

Stock ID                     1849
Cost of Goods                1849
Unit Price                   1849
Shipping Cost                1849
Customer ID                  1849
Vendor                       1849
Order Status (Backorder?)    1849
Product Category             1849
Brand                        1849
Product Description          1849
Product Base Margin          1779
Order Date                   1849
Profit                       1849
Quantity                     1849
Sales                        1849
Order ID                     1849
Day                          1849
Month                        1849
Year                         1849
dtype: int64

In [14]:
# Save clean data for work with Customer and Brand 
df_data.to_csv('data/data_ready.csv',index=False)

PermissionError: [Errno 13] Permission denied: 'data/data_ready.csv'