In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('datasets/sales_data_sample.csv', encoding='unicode_escape')
df.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2/24/2003 0:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,5/7/2003 0:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,7/1/2003 0:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,8/25/2003 0:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,10/10/2003 0:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [3]:
# Inspecting data types
df.dtypes

ORDERNUMBER           int64
QUANTITYORDERED       int64
PRICEEACH           float64
ORDERLINENUMBER       int64
SALES               float64
ORDERDATE            object
STATUS               object
QTR_ID                int64
MONTH_ID              int64
YEAR_ID               int64
PRODUCTLINE          object
MSRP                  int64
PRODUCTCODE          object
CUSTOMERNAME         object
PHONE                object
ADDRESSLINE1         object
ADDRESSLINE2         object
CITY                 object
STATE                object
POSTALCODE           object
COUNTRY              object
TERRITORY            object
CONTACTLASTNAME      object
CONTACTFIRSTNAME     object
DEALSIZE             object
dtype: object

In [4]:
# Converting to datetime and fix format
df['ORDERDATE'] = pd.to_datetime(df.ORDERDATE, format='%m/%d/%Y %H:%M')
df['ORDERDATE'] = df.ORDERDATE.dt.strftime('%Y-%m-%d %H:%M')
df.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2003-02-24 00:00,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,2003-05-07 00:00,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,2003-07-01 00:00,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,2003-08-25 00:00,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,2003-10-10 00:00,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [5]:
df['ORDERDATE'] = pd.to_datetime(df.ORDERDATE)
df.tail()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
2818,10350,20,100.0,15,2244.4,2004-12-02,Shipped,4,12,2004,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Small
2819,10373,29,100.0,1,3978.51,2005-01-31,Shipped,1,1,2005,...,Torikatu 38,,Oulu,,90110,Finland,EMEA,Koskitalo,Pirkko,Medium
2820,10386,43,100.0,4,5417.57,2005-03-01,Resolved,1,3,2005,...,"C/ Moralzarzal, 86",,Madrid,,28034,Spain,EMEA,Freyre,Diego,Medium
2821,10397,34,62.24,1,2116.16,2005-03-28,Shipped,1,3,2005,...,1 rue Alsace-Lorraine,,Toulouse,,31000,France,EMEA,Roulet,Annette,Small
2822,10414,47,65.52,9,3079.44,2005-05-06,On Hold,2,5,2005,...,8616 Spinnaker Dr.,,Boston,MA,51003,USA,,Yoshido,Juri,Medium


## Working dimension Time

In [17]:
df_time = df[['ORDERDATE', 'MONTH_ID', 'QTR_ID', 'YEAR_ID']].copy(deep=True)
df_time.head()

Unnamed: 0,ORDERDATE,MONTH_ID,QTR_ID,YEAR_ID
0,2003-02-24,2,1,2003
1,2003-05-07,5,2,2003
2,2003-07-01,7,3,2003
3,2003-08-25,8,3,2003
4,2003-10-10,10,4,2003


## First step will be getting derivate values

### At this moment the dimension time should have the following fields:
- dateid:: yyyymmdd
- day:: dd
- dayname:: (monday-tuesday- and so on)
- month:: mm
- monthname:: (January-February- and so on)
- is_day_of_week:: True/False
- MonthNumberByYear:: 2023-02
- Weeknumber
- Semester
- Quarter
- Year

In [18]:
# Create dateid
df_time['DATEID'] = df_time.ORDERDATE.dt.strftime('%Y%m%d')
df_time['DAY']    = df_time.ORDERDATE.dt.day
df_time['DAYNAME']    = df_time.ORDERDATE.dt.day_name()
df_time['MONTHNAME']  = df_time.ORDERDATE.dt.month_name()
df_time['WEEKNUMBER']  = df_time.ORDERDATE.dt.isocalendar().week
df_time['IS_WEEKDAY'] = df_time['ORDERDATE'].apply(lambda x: 1 if x.weekday() else 0)
df_time['SEMESTER'] = df_time['MONTH_ID'].apply(lambda x: 1 if x < 7 else 2)
df_time['MONTHNUMBERBYYEAR'] = df_time.ORDERDATE.dt.strftime('%Y-%m')
df_time.rename(columns={"ORDERDATE": "FULLDATE", 'YEAR_ID': 'YEAR', 'QTR_ID': 'QUARTER', 'MONTH_ID': 'MONTH'}, inplace=True)
df_time.head()

Unnamed: 0,FULLDATE,MONTH,QUARTER,YEAR,DATEID,DAY,DAYNAME,MONTHNAME,WEEKNUMBER,IS_WEEKDAY,SEMESTER,MONTHNUMBERBYYEAR
0,2003-02-24,2,1,2003,20030224,24,Monday,February,9,0,1,2003-02
1,2003-05-07,5,2,2003,20030507,7,Wednesday,May,19,1,1,2003-05
2,2003-07-01,7,3,2003,20030701,1,Tuesday,July,27,1,2,2003-07
3,2003-08-25,8,3,2003,20030825,25,Monday,August,35,0,2,2003-08
4,2003-10-10,10,4,2003,20031010,10,Friday,October,41,1,2,2003-10


In [19]:
# Reorder dataframe
cols = ['DATEID', 'YEAR', 'SEMESTER', 'QUARTER', 'MONTHNAME', 'MONTH', 'MONTHNUMBERBYYEAR', 'WEEKNUMBER', 'DAYNAME', 'DAY', 'IS_WEEKDAY', 'FULLDATE']
df_time = df_time.reindex(columns=cols)
df_time.head()

Unnamed: 0,DATEID,YEAR,SEMESTER,QUARTER,MONTHNAME,MONTH,MONTHNUMBERBYYEAR,WEEKNUMBER,DAYNAME,DAY,IS_WEEKDAY,FULLDATE
0,20030224,2003,1,1,February,2,2003-02,9,Monday,24,0,2003-02-24
1,20030507,2003,1,2,May,5,2003-05,19,Wednesday,7,1,2003-05-07
2,20030701,2003,2,3,July,7,2003-07,27,Tuesday,1,1,2003-07-01
3,20030825,2003,2,3,August,8,2003-08,35,Monday,25,0,2003-08-25
4,20031010,2003,2,4,October,10,2003-10,41,Friday,10,1,2003-10-10


In [25]:
print(f'Dimension Time shape: {df_time.shape}')

Dimension Time shape: (2823, 12)


In [27]:
df_time.drop_duplicates(keep='first',inplace=True)
print(f'Dimension Time shape after dedup: {df_time.shape}')

Dimension Time shape after dedup: (252, 12)


In [33]:
df_time.sort_values(by=['FULLDATE'], inplace=True)
df_time.to_csv('output_files/dim_time.csv', index=False)

## Working dimension Location/geography

In [22]:
df.head()

Unnamed: 0,ORDERNUMBER,QUANTITYORDERED,PRICEEACH,ORDERLINENUMBER,SALES,ORDERDATE,STATUS,QTR_ID,MONTH_ID,YEAR_ID,...,ADDRESSLINE1,ADDRESSLINE2,CITY,STATE,POSTALCODE,COUNTRY,TERRITORY,CONTACTLASTNAME,CONTACTFIRSTNAME,DEALSIZE
0,10107,30,95.7,2,2871.0,2003-02-24,Shipped,1,2,2003,...,897 Long Airport Avenue,,NYC,NY,10022.0,USA,,Yu,Kwai,Small
1,10121,34,81.35,5,2765.9,2003-05-07,Shipped,2,5,2003,...,59 rue de l'Abbaye,,Reims,,51100.0,France,EMEA,Henriot,Paul,Small
2,10134,41,94.74,2,3884.34,2003-07-01,Shipped,3,7,2003,...,27 rue du Colonel Pierre Avia,,Paris,,75508.0,France,EMEA,Da Cunha,Daniel,Medium
3,10145,45,83.26,6,3746.7,2003-08-25,Shipped,3,8,2003,...,78934 Hillside Dr.,,Pasadena,CA,90003.0,USA,,Young,Julie,Medium
4,10159,49,100.0,14,5205.27,2003-10-10,Shipped,4,10,2003,...,7734 Strong St.,,San Francisco,CA,,USA,,Brown,Julie,Medium


In [23]:
df_geo = df[['CITY', 'STATE', 'COUNTRY', 'TERRITORY']].copy(deep=True)
df_geo.head()

Unnamed: 0,CITY,STATE,COUNTRY,TERRITORY
0,NYC,NY,USA,
1,Reims,,France,EMEA
2,Paris,,France,EMEA
3,Pasadena,CA,USA,
4,San Francisco,CA,USA,


## Working dimension Products

In [24]:
df_prod = df[['PRODUCTCODE', 'PRODUCTLINE', 'MSRP']].copy(deep=True)
df_prod.head()

Unnamed: 0,PRODUCTCODE,PRODUCTLINE,MSRP
0,S10_1678,Motorcycles,95
1,S10_1678,Motorcycles,95
2,S10_1678,Motorcycles,95
3,S10_1678,Motorcycles,95
4,S10_1678,Motorcycles,95
