In [1]:
import pandas as pd
from src.dimension_modeler import Dimension_Modeler
from src.util import update_start_date

### DIM DATES
- date_id (int) DONE
- full_date (date) DONE
- day (int) DONE
- month (int) DONE
- year (int) DONE
- quarter (int) DONE

#### Deriving the date ID (yyyymmdd)

In [2]:
# Load the raw transaction export into a DataFrame.
transactions_df = pd.read_csv('datasets/donut_shop_transactions.csv')

In [3]:
transactions_df.head()

Unnamed: 0,Employee Name,Product Name,Category,Date,Subtotal,Tax Amount,Total Amount,Payment Method,Discount Amount,Promo Name,Quantity,Price per Unit
0,Bob,Coke,Drinks,2015-04-13 19:28:14,2.5,0.16,2.66,Cash,0.0,,2,1.25
1,Bob,Blueberry,Donut,2015-04-13 19:28:14,1.5,0.09,1.59,Cash,0.0,,3,0.5
2,Bob,Cinnamon Swirl,Donut,2015-04-13 19:28:14,1.5,0.09,1.59,Cash,0.0,,3,0.5
3,David,Croissant Hot Dog,Hot Foods,2015-11-27 10:23:52,1.5,0.09,1.59,Card,0.0,,1,1.5
4,David,Biscuits,Hot Foods,2015-11-27 10:23:52,3.0,0.19,3.19,Card,0.0,,2,1.5


In [6]:
# Convert the textual 'Date' column to datetime64 so the .dt accessor can be used on it.
transactions_df['Date'] = transactions_df['Date'].pipe(pd.to_datetime)

In [7]:
transactions_df[['Date']].max()

Date   2024-12-31 23:53:27
dtype: datetime64[ns]

In [8]:
transactions_df[['Date']].min()

Date   2015-01-01 00:56:32
dtype: datetime64[ns]

In [9]:
# Render every timestamp as a 'yy-mm-dd' string, which discards the time of day.
day_only_format = '%y-%m-%d'
transactions_df['date_raw'] = transactions_df['Date'].dt.strftime(day_only_format)

In [10]:
transactions_df[['date_raw']].min()

date_raw    15-01-01
dtype: object

In [11]:
transactions_df[['date_raw']].max()

date_raw    24-12-31
dtype: object

In [12]:
# Store one midnight-truncated datetime64 value per row as the calendar date.
# It is derived straight from the parsed 'Date' column instead of re-parsing the
# two-digit-year strings: '%y' is ambiguous about the century, and formatting
# then parsing the whole frame is unnecessary work.
transactions_df['date_raw'] = transactions_df['Date'].dt.normalize()

In [13]:
transactions_df[['date_raw']].max()

date_raw   2024-12-31
dtype: datetime64[ns]

In [14]:
transactions_df[['date_raw']].min()

date_raw   2015-01-01
dtype: datetime64[ns]

In [23]:
transactions_df['date_raw'].dt.day

0         13
1         13
2         13
3         27
4         27
          ..
756130    13
756131    13
756132    27
756133    27
756134    27
Name: date_raw, Length: 756135, dtype: int32

In [28]:
# Build the yyyymmdd key as a zero-padded string in a single formatting pass
# (e.g. 2015-04-13 -> '20150413'); it is cast to an integer in the next cell.
# Formatting the datetime directly avoids the separate year/month/day string
# conversions, which would emit values like '2015.0' if any date were missing.
transactions_df['date_id'] = transactions_df['date_raw'].dt.strftime('%Y%m%d')

In [32]:
# Cast the yyyymmdd key to an integer, matching the `date_id (int)` spec for dim_dates.
transactions_df['date_id'] = transactions_df['date_id'].astype(int)

In [33]:
transactions_df.dtypes

Employee Name              object
Product Name               object
Category                   object
Date               datetime64[ns]
Subtotal                  float64
Tax Amount                float64
Total Amount              float64
Payment Method             object
Discount Amount           float64
Promo Name                 object
Quantity                    int64
Price per Unit            float64
date_raw           datetime64[ns]
date_id                     int32
dtype: object

In [77]:
transactions_df['date_raw'].nunique()

3653

In [78]:
transactions_df['date_id'].nunique()

3653

In [92]:
# Keep only the first transaction seen for each date_id; .copy() detaches the
# result from transactions_df so later column edits do not touch the source frame.
is_first_for_date = ~transactions_df.duplicated(subset='date_id', keep='first')
date_df = transactions_df.loc[is_first_for_date].copy()

In [93]:
date_df['date_id'].count()

3653

In [94]:
transactions_df.groupby('date_id')['Date'].count().sort_values()

date_id
20200125     44
20201121     50
20201224     53
20201101     57
20200708     59
           ... 
20240926    428
20230707    439
20241124    449
20241110    450
20241018    453
Name: Date, Length: 3653, dtype: int64

In [95]:
transactions_df.loc[transactions_df['date_id'] == 20200125]

Unnamed: 0,Employee Name,Product Name,Category,Date,Subtotal,Tax Amount,Total Amount,Payment Method,Discount Amount,Promo Name,Quantity,Price per Unit,date_raw,date_id
285741,Eva,Ham and Cheese,Hot Foods,2020-01-25 14:48:51,1.5,0.09,1.59,Card,1.5,WINTER SPECIAL,1,3.0,2020-01-25,20200125
285772,Alice,Ham and Cheese,Hot Foods,2020-01-25 03:35:14,3.0,0.19,3.19,Card,0.0,BOGO,1,3.0,2020-01-25,20200125
285773,Alice,Coffee,Drinks,2020-01-25 03:35:14,8.0,0.5,8.5,Card,0.0,BOGO,4,2.0,2020-01-25,20200125
287401,Eva,Chocolate Donut Hole,Donut,2020-01-25 02:39:03,1.5,0.09,1.59,Card,1.5,WINTER SPECIAL,3,1.0,2020-01-25,20200125
287402,Eva,Starbucks Mocha,Drinks,2020-01-25 02:39:03,2.0,0.12,2.12,Card,2.0,WINTER SPECIAL,2,2.0,2020-01-25,20200125
287403,Eva,Coke,Drinks,2020-01-25 02:39:03,2.0,0.12,2.12,Card,2.0,WINTER SPECIAL,2,2.0,2020-01-25,20200125
287404,Eva,Redbull,Drinks,2020-01-25 02:39:03,3.0,0.19,3.19,Card,3.0,WINTER SPECIAL,3,2.0,2020-01-25,20200125
288238,Eva,Chocolate Sprinkle,Donut,2020-01-25 10:30:02,2.0,0.12,2.12,Cash,2.0,BOGO,4,1.0,2020-01-25,20200125
288239,Eva,Sprite,Drinks,2020-01-25 10:30:02,2.0,0.12,2.12,Cash,0.0,BOGO,1,2.0,2020-01-25,20200125
290887,David,Cinnamon Swirl,Donut,2020-01-25 16:43:56,1.0,0.06,1.06,Card,0.0,,1,1.0,2020-01-25,20200125


#### Making the date dataframe & extracting the rest of the information we need


In [96]:
# Narrow the frame to the two date columns the dimension is built from.
date_df = date_df.loc[:, ['date_raw', 'date_id']]

In [97]:
date_df.duplicated().sum()

0

In [98]:
date_df.dtypes

date_raw    datetime64[ns]
date_id              int32
dtype: object

In [102]:
# Give the calendar-date column its dimension name.
date_df = date_df.rename(columns={'date_raw': 'full_date'})

In [103]:
# Day of month taken from full_date.
date_df.loc[:, 'day'] = date_df['full_date'].dt.day

In [104]:
# Month number taken from full_date.
date_df.loc[:,'month'] = date_df['full_date'].dt.month

In [106]:
# Four-digit year taken from full_date.
date_df.loc[:,'year'] = date_df['full_date'].dt.year

In [107]:
# Calendar quarter (1-4) taken from full_date.
date_df.loc[:,'quarter'] = date_df['full_date'].dt.quarter

In [122]:
# Put the key first, then the full date and its calendar parts, and preview the result.
dim_date_columns = ['date_id', 'full_date', 'year', 'month', 'day', 'quarter']
date_df = date_df[dim_date_columns]
date_df

Unnamed: 0,date_id,full_date,year,month,day,quarter
0,20150413,2015-04-13,2015,4,13,2
3,20151127,2015-11-27,2015,11,27,4
7,20151021,2015-10-21,2015,10,21,4
8,20151211,2015-12-11,2015,12,11,4
10,20150624,2015-06-24,2015,6,24,2
...,...,...,...,...,...,...
632083,20240630,2024-06-30,2024,6,30,2
632695,20240928,2024-09-28,2024,9,28,3
632721,20240801,2024-08-01,2024,8,1,3
633412,20240719,2024-07-19,2024,7,19,3


In [123]:
# Wrap the finished date frame in the project's Dimension_Modeler helper
# (imported from src.dimension_modeler).
date_dim_model = Dimension_Modeler(date_df)

In [124]:
# Export the dimension; the cell output reports dim_dates.csv written to datasets/dimensions/.
date_dim_model.make_csv('dim_dates')

dim_dates.csv created in datasets/dimensions/


### DIM PROMOTIONS
- promotion_id (int) DONE
- promotion_name (str) DONE
- discount_percentage (decimal 5,2) DONE
- description (str) DONE

In [29]:
# Load the promotion reference data (name, discount percentage as text, description).
promo_df = pd.read_csv('datasets/donut_shop_promos.csv')

In [30]:
promo_df.head()

Unnamed: 0,Promo Name,Discount Perc,Description
0,BOGO,50%,"Happens sporadically throughout the year, all ..."
1,WINTER SPECIAL,50%,"During Dec, Jan, Feb, all products 50% off."
2,BACK TO SCHOOL,25%,"During first week of September, 25% off all pr..."


#### Conversion of discount percentage to float

In [31]:
# Keep the text before the percent sign ('50%' -> '50'). The vectorised .str
# accessor replaces the per-row lambda and leaves missing values missing
# instead of raising AttributeError on them.
promo_df['Discount Perc'] = promo_df['Discount Perc'].str.split('%').str[0]

In [32]:
# Turn the numeric strings into 64-bit floats.
promo_df['Discount Perc'] = promo_df['Discount Perc'].astype('float64')

In [33]:
promo_df.dtypes

Promo Name        object
Discount Perc    float64
Description       object
dtype: object

In [34]:
# Express the discount as a fraction (50 -> 0.5).
# Running this cell a second time divides the column again.
promo_df['Discount Perc'] = promo_df['Discount Perc'].div(100)

In [35]:
promo_df

Unnamed: 0,Promo Name,Discount Perc,Description
0,BOGO,0.5,"Happens sporadically throughout the year, all ..."
1,WINTER SPECIAL,0.5,"During Dec, Jan, Feb, all products 50% off."
2,BACK TO SCHOOL,0.25,"During first week of September, 25% off all pr..."


#### Adding id column, rearranging and fixing column names and saving to csv

In [36]:
# Wrap the cleaned promo frame in the project's Dimension_Modeler helper.
promo_dim_model = Dimension_Modeler(promo_df)

In [37]:
# Add the promotion_id key column; per the output below it is numbered from 1 in row order.
promo_dim_model.make_id_col('promotion_id')

Unnamed: 0,promotion_id,Promo Name,Discount Perc,Description
0,1,BOGO,0.5,"Happens sporadically throughout the year, all ..."
1,2,WINTER SPECIAL,0.5,"During Dec, Jan, Feb, all products 50% off."
2,3,BACK TO SCHOOL,0.25,"During first week of September, 25% off all pr..."


In [38]:
promo_dim_model.df

Unnamed: 0,promotion_id,Promo Name,Discount Perc,Description
0,1,BOGO,0.5,"Happens sporadically throughout the year, all ..."
1,2,WINTER SPECIAL,0.5,"During Dec, Jan, Feb, all products 50% off."
2,3,BACK TO SCHOOL,0.25,"During first week of September, 25% off all pr..."


In [39]:
# Normalise the headers to snake_case: lower-case, spaces replaced by underscores.
promo_dim_model.df.columns = [
    header.lower().replace(' ', '_') for header in promo_dim_model.df.columns
]

In [40]:
# Expand the abbreviated headers to the names used in the dim_promotions schema.
promo_schema_names = {'promo_name': 'promotion_name', 'discount_perc': 'discount_percentage'}
promo_dim_model.df.columns = [
    promo_schema_names.get(header, header) for header in promo_dim_model.df.columns
]

In [41]:
promo_dim_model.df.dtypes

promotion_id             int64
promotion_name          object
discount_percentage    float64
description             object
dtype: object

In [42]:
# Extra dimension row labelled 'No promotion' (presumably for transactions whose
# promo name is empty — confirm against the fact-table build).
# The key continues from the largest existing promotion_id instead of the
# hard-coded 4, so it cannot collide if the promo CSV gains rows, and the
# discount is written as a float to match the column's dtype.
next_promotion_id = int(promo_dim_model.df['promotion_id'].max()) + 1
new_row = pd.DataFrame([{
    'promotion_id': next_promotion_id,
    'promotion_name': 'None',
    'discount_percentage': 0.0,
    'description': 'No promotion',
}])
new_row

Unnamed: 0,promotion_id,promotion_name,discount_percentage,description
0,4,,0,No promotion


In [43]:
# Append the extra row and renumber the index from 0.
# Running this cell more than once appends the row again.
promo_dim_model.df = pd.concat([promo_dim_model.df, new_row], ignore_index=True)

In [44]:
promo_dim_model.df

Unnamed: 0,promotion_id,promotion_name,discount_percentage,description
0,1,BOGO,0.5,"Happens sporadically throughout the year, all ..."
1,2,WINTER SPECIAL,0.5,"During Dec, Jan, Feb, all products 50% off."
2,3,BACK TO SCHOOL,0.25,"During first week of September, 25% off all pr..."
3,4,,0.0,No promotion


In [45]:
# Export the dimension; the cell output reports dim_promotions.csv written to datasets/dimensions/.
promo_dim_model.make_csv('dim_promotions')

dim_promotions.csv created in datasets/dimensions/


### DIM PAYMENT METHODS
- payment_method_id (int) DONE
- payment_method (str) DONE

In [65]:
# Reload the raw transactions from disk; this replaces the in-memory frame,
# including any columns derived from it earlier in the notebook.
transactions_df = pd.read_csv('datasets/donut_shop_transactions.csv')

In [66]:
# Single-column frame holding the payment method recorded on each transaction row.
payment_method_df = transactions_df.loc[:, ['Payment Method']]
payment_method_df

Unnamed: 0,Payment Method
0,Cash
1,Cash
2,Cash
3,Card
4,Card
...,...
756130,Card
756131,Card
756132,Cash
756133,Cash


In [79]:
# Keep only the first occurrence of each distinct payment method.
is_repeat = payment_method_df.duplicated(keep='first')
payment_method_df = payment_method_df.loc[~is_repeat]

In [83]:
# Renumber the surviving rows from 0. After drop_duplicates() the frame has no
# 'level_0' column, so drop(columns='level_0') raises KeyError on a fresh
# Restart & Run All. reset_index(drop=True) gives the clean 0..n-1 index shown
# in the output below without creating a helper column that must be removed.
payment_method_df = payment_method_df.reset_index(drop=True)

In [85]:
# Normalise the header to snake_case ('Payment Method' -> 'payment_method').
payment_method_df.columns = [
    header.lower().replace(' ', '_') for header in payment_method_df.columns
]

In [86]:
payment_method_df

Unnamed: 0,payment_method
0,Cash
1,Card


In [87]:
# Wrap the distinct payment methods in the project's Dimension_Modeler helper.
payment_method_dim_model = Dimension_Modeler(payment_method_df)

In [88]:
# Add the payment_method_id key column (numbered from 1 in the output below) and preview the frame.
payment_method_dim_model.make_id_col('payment_method_id')
payment_method_dim_model.df

Unnamed: 0,payment_method_id,payment_method
0,1,Cash
1,2,Card


In [89]:
payment_method_dim_model.df.dtypes

payment_method_id     int64
payment_method       object
dtype: object

In [90]:
# Export the dimension; the cell output reports dim_payment_methods.csv written to datasets/dimensions/.
payment_method_dim_model.make_csv('dim_payment_methods')

dim_payment_methods.csv created in datasets/dimensions/


### DIM PRODUCTS
- product_id (int) (Natural Key) DONE
- product_key (int) (Surrogate Key) DONE
- product_name (str) DONE
- product_category (str) DONE
- product_price (decimal 10,2) DONE
- product_cost (decimal 10,2) DONE 
- is_current (boolean) DONE
- start_date (date) DONE 
- end_date (date) DONE

In [2]:
# Load per-category product info: current price, current cost and historical cost.
products_df = pd.read_csv('datasets/donut_shop_product_info.csv')

In [3]:
products_df

Unnamed: 0,Product Category,Product Price,Product Cost,Historical Cost
0,Donut,1,0.1,0.1
1,Hot Foods,3,0.8,0.5
2,Drinks,2,1.15,1.0


#### Getting historical prices and dates associated

In [4]:
# Reload the raw transactions; 'Date' is left as text here, as the string outputs below show.
transactions_df = pd.read_csv('datasets/donut_shop_transactions.csv')

In [5]:
# Accumulator for one record per (category, unit price) combination.
price_history = list()


In [6]:
# Build one record per (category, unit price) pair, holding the first and last
# transaction timestamp at which that category sold at that price.
# - The row mask matches on category as well as price, so two categories that
#   share a unit price no longer inherit each other's date range.
# - The list is rebuilt from scratch on every run, so re-executing this cell
#   does not append duplicate records.
# min()/max() work whether 'Date' is datetime64 or ISO 'YYYY-MM-DD HH:MM:SS'
# text, because such strings sort chronologically.
category_price_pairs = transactions_df.groupby('Category')['Price per Unit'].value_counts().index

price_history = []
for category, price in category_price_pairs:
    sold_at_price = (
        (transactions_df['Category'] == category)
        & (transactions_df['Price per Unit'] == price)
    )
    sale_dates = transactions_df.loc[sold_at_price, 'Date']
    price_history.append({
        'category': category,
        'price': price,
        'start_date': sale_dates.min(),
        'end_date': sale_dates.max(),
    })

In [7]:
price_history

[{'category': 'Donut',
  'price': 1.0,
  'start_date': '2020-01-01 00:27:34',
  'end_date': '2024-12-31 23:53:27'},
 {'category': 'Donut',
  'price': 0.5,
  'start_date': '2015-01-01 01:35:44',
  'end_date': '2019-12-31 23:49:08'},
 {'category': 'Drinks',
  'price': 2.0,
  'start_date': '2020-01-01 03:05:09',
  'end_date': '2024-12-31 23:53:27'},
 {'category': 'Drinks',
  'price': 1.25,
  'start_date': '2015-01-01 00:56:32',
  'end_date': '2019-12-31 23:49:08'},
 {'category': 'Hot Foods',
  'price': 3.0,
  'start_date': '2020-01-01 00:27:34',
  'end_date': '2024-12-31 23:53:27'},
 {'category': 'Hot Foods',
  'price': 1.5,
  'start_date': '2015-01-01 01:41:45',
  'end_date': '2019-12-31 23:32:46'}]

In [8]:
transactions_df[transactions_df['Price per Unit'] == 1.00]['Date'].max()

'2024-12-31 23:53:27'

In [9]:
transactions_df[transactions_df['Price per Unit'] == 1.00]['Date'].min()

'2020-01-01 00:27:34'

In [10]:
# One row per price record, with columns category, price, start_date and end_date.
dim_prod_df = pd.DataFrame(price_history)
    

#### Getting is_current field

In [11]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date
0,Donut,1.0,2020-01-01 00:27:34,2024-12-31 23:53:27
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08
2,Drinks,2.0,2020-01-01 03:05:09,2024-12-31 23:53:27
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08
4,Hot Foods,3.0,2020-01-01 00:27:34,2024-12-31 23:53:27
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46


In [12]:
# Helper column: the latest start_date within each category, repeated on every row of that category.
dim_prod_df['max_start_date'] = dim_prod_df.groupby('category')['start_date'].transform('max')
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,max_start_date
0,Donut,1.0,2020-01-01 00:27:34,2024-12-31 23:53:27,2020-01-01 00:27:34
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,2020-01-01 00:27:34
2,Drinks,2.0,2020-01-01 03:05:09,2024-12-31 23:53:27,2020-01-01 03:05:09
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,2020-01-01 03:05:09
4,Hot Foods,3.0,2020-01-01 00:27:34,2024-12-31 23:53:27,2020-01-01 00:27:34
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,2020-01-01 00:27:34


In [13]:
# A row is the current version when its start_date is the latest one in its category.
dim_prod_df['is_current'] = dim_prod_df['start_date'].eq(dim_prod_df['max_start_date'])
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,max_start_date,is_current
0,Donut,1.0,2020-01-01 00:27:34,2024-12-31 23:53:27,2020-01-01 00:27:34,True
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,2020-01-01 00:27:34,False
2,Drinks,2.0,2020-01-01 03:05:09,2024-12-31 23:53:27,2020-01-01 03:05:09,True
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,2020-01-01 03:05:09,False
4,Hot Foods,3.0,2020-01-01 00:27:34,2024-12-31 23:53:27,2020-01-01 00:27:34,True
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,2020-01-01 00:27:34,False


In [14]:
# The helper column has served its purpose; remove it.
dim_prod_df = dim_prod_df.drop(columns='max_start_date')


#### Updating end_date

In [15]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current
0,Donut,1.0,2020-01-01 00:27:34,2024-12-31 23:53:27,True
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False
2,Drinks,2.0,2020-01-01 03:05:09,2024-12-31 23:53:27,True
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False
4,Hot Foods,3.0,2020-01-01 00:27:34,2024-12-31 23:53:27,True
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False


In [16]:
# Far-future sentinel: the largest timestamp pandas can represent, formatted
# like the other date strings in the frame.
sentinel_format = '%Y-%m-%d %H:%M:%S'
latest_representable = pd.Timestamp.max
max_date = latest_representable.strftime(sentinel_format)
max_date

'2262-04-11 23:47:16'

In [17]:
# Current rows get the far-future sentinel as end_date instead of the last observed sale.
dim_prod_df.loc[dim_prod_df['is_current'], 'end_date'] = max_date

#### Updating start_date

In [18]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current
0,Donut,1.0,2020-01-01 00:27:34,2262-04-11 23:47:16,True
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False
2,Drinks,2.0,2020-01-01 03:05:09,2262-04-11 23:47:16,True
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False
4,Hot Foods,3.0,2020-01-01 00:27:34,2262-04-11 23:47:16,True
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False


In [19]:
# Run the project helper update_start_date (from src.util) once per category.
# Judging by the output below, the current row's start_date becomes the previous
# row's end_date — NOTE(review): confirm against the helper's source.
dim_prod_df = dim_prod_df.groupby('category', group_keys=False).apply(update_start_date)

In [20]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False
2,Drinks,2.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False
4,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False


#### Adding current and historical product costs


In [21]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False
2,Drinks,2.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False
4,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False


In [22]:
products_df

Unnamed: 0,Product Category,Product Price,Product Cost,Historical Cost
0,Donut,1,0.1,0.1
1,Hot Foods,3,0.8,0.5
2,Drinks,2,1.15,1.0


In [23]:
# Attach the per-category price/cost reference columns to every price record.
dim_prod_df = dim_prod_df.merge(
    products_df,
    how='left',
    left_on='category',
    right_on='Product Category',
)

In [24]:
# Keep the price-record columns plus the two cost columns brought in by the merge.
columns_with_costs = [
    'category', 'price', 'start_date', 'end_date', 'is_current',
    'Product Cost', 'Historical Cost',
]
dim_prod_df = dim_prod_df.loc[:, columns_with_costs]

In [25]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current,Product Cost,Historical Cost
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,0.1
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,0.1
2,Drinks,2.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,1.15,1.0
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False,1.15,1.0
4,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,0.5
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.8,0.5


In [26]:
# Current rows keep 'Product Cost'; superseded rows take 'Historical Cost' instead.
dim_prod_df['Product Cost'] = dim_prod_df['Product Cost'].where(
    dim_prod_df['is_current'], dim_prod_df['Historical Cost']
)
# The historical column is no longer needed once it has been folded in.
dim_prod_df = dim_prod_df.drop(columns='Historical Cost')

In [27]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current,Product Cost
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1
2,Drinks,2.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,1.15
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False,1.0
4,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5


In [28]:
products_df

Unnamed: 0,Product Category,Product Price,Product Cost,Historical Cost
0,Donut,1,0.1,0.1
1,Hot Foods,3,0.8,0.5
2,Drinks,2,1.15,1.0


#### Adding product name

In [29]:
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current,Product Cost
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1
1,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1
2,Drinks,2.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,1.15
3,Drinks,1.25,2015-01-01 00:56:32,2019-12-31 23:49:08,False,1.0
4,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8
5,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5


In [30]:
# Distinct (product name, category) pairs in first-seen order, reindexed from 0.
products_list_df = (
    transactions_df[['Product Name', 'Category']]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

In [31]:
products_list_df

Unnamed: 0,Product Name,Category
0,Coke,Drinks
1,Blueberry,Donut
2,Cinnamon Swirl,Donut
3,Croissant Hot Dog,Hot Foods
4,Biscuits,Hot Foods
5,Sprite,Drinks
6,Blueberry Donut Hole,Donut
7,Glaze,Donut
8,Chocolate,Donut
9,Tropicana,Drinks


In [32]:
# Fan each category-level price record out to one row per product in that category.
dim_prod_df = dim_prod_df.merge(
    products_list_df,
    how='left',
    left_on='category',
    right_on='Category',
)


In [33]:
# 'Category' from the product list repeats 'category'; drop it and preview the result.
dim_prod_df = dim_prod_df.drop(columns='Category')
dim_prod_df

Unnamed: 0,category,price,start_date,end_date,is_current,Product Cost,Product Name
0,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry
1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Cinnamon Swirl
2,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole
3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Glaze
4,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate
5,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate Donut Hole
6,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Frosted Donut
7,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Donut Odyssey
8,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Frosted Donut Sprinkle
9,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Boston Creme


#### Adding surrogate key / PK

In [34]:
# Wrap the product rows in the project's Dimension_Modeler helper.
prod_dim_model = Dimension_Modeler(dim_prod_df)

In [35]:
# Add the surrogate key: one product_key per row, numbered from 1 in the output below.
prod_dim_model.make_id_col('product_key')

Unnamed: 0,product_key,category,price,start_date,end_date,is_current,Product Cost,Product Name
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry
1,2,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Cinnamon Swirl
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole
3,4,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Glaze
4,5,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate
5,6,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate Donut Hole
6,7,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Frosted Donut
7,8,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Donut Odyssey
8,9,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Frosted Donut Sprinkle
9,10,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Boston Creme


#### Product ID creation (Natural key/business key)

In [36]:
# Continue working on the model's frame, which now carries product_key.
dim_prod_df = prod_dim_model.df

In [37]:
dim_prod_df.head()

Unnamed: 0,product_key,category,price,start_date,end_date,is_current,Product Cost,Product Name
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry
1,2,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Cinnamon Swirl
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole
3,4,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Glaze
4,5,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate


In [38]:
# Group the rows by product name so every price version of a product can share one business key.
grouped_df = dim_prod_df.groupby(by='Product Name', group_keys=False)

# Collects the per-product frames once they have been tagged with a product_id.
modified_groups = list()

In [39]:
# Assign the natural/business key: every row of the same product (all of its
# price versions) gets the same product_id, numbered from 100 in sorted
# product-name order.
# groupby().ngroup() labels the groups 0..n-1 in sorted key order in a single
# vectorised pass. Unlike appending to a list that lives in another cell, this
# cell can be re-run without duplicating rows in combined_df.
id_counter = 100  # first product_id to hand out

product_ids = dim_prod_df.groupby('Product Name').ngroup() + id_counter

# A stable sort places the products in product_id order while keeping each
# product's rows in their existing relative order, as concatenating the groups did.
combined_df = (
    dim_prod_df
    .assign(product_id=product_ids)
    .sort_values('product_id', kind='stable')
)

In [41]:
# From here on dim_prod_df refers to the frame that carries product_id.
dim_prod_df = combined_df

In [45]:
dim_prod_df.sort_values(by='product_id')


Unnamed: 0,product_key,category,price,start_date,end_date,is_current,Product Cost,Product Name,product_id
45,46,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,Biscuits,100
50,51,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5,Biscuits,100
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry,101
14,15,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry,101
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole,102
16,17,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry Donut Hole,102
9,10,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Boston Creme,103
23,24,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Boston Creme,103
4,5,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Chocolate,104
18,19,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Chocolate,104


#### Fixing any dtypes and reordering columns & saving dim_products csv file.

In [46]:
dim_prod_df.head()

Unnamed: 0,product_key,category,price,start_date,end_date,is_current,Product Cost,Product Name,product_id
45,46,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,Biscuits,100
50,51,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5,Biscuits,100
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry,101
14,15,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry,101
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole,102


In [51]:
dim_prod_df.dtypes

product_key       int64
category         object
price           float64
start_date       object
end_date         object
is_current         bool
Product Cost    float64
Product Name     object
product_id        int64
dtype: object

In [54]:
# The validity bounds are still text (dtype object above); parse both into datetime64.
for bound_column in ('start_date', 'end_date'):
    dim_prod_df[bound_column] = pd.to_datetime(dim_prod_df[bound_column])

In [56]:
dim_prod_df.head()

Unnamed: 0,product_key,category,price,start_date,end_date,is_current,Product Cost,Product Name,product_id
45,46,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,Biscuits,100
50,51,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5,Biscuits,100
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry,101
14,15,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry,101
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole,102


In [57]:
# Normalise the headers to snake_case ('Product Cost' -> 'product_cost') and preview.
dim_prod_df.columns = [
    header.lower().replace(' ', '_') for header in dim_prod_df.columns
]
dim_prod_df.head()

Unnamed: 0,product_key,category,price,start_date,end_date,is_current,product_cost,product_name,product_id
45,46,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,Biscuits,100
50,51,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5,Biscuits,100
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry,101
14,15,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry,101
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole,102


In [58]:
# Prefix the generic headers so they match the dim_products schema.
product_schema_names = {'category': 'product_category', 'price': 'product_price'}
dim_prod_df.columns = [
    product_schema_names.get(header, header) for header in dim_prod_df.columns
]

In [59]:
dim_prod_df.head()

Unnamed: 0,product_key,product_category,product_price,start_date,end_date,is_current,product_cost,product_name,product_id
45,46,Hot Foods,3.0,2019-12-31 23:32:46,2262-04-11 23:47:16,True,0.8,Biscuits,100
50,51,Hot Foods,1.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False,0.5,Biscuits,100
0,1,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry,101
14,15,Donut,0.5,2015-01-01 01:35:44,2019-12-31 23:49:08,False,0.1,Blueberry,101
2,3,Donut,1.0,2019-12-31 23:49:08,2262-04-11 23:47:16,True,0.1,Blueberry Donut Hole,102


In [60]:
# Hand the model the final column order: keys first, then descriptive attributes,
# then the validity window and the current-row flag.
dim_product_columns = [
    'product_key', 'product_id', 'product_name', 'product_category',
    'product_price', 'product_cost', 'start_date', 'end_date', 'is_current',
]
prod_dim_model.df = dim_prod_df[dim_product_columns]

In [61]:
prod_dim_model.df.head()

Unnamed: 0,product_key,product_id,product_name,product_category,product_price,product_cost,start_date,end_date,is_current
45,46,100,Biscuits,Hot Foods,3.0,0.8,2019-12-31 23:32:46,2262-04-11 23:47:16,True
50,51,100,Biscuits,Hot Foods,1.5,0.5,2015-01-01 01:41:45,2019-12-31 23:32:46,False
0,1,101,Blueberry,Donut,1.0,0.1,2019-12-31 23:49:08,2262-04-11 23:47:16,True
14,15,101,Blueberry,Donut,0.5,0.1,2015-01-01 01:35:44,2019-12-31 23:49:08,False
2,3,102,Blueberry Donut Hole,Donut,1.0,0.1,2019-12-31 23:49:08,2262-04-11 23:47:16,True


In [62]:
# Export the dimension; the cell output reports dim_products.csv written to datasets/dimensions/.
prod_dim_model.make_csv('dim_products')

dim_products.csv created in datasets/dimensions/


### DIM EMPLOYEES
- employee_id (int) DONE 
- employee_name (str) DONE 
- job_title (str) DONE 

In [None]:
# Load the employee reference data.
employee_df = pd.read_csv('datasets/donut_shop_employees.csv')


In [None]:
# Wrap the employee frame in the project's Dimension_Modeler helper.
# NOTE(review): this happens before the columns are renamed in the next cell.
dim_model = Dimension_Modeler(employee_df)

In [None]:
# Rename the columns to the dim_employees schema. The model was already built
# from employee_df in the previous cell, so the frame it holds is renamed too;
# otherwise the export only gets these names if Dimension_Modeler kept a
# reference to employee_df rather than a copy.
# NOTE(review): assumes the employees CSV has exactly these two columns in this order — confirm.
employee_columns = ['employee_name', 'job_title']
employee_df.columns = employee_columns
dim_model.df.columns = employee_columns

In [None]:
employee_df.dtypes

In [None]:
# Add the employee_id key column and keep the value make_id_col returns
# (in the promotions section above, that call displays the keyed frame).
employee_df = dim_model.make_id_col('employee_id')

In [None]:
employee_df.dtypes

In [None]:
employee_df

In [None]:
# Export the dimension as dim_employees via the model's make_csv helper.
dim_model.make_csv('dim_employees')