In [1]:
import pandas as pd

In [2]:
# Load the cleaned dataset from disk into a DataFrame
cleanData = pd.read_csv(filepath_or_buffer='./data/clean_data.csv')

# Preview the first five rows rendered as a markdown table
print(cleanData.head(5).to_markdown())



|    | date       | gender   |   age | product_category   |   quantity |   price_per_unit |   total_amount |
|---:|:-----------|:---------|------:|:-------------------|-----------:|-----------------:|---------------:|
|  0 | 2023-11-24 | Male     |    34 | Beauty             |          3 |               50 |            150 |
|  1 | 2023-02-27 | Female   |    26 | Clothing           |          2 |              500 |           1000 |
|  2 | 2023-01-13 | Male     |    50 | Electronics        |          1 |               30 |             30 |
|  3 | 2023-05-21 | Male     |    37 | Clothing           |          1 |              500 |            500 |
|  4 | 2023-05-06 | Male     |    30 | Beauty             |          2 |               50 |            100 |


In [3]:
# Expand the 'date' column into separate calendar feature columns
# (year, month name, day of month, weekday name) and drop the original.

# Lookup tables from pandas' numeric calendar codes to English names.
# dt.dayofweek counts from 0 (Monday) to 6 (Sunday); dt.month from 1 to 12.
dayOfWeekMap = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

monthMap = dict(enumerate(
    [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ],
    start=1,
))

# Parse the date strings once; every derived column reads from this Series.
parsedDate = pd.to_datetime(cleanData['date'])

# Build a new frame (cleanData is left untouched): remove the raw date column,
# then append the derived columns in the order year, month, day, dayOfWeek.
engDates = (
    cleanData
    .drop(columns=['date'])
    .assign(
        year=parsedDate.dt.year,
        month=parsedDate.dt.month.map(monthMap),
        day=parsedDate.dt.day,
        dayOfWeek=parsedDate.dt.dayofweek.map(dayOfWeekMap),
    )
)

print(engDates.head().to_markdown())

|    | gender   |   age | product_category   |   quantity |   price_per_unit |   total_amount |   year | month    |   day | dayOfWeek   |
|---:|:---------|------:|:-------------------|-----------:|-----------------:|---------------:|-------:|:---------|------:|:------------|
|  0 | Male     |    34 | Beauty             |          3 |               50 |            150 |   2023 | November |    24 | Friday      |
|  1 | Female   |    26 | Clothing           |          2 |              500 |           1000 |   2023 | February |    27 | Monday      |
|  2 | Male     |    50 | Electronics        |          1 |               30 |             30 |   2023 | January  |    13 | Friday      |
|  3 | Male     |    37 | Clothing           |          1 |              500 |            500 |   2023 | May      |    21 | Sunday      |
|  4 | Male     |    30 | Beauty             |          2 |               50 |            100 |   2023 | May      |     6 | Saturday    |


In [None]:
# Create age bins.
# Intervals are left-closed / right-open (right=False) so that every edge age
# lands in the bucket its label names: 30 -> '30-39', 40 -> '40-49', 60 -> '60+'.
# With right-closed intervals those ages would fall into the bucket below.
# The final edge is infinity so that '60+' is genuinely open-ended.
# NOTE(review): ages below 20 fall outside every bin and get a NaN age_group —
# confirm the data contains no such rows, or add a lower bucket.
ageGrouped = engDates.copy()
ageBins = [20, 30, 40, 50, 60, float('inf')]
ageLabels = ['20-29','30-39','40-49','50-59','60+']
ageGrouped['age_group'] = pd.cut(ageGrouped['age'], bins=ageBins, labels=ageLabels, right=False)

# Surface rows that received no bucket instead of letting NaN pass silently.
unbinnedCount = int(ageGrouped['age_group'].isna().sum())
if unbinnedCount:
    print(f"Warning: {unbinnedCount} rows have no age_group (age missing or below {ageBins[0]})")


|    | gender   |   age | product_category   |   quantity |   price_per_unit |   total_amount |   year | month     |   day | dayOfWeek   | age_group   |
|---:|:---------|------:|:-------------------|-----------:|-----------------:|---------------:|-------:|:----------|------:|:------------|:------------|
|  8 | Male     |    63 | Electronics        |          2 |              300 |            600 |   2023 | December  |    13 | Wednesday   | 60+         |
| 13 | Male     |    64 | Clothing           |          4 |               30 |            120 |   2023 | January   |    17 | Tuesday     | 60+         |
| 18 | Female   |    62 | Clothing           |          2 |               25 |             50 |   2023 | September |    16 | Saturday    | 60+         |
| 24 | Female   |    64 | Beauty             |          1 |               50 |             50 |   2023 | December  |    26 | Tuesday     | 60+         |
| 56 | Female   |    63 | Beauty             |          1 |               30 |    

In [5]:
# One-hot encoding for the categorical columns gender, product_category,
# month and dayOfWeek.
# NOTE: this step is currently disabled — every line below is commented out,
# so no oneHotEncoded frame is created by this cell.
# oneHotEncoded = ageGrouped.copy()
# oneHotColumns = ['gender', 'product_category', 'month', 'dayOfWeek']

# oneHotEncoded = pd.get_dummies(oneHotEncoded, columns=oneHotColumns)
# print(oneHotEncoded.head().to_markdown())

In [6]:
# Take an independent copy of the age-grouped frame as the final feature table
engineeredData = ageGrouped.copy(deep=True)

# Write the table to CSV, leaving out the row index column
engineeredData.to_csv(path_or_buf='data/eng_data.csv', index=False)