**Setup**

In [1]:
# Imports
import warnings
from infrastructure import DataEngineering

# Suppressors: hide FutureWarning messages so they do not clutter cell output
warnings.simplefilter("ignore", FutureWarning)

# One DataEngineering instance; every cell below goes through this object
engineering = DataEngineering()

**Main Data Section**

- Change Dates to Integer Numbers

In [2]:
# Disabled alternative (kept for reference): turn the date columns into integer
# numbers with engineering.handle_date instead of using the date-formatted
# copies built by the active cell that follows.
# NOTE(review): train_data, test_data, holiday_data, oil_data and
# transactions_data are bare names that no visible cell defines; the active
# cell reads them as attributes (e.g. engineering.train_data), so these lines
# would presumably need the same prefix before they can be re-enabled.
# updated_train_data = engineering.handle_date(train_data)
# updated_test_data = engineering.handle_date(test_data)
# updated_holiday_data = engineering.handle_date(holiday_data)
# updated_oil_data = engineering.handle_date(oil_data)
# updated_transactions_data = engineering.handle_date(transactions_data, transactions=True)

- (Alternative Approach) Leave the dates as they are so that they can be handled as a time series

In [3]:
# Working copies of every dated dataset, each passed through copy_and_format_date
(
    updated_train_data,
    updated_test_data,
    updated_holiday_data,
    updated_oil_data,
    updated_transactions_data,
) = map(
    engineering.copy_and_format_date,
    (
        engineering.train_data,
        engineering.test_data,
        engineering.holiday_data,
        engineering.oil_data,
        engineering.transactions_data,
    ),
)

# Store data goes through copy_data only (the original note calls this a shallow copy)
updated_store_data = engineering.copy_data(engineering.store_data)

- Create a NumPy array holding the unique `family` values that need to be encoded as categories, where:
    - 'AUTOMOTIVE': 0
    - 'BABY CARE': 1 
    - 'BEAUTY': 2
    - 'BEVERAGES': 3
    - 'BOOKS': 4
    - 'BREAD/BAKERY': 5
    - 'CELEBRATION': 6
    - 'CLEANING': 7
    - 'DAIRY': 8
    - 'DELI': 9
    - 'EGGS': 10
    - 'FROZEN FOODS': 11
    - 'GROCERY I': 12
    - 'GROCERY II': 13
    - 'HARDWARE': 14
    - 'HOME AND KITCHEN I': 15
    - 'HOME AND KITCHEN II': 16
    - 'HOME APPLIANCES': 17
    - 'HOME CARE': 18
    - 'LADIESWEAR': 19
    - 'LAWN AND GARDEN': 20
    - 'LINGERIE': 21
    - 'LIQUOR,WINE,BEER': 22
    - 'MAGAZINES': 23
    - 'MEATS': 24
    - 'PERSONAL CARE': 25
    - 'PET SUPPLIES': 26 
    - 'PLAYERS AND ELECTRONICS': 27
    - 'POULTRY': 28
    - 'PREPARED FOODS': 29
    - 'PRODUCE': 30
    - 'SCHOOL AND OFFICE SUPPLIES': 31
    - 'SEAFOOD': 32

In [4]:
# De-duplicated "family" values; passed as `cols` to the one-hot encoder in the next cell.
# NOTE: `target_feature` is reassigned later in the Holiday Section, so the family
# encoding cell has to run before that reassignment.
target_feature = engineering.remove_duplicates_from_feature(
    series=updated_train_data["family"]
)

- Apply One Hot Encoder

In [5]:
# One-hot encode "family" on the train and test frames with the same category list,
# so both calls receive identical `cols`.
updated_train_data, updated_test_data = (
    engineering.ohe_categorization(df=frame, field="family", cols=target_feature)
    for frame in (updated_train_data, updated_test_data)
)

- (Alternative) Apply Ordinal Encoder

In [6]:
# Disabled alternative to the one-hot cell above: encode "family" with
# engineering.ordinal_categorization instead.
# NOTE(review): presumably mutually exclusive with the one-hot encoding of the
# same column - confirm before enabling.
# updated_train_data = engineering.ordinal_categorization(updated_train_data, "family")
# updated_test_data = engineering.ordinal_categorization(updated_test_data, "family")

**Oil Section**

- Query Oil Data for each of Train and Test

In [7]:
# Split the oil data by date: train keeps rows up to 2014-08-13,
# test keeps rows from 2017-08-16 onwards (no upper bound is applied).
train_oil_data, test_oil_data = (
    engineering.query_data(updated_oil_data, date_filter)
    for date_filter in ("date <= '2014-08-13'", "date >= '2017-08-16'")
)

- Include Oil Price Column in Train Data

In [8]:
# Attach the oil data to the train rows by date.
# 93.14 is the fill value handed to merge_and_fill; it is also the oil_price shown
# on the 2013-01-01 rows of the saved output.
# NOTE(review): presumably the first quoted price in the oil data - TODO confirm.
train_dataframe = engineering.merge_and_fill(
    df1=updated_train_data,
    df2=train_oil_data,
    on="date",
    fillna=True,
    filler=93.14,
)

- Include Oil Price Column in Test Data

In [9]:
# Attach the oil data to the test rows by date.
# NOTE(review): unlike the train merge, no fillna/filler is passed here, so whatever
# merge_and_fill does by default applies - confirm missing prices are acceptable.
test_dataframe = engineering.merge_and_fill(
    df1=updated_test_data,
    df2=test_oil_data,
    on="date",
)

- Rename Column: dcoilwtico to oil_price for better readability

In [10]:
# Rename the raw oil column "dcoilwtico" to "oil_price" on both frames.
# rename() returns a new frame, so nothing is mutated in place and the cell
# leaves no loop variable behind in the notebook namespace.
train_dataframe, test_dataframe = (
    frame.rename(columns={"dcoilwtico": "oil_price"})
    for frame in (train_dataframe, test_dataframe)
)

**Holiday Section**

- Create a NumPy array holding the unique holiday `type` values that need to be encoded as categories, where:
  - 'Holiday': 0
  - 'Transfer': 1 
  - 'Additional': 2 
  - 'Bridge': 3
  - 'Work Day': 4
  - 'Event': 5

In [11]:
# De-duplicated holiday "type" values; this overwrites the family list that
# `target_feature` held earlier and is later passed as `cols` for the "type" encoding.
target_feature = engineering.remove_duplicates_from_feature(series=updated_holiday_data["type"])

- Add Holiday Types Column for Train

In [12]:
# Bring the holiday information into the train frame, limited to holiday rows
# dated 2013-01-01 through 2014-08-13 (both bounds inclusive in the query).
train_dataframe = engineering.handle_holidays_data(
    query="'2013-01-01' <= date <= '2014-08-13'",
    holiday_df=updated_holiday_data,
    left_df=train_dataframe,
)

- Add Holiday Types Column for Test

In [13]:
# Bring the holiday information into the test frame, limited to holiday rows
# dated 2017-08-16 through 2017-08-31 (both bounds inclusive in the query).
test_dataframe = engineering.handle_holidays_data(
    query="'2017-08-16' <= date <= '2017-08-31'",
    holiday_df=updated_holiday_data,
    left_df=test_dataframe,
)

- Apply One Hot Encoder

In [14]:
# One-hot encode the holiday "type" column on both frames, using the holiday
# category list currently held in `target_feature`.
train_dataframe, test_dataframe = (
    engineering.ohe_categorization(df=frame, field="type", cols=target_feature)
    for frame in (train_dataframe, test_dataframe)
)

- Add the Remaining Holiday Types (those that do not occur in the test period) as Separate Columns in the Test Data

In [15]:
# Add the holiday-type columns that the train output has but that, per the note
# above, the test frame still lacks, so both frames end up with the same columns.
# They are filled with 0.0 instead of being copied from "holiday": a copy would
# mark every row flagged as "holiday" as additional, bridge and work_day too.
# A column that already exists is left untouched, and assign() returns a new
# frame, so re-running the cell does not clobber real encoder output.
test_dataframe = test_dataframe.assign(
    **{
        missing_type: 0.0
        for missing_type in ("additional", "bridge", "work_day")
        if missing_type not in test_dataframe.columns
    }
)

- (Alternative) Apply Ordinal Encoder

In [16]:
# Disabled alternative to the one-hot encoding of "type": encode it with
# engineering.ordinal_categorization instead.
# NOTE(review): this acts on updated_holiday_data, not on the train/test frames,
# so it would presumably have to run before the holiday data is merged into
# them - confirm before enabling.
# updated_holiday_data = engineering.ordinal_categorization(updated_holiday_data, "type")

**Stores Section**

- Merge Stores Data with Train and Test

In [17]:
# Attach the store data to both frames, matching rows on "store_nbr".
# No fillna/filler is passed, so merge_and_fill's defaults apply.
train_dataframe, test_dataframe = (
    engineering.merge_and_fill(df1=frame, df2=updated_store_data, on="store_nbr")
    for frame in (train_dataframe, test_dataframe)
)

**Transactions Section (Droppable)**

- Query Transactions Data for each of Train and Test

In [18]:
# Disabled (this section is marked droppable): restrict the transactions data
# to the train window, i.e. rows dated up to 2014-08-13.
# train_transactions_data = engineering.query_data(
#     updated_transactions_data, "date <= '2014-08-13'"
# )

- Include Transaction Column in Train Data

In [19]:
# Disabled (this section is marked droppable): merge the transactions data into
# the train frame on ("date", "store_nbr") with fillna enabled.
# NOTE(review): df2 is the unfiltered updated_transactions_data, not the
# train_transactions_data produced by the query cell above - confirm which one
# is intended before enabling.
# train_dataframe = engineering.merge_and_fill(
#     df1=train_dataframe,
#     df2=updated_transactions_data,
#     on=["date", "store_nbr"],
#     fillna=True,
# )

**Generate Output Files**

- This code generates the files quickly, but the columns in the output will be in a scattered order

In [20]:
# Write both engineered frames out through the project helper.
# NOTE(review): in the saved train output below, id 1047815.0 appears on two rows
# (indices 1064636 and 1064637) with different dates - check whether one of the
# merges duplicates rows.
engineering.generate_file(filename="train", df=train_dataframe)
engineering.generate_file(filename="test", df=test_dataframe)

Unnamed: 0,id,date,store_nbr,sales,onpromotion,automotive,baby_care,beauty,beverages,books,...,produce,school_and_office_supplies,seafood,oil_price,holiday,transfer,additional,bridge,work_day,cluster
0,0.0,2013-01-01,1.0,0.000,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,93.14,0.0,0.0,1.0,0.0,0.0,13.0
1,1.0,2013-01-01,1.0,0.000,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,93.14,0.0,0.0,1.0,0.0,0.0,13.0
2,2.0,2013-01-01,1.0,0.000,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,93.14,0.0,0.0,1.0,0.0,0.0,13.0
3,3.0,2013-01-01,1.0,0.000,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,93.14,0.0,0.0,1.0,0.0,0.0,13.0
4,4.0,2013-01-01,1.0,0.000,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,93.14,0.0,0.0,1.0,0.0,0.0,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1064634,1047813.0,2014-12-08,9.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,97.61,0.0,0.0,0.0,0.0,1.0,6.0
1064635,1047814.0,2014-12-08,9.0,0.000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,97.61,0.0,0.0,0.0,0.0,1.0,6.0
1064636,1047815.0,2014-12-08,9.0,8.088,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,97.61,0.0,0.0,0.0,0.0,1.0,6.0
1064637,1047815.0,2014-02-11,9.0,8.088,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,97.61,0.0,0.0,1.0,0.0,0.0,6.0
