In [1]:
# 02_feature_engineering.ipynb

import os, sys
sys.path.append(os.path.abspath(".."))
sys.path.append(os.path.abspath("../src"))

import pandas as pd
from src.data_prep import build_daily_series
from src.feature_engineering import build_feature_frame
from src.config import DATA_PROCESSED_DIR, DATA_FEATURE_DIR

print("⚙ FEATURE ENGINEERING NOTEBOOK")

csv_name = input("Cleaned CSV in data/processed/ (e.g. cleaned_train.csv): ").strip()
clean_path = DATA_PROCESSED_DIR / csv_name

df = pd.read_csv(clean_path)
df.columns = df.columns.str.strip()

# Infer columns again just to be safe
from src.data_prep import _detect_date_column, _detect_target_column   # if you kept them private, expose wrappers
date_col = _detect_date_column(df)
target_col = _detect_target_column(df, date_col)

ts = build_daily_series(df, date_col, target_col)
data, feature_cols = build_feature_frame(ts)

print("\n✅ Feature frame created")
print("Feature columns:", feature_cols)

out_path = DATA_FEATURE_DIR / "final_training_dataset.csv"
print(f"Saved → {out_path}")
data.to_csv(out_path, index=False)
display(data.head())


⚙ FEATURE ENGINEERING NOTEBOOK


Cleaned CSV in data/processed/ (e.g. cleaned_train.csv):  adani_monthly_cleaned.csv


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\lalee\\Downloads\\Mini_Project_Shravani_Harel_Sales_Forecasting\\data\\processed\\adani_monthly_cleaned.csv'

In [6]:
import pandas as pd
import numpy as np

df = pd.read_csv("../data/cleaned/adani_monthly_cleaned.csv")
df.columns = df.columns.str.strip()

date_col = [c for c in df.columns if "date" in c.lower()][0]
df[date_col] = pd.to_datetime(df[date_col])
df = df.sort_values(date_col)

# Select Target (you choose later)
TARGET = "Auto_Select"    # placeholder

numeric_cols = df.select_dtypes(include=['float','int']).columns.tolist()
numeric_cols.remove("Date") if "Date" in numeric_cols else None

print("Select target from:", numeric_cols)

# Convert to time-series
ts = df[[date_col, numeric_cols[0]]]   # editable later
ts.set_index(date_col, inplace=True)
ts = ts.asfreq("D").interpolate()

# Generate Lag Features
for lag in [1, 2, 7, 14, 30]:
    ts[f"lag_{lag}"] = ts[numeric_cols[0]].shift(lag)

# Rolling Signals
ts["roll_7"]  = ts[numeric_cols[0]].rolling(7).mean()
ts["roll_30"] = ts[numeric_cols[0]].rolling(30).mean()
ts["volatility"] = ts[numeric_cols[0]].rolling(7).std()

ts = ts.dropna()
ts.to_csv("../data/features/feature_engineered.csv")

print("Feature dataset saved ✔")
ts.head()


Select target from: ['monthly_sales']
Feature dataset saved ✔


Unnamed: 0_level_0,monthly_sales,lag_1,lag_2,lag_7,lag_14,lag_30,roll_7,roll_30,volatility
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2016-03-01,1035674000000.0,1035802000000.0,1035941000000.0,1036636000000.0,1037608000000.0,1039832000000.0,1036081000000.0,1037678000000.0,297624700.0
2016-03-02,1035546000000.0,1035674000000.0,1035802000000.0,1036497000000.0,1037469000000.0,1039693000000.0,1035946000000.0,1037540000000.0,293371200.0
2016-03-03,1035418000000.0,1035546000000.0,1035674000000.0,1036358000000.0,1037330000000.0,1039554000000.0,1035811000000.0,1037402000000.0,288245300.0
2016-03-04,1035290000000.0,1035418000000.0,1035546000000.0,1036219000000.0,1037191000000.0,1039415000000.0,1035679000000.0,1037265000000.0,283088900.0
2016-03-05,1035163000000.0,1035290000000.0,1035418000000.0,1036080000000.0,1037052000000.0,1039276000000.0,1035548000000.0,1037127000000.0,278771700.0


In [None]:
from src.data_prep import load_and_clean_csv,build_daily_series
from src.feature_engineering import build_feature_frame

file=input("cleaned file in data/processed/: ")
df,date,target=load_and_clean_csv(f"data/processed/{file}")
ts=build_daily_series(df,date,target)
final,_=build_feature_frame(ts)
final.to_csv("data/features/final_training_dataset.csv",index=False)
final.head()
