In [None]:
import pandas as pd
import logging
from tqdm.notebook import tqdm
from sklearn.preprocessing import MinMaxScaler

# Configure root logging: INFO and above, each record prefixed with a
# timestamp and its level name
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load merged dataset in chunks
def load_merged_data_in_chunks(file_path, chunksize=10000):
    """Yield successive DataFrame chunks read from a CSV file.

    This is a generator: nothing is logged or opened until the first chunk
    is requested.

    Args:
        file_path: Path of the CSV file to read.
        chunksize: Maximum number of rows in each yielded DataFrame.

    Yields:
        pandas.DataFrame holding up to ``chunksize`` rows of the file.
    """
    logging.info("Loading merged data in chunks from %s", file_path)
    reader = pd.read_csv(file_path, chunksize=chunksize)
    try:
        yield from reader
    finally:
        # Runs on exhaustion, on an error, and when the consumer abandons the
        # generator early, so the underlying file handle is always released.
        reader.close()

# Columns rescaled to the [0, 1] range by min-max scaling
_SCALED_COLUMNS = ['new_cases', 'new_deaths', 'total_cases', 'total_deaths']
# Number of consecutive rows averaged into 'NewCases_7day_avg'
_ROLLING_WINDOW = 7

# Apply feature engineering to data chunk
def apply_feature_engineering(chunk, scaler=None, history=None):
    """Add a rolling mean of 'new_cases' and min-max scale the count columns.

    The rolling mean is computed from the raw 'new_cases' values, i.e. before
    they are scaled. The chunk is modified in place and also returned.

    Args:
        chunk: DataFrame containing the 'new_cases', 'new_deaths',
            'total_cases' and 'total_deaths' columns.
        scaler: Optional, already fitted scaler exposing ``transform``. When
            given it is applied as-is, so every chunk can share one scale.
            When omitted, a new MinMaxScaler is fitted on this chunk alone.
        history: Optional Series holding the raw 'new_cases' values of the
            rows immediately preceding this chunk. When given, the rolling
            window continues across the chunk boundary instead of restarting
            with missing values.

    Returns:
        The same DataFrame, with 'NewCases_7day_avg' added and the four count
        columns replaced by their scaled values. A chunk with no rows is
        returned unscaled.
    """
    logging.info("Applying feature engineering")
    raw_new_cases = chunk['new_cases']
    if history is not None and len(history) > 0:
        # Prepend the carried-over rows so the window is already filled at
        # the start of the chunk, then keep only the rows of this chunk.
        extended = pd.concat([history, raw_new_cases], ignore_index=True)
        rolling_avg = extended.rolling(window=_ROLLING_WINDOW).mean()
        chunk['NewCases_7day_avg'] = rolling_avg.iloc[len(history):].to_numpy()
    else:
        chunk['NewCases_7day_avg'] = raw_new_cases.rolling(window=_ROLLING_WINDOW).mean()

    if len(chunk) == 0:
        # Nothing to scale; the scaler would reject zero-row input.
        return chunk

    features = chunk[_SCALED_COLUMNS]
    if scaler is None:
        scaled = MinMaxScaler().fit_transform(features)
    else:
        scaled = scaler.transform(features)
    chunk[_SCALED_COLUMNS] = scaled
    return chunk

# Feature engineering in batches
def feature_engineering_in_batches(input_file, output_file, chunksize=10000):
    """Engineer features for a CSV file chunk by chunk and save the result.

    The input is read twice. The first pass collects the minimum and maximum
    of every scaled column so that one scaler is shared by all chunks; the
    second pass applies the features and appends each chunk to the output, so
    only one chunk is held in memory at a time. The rolling mean is carried
    across chunk boundaries.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to write. It is replaced only after
            every chunk has been processed successfully.
        chunksize: Number of rows processed per chunk.

    Raises:
        ValueError: If the input yields no chunks at all.
    """
    import os  # local import: only needed for the temporary-file handling

    # Pass 1: per-chunk minima and maxima of the columns to be scaled.
    extremes = []
    for chunk in load_merged_data_in_chunks(input_file, chunksize):
        if len(chunk) == 0:
            continue
        features = chunk[_SCALED_COLUMNS]
        extremes.append(features.min())
        extremes.append(features.max())
    # Fitting on the per-chunk extremes gives the same overall min and max as
    # fitting on every row of the file.
    scaler = MinMaxScaler().fit(pd.DataFrame(extremes)) if extremes else None

    # Write next to the target and swap at the end, so a failure (or an
    # output path equal to the input path) never leaves a half-written file.
    # The extension is kept last so compression inference is unchanged.
    root, ext = os.path.splitext(output_file)
    tmp_file = f"{root}.tmp{ext}"
    keep = _ROLLING_WINDOW - 1  # preceding rows needed to fill a window
    history = None
    wrote_any = False
    try:
        # Pass 2: transform each chunk and append it to the temporary file.
        chunk_iter = load_merged_data_in_chunks(input_file, chunksize)
        for chunk in tqdm(chunk_iter, desc="Applying feature engineering"):
            # Copy the raw tail first: the chunk is scaled in place below.
            raw_tail = chunk['new_cases'].iloc[-keep:].copy()
            engineered_chunk = apply_feature_engineering(chunk, scaler=scaler, history=history)
            engineered_chunk.to_csv(
                tmp_file,
                mode='a' if wrote_any else 'w',
                header=not wrote_any,
                index=False,
            )
            wrote_any = True
            if history is None:
                history = raw_tail
            else:
                # Chunks shorter than the window need rows from older chunks.
                history = pd.concat([history, raw_tail], ignore_index=True).iloc[-keep:]
        if not wrote_any:
            raise ValueError(f"No data read from {input_file}")
        os.replace(tmp_file, output_file)
    finally:
        # Only present if processing failed before the swap.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)
    logging.info("Saved engineered data to %s", output_file)

# Feature engineering on merged dataset
input_file = "datasets/merged_final_dataset.csv"
output_file = "datasets/final_dataset_with_features.csv"

# Run the pipeline only when executed directly (as a script or a notebook
# cell), so that importing this module does not trigger any file I/O.
if __name__ == "__main__":
    logging.info("Starting feature engineering on merged dataset.")
    feature_engineering_in_batches(input_file, output_file)
    logging.info("Finished feature engineering on merged dataset.")
