In [1]:
import pandas as pd
import os

In [2]:
# Raw workbooks are read from input_dir; processed copies are written to output_dir.
input_dir = "data/raw/"
output_dir = "data/processed/filled_missing/"

# Create the destination folder (and any missing parents) up front.
# exist_ok=True makes re-running this cell harmless.
os.makedirs(output_dir, exist_ok=True)

In [10]:
def process_file(file_path, output_path, use_resample=False):
    """Fill gaps in every sheet of an Excel workbook and save the result.

    Each sheet is parsed, its 'Date' column is converted to datetime, and the
    rows are sorted by date. The sheet is then either resampled to monthly
    means or linearly interpolated column by column, and written to a sheet
    of the same name in the output workbook (without the index).

    Parameters
    ----------
    file_path : str or path-like
        Input workbook. Every sheet must contain a 'Date' column.
    output_path : str or path-like
        Destination workbook, written with the openpyxl engine.
    use_resample : bool, default False
        If True, replace the data with month-end means of the numeric
        columns. If False, linearly interpolate missing values in both
        directions.

    Raises
    ------
    KeyError
        If a sheet has no 'Date' column.
    """
    # Context managers guarantee both workbooks are closed (and the output is
    # saved through the public API) even if a sheet fails part-way through.
    with pd.ExcelFile(file_path) as excel, \
            pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for sheet in excel.sheet_names:
            df = excel.parse(sheet)

            # Ensure 'Date' is datetime for interpolation/resample
            df['Date'] = pd.to_datetime(df['Date'])
            df = df.sort_values('Date').set_index('Date')

            if use_resample:
                # Monthly averaging. The MonthEnd offset object is equivalent
                # to the 'M' alias (renamed 'ME' in newer pandas) and is
                # accepted by both old and new pandas versions.
                df = df.resample(pd.offsets.MonthEnd()).mean(numeric_only=True)
            else:
                # Linear interpolation (axis=0: interpolate each column over time).
                # Note: method='linear' treats rows as equally spaced and
                # ignores the actual spacing of the dates in the index.
                df = df.interpolate(method='linear', axis=0, limit_direction='both')

            df = df.reset_index()
            df.to_excel(writer, sheet_name=sheet, index=False)

    print(f"Saved filled file: {os.path.basename(file_path)}")



# Run process_file on every .xlsx workbook found directly in input_dir,
# writing each result to output_dir under the same file name.
# sorted(): os.listdir returns entries in arbitrary order, so sort to make
# the processing order (and the printed log) reproducible.
for filename in sorted(os.listdir(input_dir)):
    # Skip non-workbooks, and Excel's "~$" owner/lock files: they carry the
    # .xlsx suffix while a workbook is open but are not readable workbooks.
    if not filename.endswith(".xlsx") or filename.startswith("~$"):
        continue

    in_path = os.path.join(input_dir, filename)
    out_path = os.path.join(output_dir, filename)

    # Set use_resample=True to switch to monthly mean method
    process_file(in_path, out_path, use_resample=False)


Saved filled file: Aurangabad_weather.xlsx
Saved filled file: Gangapur_weather.xlsx
Saved filled file: Kannad_weather.xlsx
Saved filled file: Khuldabad_weather.xlsx
Saved filled file: Paithan_weather.xlsx
Saved filled file: Phulambri_weather.xlsx
Saved filled file: Sillod_weather.xlsx
Saved filled file: Soyagaon_weather.xlsx
Saved filled file: Vaijapur_weather.xlsx
