# How to use this file?
1. Make sure that lgb_model_summarized.pkl and xgb_model_summarized.pkl are in the same directory as the notebook.
2. Just hit run all to run all the code blocks.
3. When prompted, enter the path of the folder containing all the input CSVs.
4. The processed weather features will be saved to processed_weather_data.csv and the predictions to demand_predictions_ensemble.csv.

In [None]:
%%capture
!pip install jpholiday
!pip install joblib
!pip install xgboost
!pip install lightgbm
!pip install numpy
!pip install pandas
!pip install scikit-learn

In [None]:
import os
from typing import Dict, Optional

import joblib
import jpholiday
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [None]:
class WeatherDataPipeline:
    """Combine per-city weather CSVs into one scaled feature table.

    Each CSV in a folder is read as one city's observations, the cities are
    outer-joined on ``datetime``, collapsed into a single set of weather
    features (weighted mean, or most-common value for wind direction),
    enriched with calendar features and finally min-max scaled.

    Args:
        city_weights: Mapping of lower-case city name (the CSV file stem) to
            the weight that city receives in the weighted averages.
    """

    # Compass bearing in degrees for each wind-direction label found in the
    # input files. '静穏' (calm) shares 0 with '北' (north); '×' and any
    # unknown label map to the -1 "invalid" sentinel.
    _WIND_DIRECTION_DEGREES = {
        '南': 180, '南南西': 202.5, '南南東': 157.5, '西': 270, '北西': 315,
        '南西': 225, '西北西': 292.5, '西南西': 247.5, '南東': 135,
        '北北西': 337.5, '北東': 45, '東南東': 112.5, '東': 90, '北': 0,
        '北北東': 22.5, '東北東': 67.5, '静穏': 0, '×': -1
    }

    # Weather features produced by aggregate_features, matched against the
    # per-city columns by prefix.
    _FEATURES = (
        'precipitation', 'temperature', 'dew_point_temperature', 'humidity',
        'wind_speed', 'snowfall', 'wind_direction',
    )

    def __init__(self, city_weights: Dict[str, float]):
        self.city_weights = city_weights
        self.scaler = MinMaxScaler()

    def translate_wind_direction(self, direction_chinese: str) -> float:
        """Convert a kanji wind-direction label to degrees.

        Args:
            direction_chinese: Direction label as it appears in the CSV
                (the labels handled here are Japanese compass names).

        Returns:
            The bearing in degrees, or -1 when the label is '×' or unknown
            (including missing values).
        """
        return self._WIND_DIRECTION_DEGREES.get(direction_chinese, -1)

    def process_single_file(self, file_path: str, city_name: str) -> Optional[pd.DataFrame]:
        """Load one city's CSV and suffix its columns with the city name.

        Args:
            file_path: Path of the CSV; it must contain ``datetime`` and
                ``wind_direction`` columns.
            city_name: Suffix appended to every column except ``datetime``.

        Returns:
            The prepared DataFrame, or ``None`` if the file could not be read
            or processed (the error is printed and the city is skipped).
        """
        # Best effort by design: one unreadable file must not abort the run.
        try:
            df = pd.read_csv(file_path, parse_dates=['datetime'])
            df['wind_direction'] = df['wind_direction'].apply(self.translate_wind_direction)
            return df.rename(
                columns={col: f"{col}_{city_name}" for col in df.columns if col != 'datetime'}
            )
        except Exception as e:
            print(f"Error processing {city_name} file: {e}")
            return None

    def weighted_mode(self, series: pd.Series) -> float:
        """Return the most frequent valid wind direction in ``series``.

        Despite the name, every city counts equally; ``city_weights`` is not
        consulted. Ties resolve to the smallest value.

        Args:
            series: Wind directions in degrees, one entry per city.

        Returns:
            The most common direction, or -1 when no valid reading exists.
        """
        # -1 marks an invalid reading and NaN a missing one; neither is a
        # direction, so neither may win the vote.
        valid_data = series[series != -1].dropna()
        if valid_data.empty:
            return -1
        unique_values, counts = np.unique(valid_data, return_counts=True)
        return unique_values[np.argmax(counts)]

    def merge_city_data(self, city_dfs: Dict[str, Optional[pd.DataFrame]]) -> pd.DataFrame:
        """Outer-join all city frames on ``datetime`` and patch short gaps.

        Args:
            city_dfs: Mapping of city name to its prepared DataFrame; ``None``
                entries (failed loads) are ignored.

        Returns:
            A new DataFrame sorted by ``datetime`` in which single-row gaps
            are filled from the neighbouring row and rows without any weather
            value are dropped. The input frames are not modified.

        Raises:
            ValueError: If no entry of ``city_dfs`` is a DataFrame.
        """
        valid_dfs = [df for df in city_dfs.values() if df is not None]
        if not valid_dfs:
            raise ValueError("No valid DataFrames to merge")

        merged_df = valid_dfs[0]
        for df in valid_dfs[1:]:
            merged_df = pd.merge(merged_df, df, on='datetime', how='outer')

        # Non-inplace sort returns a new frame, so the caller's DataFrame is
        # left untouched even when only one city is present.
        merged_df = merged_df.sort_values('datetime')

        weather_cols = [col for col in merged_df.columns if col != 'datetime']
        if weather_cols:
            # Fill isolated gaps only: at most one row forward, then one back.
            merged_df[weather_cols] = merged_df[weather_cols].ffill(limit=1).bfill(limit=1)

        return merged_df.dropna(subset=weather_cols, how='all')

    def aggregate_features(self, merged_df: pd.DataFrame) -> pd.DataFrame:
        """Collapse per-city columns into one column per weather feature.

        Wind direction uses the most common valid value across cities; every
        other feature is the ``city_weights``-weighted mean of the columns
        whose city suffix has a weight.

        Args:
            merged_df: Output of ``merge_city_data`` with columns named
                ``<feature>_<city>``.

        Returns:
            A DataFrame with ``datetime`` plus one column per feature and one
            row per distinct timestamp (first occurrence wins). A feature
            with no weighted city column is set to 0.
        """
        # Rebuild a clean 0..n-1 index: merged_df keeps labels from before it
        # was sorted/filtered, and assigning by those labels would attach
        # values to the wrong timestamps.
        merged = merged_df.drop_duplicates(subset='datetime').reset_index(drop=True)
        result_df = pd.DataFrame({'datetime': merged['datetime']})

        for feature in self._FEATURES:
            feature_cols = [col for col in merged.columns if col.startswith(feature)]

            if feature == 'wind_direction':
                result_df[feature] = merged[feature_cols].apply(self.weighted_mode, axis=1)
                continue

            weighted_sum = pd.Series(0.0, index=merged.index)
            weights_sum = 0
            for col in feature_cols:
                # The city name is whatever follows the last underscore.
                city = col.split('_')[-1]
                if city in self.city_weights:
                    weight = self.city_weights[city]
                    # NOTE(review): a missing reading counts as 0 while its
                    # weight still enters the denominator, pulling the mean
                    # toward 0. Kept as-is so the features stay consistent
                    # with what the saved models were presumably trained on.
                    weighted_sum += merged[col].fillna(0) * weight
                    weights_sum += weight

            result_df[feature] = weighted_sum / weights_sum if weights_sum > 0 else 0

        return result_df

    def add_time_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Add hour, weekday, month, holiday and weekend columns.

        Args:
            df: Frame with a datetime-typed ``datetime`` column; it is
                modified in place.

        Returns:
            The same DataFrame with the calendar columns added.
        """
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['month'] = df['datetime'].dt.month
        df['is_holiday'] = df['datetime'].apply(jpholiday.is_holiday)
        # dayofweek is Monday=0, so 5 and 6 are Saturday and Sunday.
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
        return df

    def scale_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Min-max scale every column except ``datetime``.

        Args:
            df: Feature frame; it is modified in place.

        Returns:
            The same DataFrame with the scaled values.
        """
        # NOTE(review): the scaler is re-fitted on whatever data is passed
        # in, so the scaling depends on this batch's own min/max - confirm
        # this matches how the models' training data was scaled.
        numerical_columns = df.columns.difference(['datetime'])
        df[numerical_columns] = self.scaler.fit_transform(df[numerical_columns])
        return df

    def process_folder(self, folder_path: str) -> pd.DataFrame:
        """Run the whole pipeline over every CSV in ``folder_path``.

        Args:
            folder_path: Directory holding one CSV per city; the lower-cased
                file stem is used as the city name.

        Returns:
            The aggregated, time-enriched and scaled feature table.

        Raises:
            ValueError: If no CSV in the folder could be loaded.
        """
        city_dfs = {}

        # Sorted so the merge order does not depend on the file system.
        for file_name in sorted(os.listdir(folder_path)):
            if file_name.lower().endswith('.csv'):
                city_name = os.path.splitext(file_name)[0].lower()
                file_path = os.path.join(folder_path, file_name)
                city_dfs[city_name] = self.process_single_file(file_path, city_name)

        merged_df = self.merge_city_data(city_dfs)
        final_df = self.aggregate_features(merged_df)
        final_df = self.add_time_features(final_df)
        final_df = self.scale_features(final_df)

        return final_df



In [None]:

# City weights chosen on the basis of different factors such as population,
# industry, and urban/rural character.
city_weights = {
    'osaka': 1.00, 'kobe': 0.85, 'kyoto': 0.75, 'wakayama': 0.55,
    'hikone': 0.35, 'toyooka': 0.25, 'shionomisaki': 0.10, 'nara': 0.45
}

# Pasted paths often carry stray whitespace or surrounding quotes; remove
# them and fail early with a clear message instead of an os.listdir error.
folder_path = input("Enter path of folder with all input CSVs:").strip().strip('"\'')
if not os.path.isdir(folder_path):
    raise NotADirectoryError(f"Input folder not found: {folder_path!r}")

# Build the feature table and persist it for the prediction step.
pipeline = WeatherDataPipeline(city_weights)
processed_data = pipeline.process_folder(folder_path)
processed_data.to_csv('processed_weather_data.csv', index=False)

In [None]:
# Restore the two trained regressors from disk.
# NOTE(review): joblib.load unpickles the file and can execute arbitrary
# code - only load model files that come from a trusted source.
xgb_model, lgb_model = (
    joblib.load(model_file)
    for model_file in ("xgb_model_summarized.pkl", "lgb_model_summarized.pkl")
)

# Read back the feature table written by the preprocessing step.
processed_data = pd.read_csv("processed_weather_data.csv", parse_dates=["datetime"])

# Every column other than the timestamp is a model input.
features = processed_data.columns.drop("datetime").tolist()
model_inputs = processed_data[features]

# Score the same inputs with each model.
xgb_pred = xgb_model.predict(model_inputs)
lgb_pred = lgb_model.predict(model_inputs)

# Blend the two predictions with a weighted average; tune the weights
# according to each model's performance.
w_xgb = 0.5
w_lgb = 0.5
ensemble_pred = (w_xgb * xgb_pred) + (w_lgb * lgb_pred)

# Attach the blended prediction and export timestamp + demand only.
processed_data["demand"] = ensemble_pred
processed_data[["datetime", "demand"]].to_csv(
    "demand_predictions_ensemble.csv", index=False
)

print("Ensemble predictions saved successfully to demand_predictions_ensemble.csv")


Ensemble predictions saved successfully to demand_predictions_ensemble.csv
