In [14]:
# Install all required libraries for energy consumption forecasting
import subprocess
import sys

def install_package(package):
    """Install a package using pip"""
    try:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])
        print(f"✓ Successfully installed {package}")
    except subprocess.CalledProcessError as e:
        print(f"✗ Failed to install {package}: {e}")

# List of required packages
required_packages = [
    'ipykernel',           # Jupyter kernel support
    'pandas>=1.3.0',       # Data manipulation and analysis
    'numpy>=1.20.0',       # Numerical computing
    'matplotlib>=3.3.0',   # Plotting library
    'seaborn>=0.11.0',     # Statistical data visualization
    'scikit-learn>=1.0.0', # Machine learning library (for LabelEncoder, etc.)
    'lightgbm>=3.2.0',     # LightGBM for gradient boosting
    'jupyter',             # Jupyter notebook support
    'ipython',             # Interactive Python shell
    'tqdm',                # Progress bars
    'joblib',              # Parallel computing utilities
]

print("Installing required packages for Energy Consumption Forecasting...")
print("=" * 60)

for package in required_packages:
    install_package(package)

print("\n" + "=" * 60)
print("Installation completed!")
print("=" * 60)

# Verify installations
print("\nVerifying installations...")
try:
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import sklearn
    import lightgbm as lgb
    import ipykernel
    print("✓ All core packages imported successfully!")
    
    # Display versions
    print(f"\nPackage versions:")
    print(f"  Python: {sys.version.split()[0]}")
    print(f"  Pandas: {pd.__version__}")
    print(f"  NumPy: {np.__version__}")
    print(f"  Matplotlib: {plt.matplotlib.__version__}")
    print(f"  Seaborn: {sns.__version__}")
    print(f"  Scikit-learn: {sklearn.__version__}")
    print(f"  LightGBM: {lgb.__version__}")
    
except ImportError as e:
    print(f"✗ Import error: {e}")
    print("Please restart the kernel after installation completes.")

Installing required packages for Energy Consumption Forecasting...
✓ Successfully installed ipykernel
✓ Successfully installed ipykernel
✓ Successfully installed pandas>=1.3.0
✓ Successfully installed pandas>=1.3.0
✓ Successfully installed numpy>=1.20.0
✓ Successfully installed numpy>=1.20.0
✓ Successfully installed matplotlib>=3.3.0
✓ Successfully installed matplotlib>=3.3.0
✓ Successfully installed seaborn>=0.11.0
✓ Successfully installed seaborn>=0.11.0
✓ Successfully installed scikit-learn>=1.0.0
✓ Successfully installed scikit-learn>=1.0.0
✓ Successfully installed lightgbm>=3.2.0
✓ Successfully installed lightgbm>=3.2.0
✓ Successfully installed jupyter
✓ Successfully installed jupyter
✓ Successfully installed ipython
✓ Successfully installed ipython
✓ Successfully installed tqdm
✓ Successfully installed tqdm
✓ Successfully installed joblib

Installation completed!

Verifying installations...
✓ All core packages imported successfully!

Package versions:
  Python: 3.12.12
  Pandas: 

# Energy Consumption Forecasting - Data Preprocessing

This notebook combines building metadata, weather data, and meter readings to create a comprehensive dataset for LGBM model training.

## Data Sources:
- `building_metadata.csv`: Building characteristics (site_id, building_id, primary_use, square_feet, year_built, floor_count)
- `weather_train.csv`: Weather data by site and timestamp
- `train.csv`: Meter readings by building and timestamp (assumed to exist)

## Objectives:
1. Load and explore all datasets
2. Merge data sources appropriately
3. Handle missing values and data quality issues
4. Feature engineering for LGBM model
5. Create final preprocessed dataset

In [15]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
plt.style.use('default')
sns.set_palette("husl")

print("Libraries imported successfully!")
print(f"Pandas version: {pd.__version__}")
print(f"NumPy version: {np.__version__}")

Libraries imported successfully!
Pandas version: 2.3.3
NumPy version: 2.3.4


## 1. Data Loading and Initial Exploration

In [16]:
# Define data paths
data_path = '/Users/saatwik/Documents/Energy-consumption-forecasting/data/'

# Load building metadata
building_metadata = pd.read_csv(f"{data_path}building_metadata.csv")
print(f"Building metadata shape: {building_metadata.shape}")

# Load weather data
weather_data = pd.read_csv(f"{data_path}weather_train.csv")
print(f"Weather data shape: {weather_data.shape}")

# Load meter readings
try:
    meter_readings = pd.read_csv(f"{data_path}train.csv")
    print(f"Meter readings shape: {meter_readings.shape}")
    
    # Check for expected columns
    expected_cols = ['building_id', 'meter_reading', 'timestamp']
    missing_cols = [col for col in expected_cols if col not in meter_readings.columns]
    if missing_cols:
        print(f"Warning: Missing expected columns in train.csv: {missing_cols}")
    
    # Display basic info about meter readings
    print(f"Meter readings date range: {meter_readings['timestamp'].min()} to {meter_readings['timestamp'].max()}")
    print(f"Unique buildings in meter readings: {meter_readings['building_id'].nunique()}")
    if 'meter' in meter_readings.columns:
        print(f"Meter types in data: {meter_readings['meter'].value_counts().to_dict()}")
    
except FileNotFoundError:
    print("Warning: train.csv not found. Creating sample data for demonstration.")
    # Create sample meter readings data for demonstration
    np.random.seed(42)
    sample_buildings = building_metadata['building_id'].sample(min(100, len(building_metadata)))
    sample_dates = pd.date_range('2016-01-01', '2016-03-31', freq='H')
    
    meter_readings = []
    for building_id in sample_buildings:
        for date in sample_dates[:24*7]:  # 1 week of hourly data
            meter_readings.append({
                'building_id': building_id,
                'timestamp': date,
                'meter': np.random.choice([0, 1, 2, 3]),  # Different meter types
                'meter_reading': np.random.exponential(100) + np.random.normal(50, 10)
            })
    
    meter_readings = pd.DataFrame(meter_readings)
    print(f"Sample meter readings shape: {meter_readings.shape}")
    print("Note: Using sample data. Replace with actual train.csv when available.")

print("\n" + "="*60)
print("Data loaded successfully!")
print("="*60)

Building metadata shape: (1449, 6)
Weather data shape: (139773, 9)
Meter readings shape: (20216100, 4)
Meter readings shape: (20216100, 4)
Meter readings date range: 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Unique buildings in meter readings: 1449
Meter types in data: {0: 12060910, 1: 4182440, 2: 2708713, 3: 1264037}

Data loaded successfully!
Meter readings date range: 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Unique buildings in meter readings: 1449
Meter types in data: {0: 12060910, 1: 4182440, 2: 2708713, 3: 1264037}

Data loaded successfully!


In [17]:
# Display basic information about each dataset
print("BUILDING METADATA OVERVIEW")
print("="*40)
print(building_metadata.info())
print("\nFirst 5 rows:")
print(building_metadata.head())
print("\nUnique values in key columns:")
print(f"Unique sites: {building_metadata['site_id'].nunique()}")
print(f"Unique buildings: {building_metadata['building_id'].nunique()}")
print(f"Primary uses: {building_metadata['primary_use'].value_counts().head()}")

print("\n" + "="*80)
print("WEATHER DATA OVERVIEW")
print("="*40)
print(weather_data.info())
print("\nFirst 5 rows:")
print(weather_data.head())
print(f"\nUnique sites: {weather_data['site_id'].nunique()}")
print(f"Date range: {weather_data['timestamp'].min()} to {weather_data['timestamp'].max()}")

print("\n" + "="*80)
print("METER READINGS OVERVIEW")
print("="*40)
print(meter_readings.info())
print("\nFirst 5 rows:")
print(meter_readings.head())
print(f"\nUnique buildings: {meter_readings['building_id'].nunique()}")
if 'meter' in meter_readings.columns:
    print(f"Meter types: {meter_readings['meter'].value_counts()}")
print(f"Meter reading range: {meter_readings['meter_reading'].min():.2f} to {meter_readings['meter_reading'].max():.2f}")

BUILDING METADATA OVERVIEW
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1449 entries, 0 to 1448
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   site_id      1449 non-null   int64  
 1   building_id  1449 non-null   int64  
 2   primary_use  1449 non-null   object 
 3   square_feet  1449 non-null   int64  
 4   year_built   675 non-null    float64
 5   floor_count  355 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 68.1+ KB
None

First 5 rows:
   site_id  building_id primary_use  square_feet  year_built  floor_count
0        0            0   Education         7432      2008.0          NaN
1        0            1   Education         2720      2004.0          NaN
2        0            2   Education         5376      1991.0          NaN
3        0            3   Education        23685      2002.0          NaN
4        0            4   Education       116607      1975.0          NaN

Uni

## 2. Data Preprocessing and Datetime Handling

In [18]:
# Convert timestamp columns to datetime
print("Converting timestamps to datetime format...")

# Weather data timestamp
weather_data['timestamp'] = pd.to_datetime(weather_data['timestamp'])
print(f"Weather timestamp range: {weather_data['timestamp'].min()} to {weather_data['timestamp'].max()}")

# Meter readings timestamp (if exists in column)
if 'timestamp' in meter_readings.columns:
    meter_readings['timestamp'] = pd.to_datetime(meter_readings['timestamp'])
    print(f"Meter readings timestamp range: {meter_readings['timestamp'].min()} to {meter_readings['timestamp'].max()}")
else:
    print("No timestamp column found in meter readings - creating one")
    # If timestamp doesn't exist, create it (for sample data)
    if len(meter_readings) > 0:
        meter_readings['timestamp'] = pd.date_range('2016-01-01', periods=len(meter_readings), freq='H')

# Extract datetime features for better model performance
def extract_datetime_features(df, datetime_col='timestamp'):
    """Extract useful datetime features from timestamp column"""
    df = df.copy()
    df['hour'] = df[datetime_col].dt.hour
    df['day'] = df[datetime_col].dt.day
    df['month'] = df[datetime_col].dt.month
    df['year'] = df[datetime_col].dt.year
    df['weekday'] = df[datetime_col].dt.weekday  # 0=Monday, 6=Sunday
    df['is_weekend'] = (df['weekday'] >= 5).astype(int)
    df['day_of_year'] = df[datetime_col].dt.dayofyear
    
    # Cyclical encoding for better model understanding
    df['hour_sin'] = np.sin(2 * np.pi * df['hour'] / 24)
    df['hour_cos'] = np.cos(2 * np.pi * df['hour'] / 24)
    df['month_sin'] = np.sin(2 * np.pi * df['month'] / 12)
    df['month_cos'] = np.cos(2 * np.pi * df['month'] / 12)
    
    return df

# Apply datetime feature extraction to weather data
weather_data = extract_datetime_features(weather_data)
print(f"Weather data columns after datetime features: {weather_data.columns.tolist()}")

# Apply datetime feature extraction to meter readings
meter_readings = extract_datetime_features(meter_readings)
print(f"Meter readings columns after datetime features: {meter_readings.columns.tolist()}")

print("\nDatetime preprocessing completed!")

Converting timestamps to datetime format...
Weather timestamp range: 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Meter readings timestamp range: 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Weather data columns after datetime features: ['site_id', 'timestamp', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', 'hour', 'day', 'month', 'year', 'weekday', 'is_weekend', 'day_of_year', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
Meter readings timestamp range: 2016-01-01 00:00:00 to 2016-12-31 23:00:00
Weather data columns after datetime features: ['site_id', 'timestamp', 'air_temperature', 'cloud_coverage', 'dew_temperature', 'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed', 'hour', 'day', 'month', 'year', 'weekday', 'is_weekend', 'day_of_year', 'hour_sin', 'hour_cos', 'month_sin', 'month_cos']
Meter readings columns after datetime features: ['building_id', 'meter', 'timestamp', 'met

## 3. Data Merging Strategy

The merging strategy follows this logical flow:
1. **First merge**: Meter readings + Building metadata (on building_id)
2. **Second merge**: Result + Weather data (on site_id and timestamp)

This ensures we have all building characteristics and weather conditions for each meter reading.

In [19]:
# Step 1: Merge meter readings with building metadata
print("Step 1: Merging meter readings with building metadata...")
print(f"Meter readings shape before merge: {meter_readings.shape}")
print(f"Building metadata shape: {building_metadata.shape}")

# First merge: meter_readings + building_metadata on building_id
merged_data = pd.merge(
    meter_readings, 
    building_metadata, 
    on='building_id', 
    how='left'
)

print(f"Data shape after first merge: {merged_data.shape}")
print(f"Missing site_id values after merge: {merged_data['site_id'].isnull().sum()}")

# Check for buildings in meter_readings that are not in building_metadata
missing_buildings = meter_readings[~meter_readings['building_id'].isin(building_metadata['building_id'])]
if len(missing_buildings) > 0:
    print(f"Warning: {len(missing_buildings)} meter readings have building_ids not found in metadata")
    print(f"Missing building_ids: {missing_buildings['building_id'].unique()[:10]}")  # Show first 10

print("\n" + "="*60)
print("First merge completed successfully!")
print("="*60)

Step 1: Merging meter readings with building metadata...
Meter readings shape before merge: (20216100, 15)
Building metadata shape: (1449, 6)
Data shape after first merge: (20216100, 20)
Missing site_id values after merge: 0

First merge completed successfully!
Data shape after first merge: (20216100, 20)
Missing site_id values after merge: 0

First merge completed successfully!


In [20]:
# Step 2: Merge with weather data
print("Step 2: Merging with weather data...")
print(f"Current merged data shape: {merged_data.shape}")
print(f"Weather data shape: {weather_data.shape}")

# Second merge: merged_data + weather_data on site_id and timestamp
final_dataset = pd.merge(
    merged_data,
    weather_data,
    on=['site_id', 'timestamp'],
    how='left'
)

print(f"Final dataset shape: {final_dataset.shape}")
print(f"Missing weather data: {final_dataset['air_temperature'].isnull().sum()}")

print("\n" + "="*60)
print("Data merging completed!")
print("="*60)

# Display final dataset info
print("\nFINAL MERGED DATASET OVERVIEW")
print("="*40)
print(final_dataset.info())
print("\nFirst 3 rows:")
print(final_dataset.head(3))

# Summary statistics
print(f"\nDataset Summary:")
print(f"- Total records: {len(final_dataset):,}")
print(f"- Unique buildings: {final_dataset['building_id'].nunique()}")
print(f"- Unique sites: {final_dataset['site_id'].nunique()}")
print(f"- Date range: {final_dataset['timestamp'].min()} to {final_dataset['timestamp'].max()}")
print(f"- Primary building uses: {final_dataset['primary_use'].nunique()}")

# Check data completeness
missing_data = final_dataset.isnull().sum()
print(f"\nMissing data summary:")
print(missing_data[missing_data > 0])

Step 2: Merging with weather data...
Current merged data shape: (20216100, 20)
Weather data shape: (139773, 20)
Final dataset shape: (20216100, 38)
Missing weather data: 96658

Data merging completed!

FINAL MERGED DATASET OVERVIEW
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20216100 entries, 0 to 20216099
Data columns (total 38 columns):
 #   Column              Dtype         
---  ------              -----         
 0   building_id         int64         
 1   meter               int64         
 2   timestamp           datetime64[ns]
 3   meter_reading       float64       
 4   hour_x              int32         
 5   day_x               int32         
 6   month_x             int32         
 7   year_x              int32         
 8   weekday_x           int32         
 9   is_weekend_x        int64         
 10  day_of_year_x       int32         
 11  hour_sin_x          float64       
 12  hour_cos_x          float64       
 13  month_sin_x         float64       
 14  month_co

## 4. Data Quality Assessment and Cleaning

In [21]:
# Analyze data quality issues
print("DATA QUALITY ASSESSMENT")
print("="*50)

# Add memory management and error handling
import gc

try:
    print(f"Dataset shape for quality assessment: {final_dataset.shape}")
    
    # 1. Check for duplicates (with memory safety)
    print("Checking for duplicates...")
    if len(final_dataset) > 500000:  # Large dataset
        print("Large dataset - using sample for duplicate check...")
        sample_data = final_dataset.sample(n=100000, random_state=42)
        duplicate_rows = sample_data.duplicated().sum()
        print(f"Duplicate rows in sample: {duplicate_rows}")
    else:
        duplicate_rows = final_dataset.duplicated().sum()
        print(f"Duplicate rows: {duplicate_rows}")

    # 2. Analyze missing values in detail (safely)
    print("Analyzing missing values...")
    missing_data = {}
    for col in final_dataset.columns:
        try:
            missing_count = final_dataset[col].isnull().sum()
            if missing_count > 0:
                missing_data[col] = {
                    'Missing_Count': missing_count,
                    'Missing_Percentage': (missing_count / len(final_dataset)) * 100
                }
        except Exception as e:
            print(f"  Error analyzing column {col}: {e}")
    
    if missing_data:
        print(f"\nMissing data summary:")
        for col, stats in sorted(missing_data.items(), key=lambda x: x[1]['Missing_Percentage'], reverse=True)[:10]:
            print(f"  {col}: {stats['Missing_Count']} ({stats['Missing_Percentage']:.1f}%)")
    else:
        print("\nNo missing values found!")
        
except Exception as e:
    print(f"Error in quality assessment: {e}")
    duplicate_rows = 0  # Safe fallback

# 3. Check for outliers in meter readings
print(f"\nMeter reading statistics:")
print(final_dataset['meter_reading'].describe())

# Identify potential outliers (values > 99th percentile or < 1st percentile)
q99 = final_dataset['meter_reading'].quantile(0.99)
q01 = final_dataset['meter_reading'].quantile(0.01)
outliers = final_dataset[(final_dataset['meter_reading'] > q99) | (final_dataset['meter_reading'] < q01)]
print(f"Potential outliers (beyond 1st-99th percentile): {len(outliers)} rows ({len(outliers)/len(final_dataset)*100:.2f}%)")

# 4. Check for negative meter readings (should not exist)
negative_readings = (final_dataset['meter_reading'] < 0).sum()
print(f"Negative meter readings: {negative_readings}")

# 5. Check data consistency
print(f"\nData consistency checks:")
print(f"Building year range: {final_dataset['year_built'].min()} to {final_dataset['year_built'].max()}")
print(f"Square feet range: {final_dataset['square_feet'].min()} to {final_dataset['square_feet'].max()}")

# Handle missing values
print("\n" + "="*50)
print("CLEANING ACTIONS")
print("="*50)

try:
    # Create a cleaned dataset with memory management
    print("Creating cleaned dataset...")
    cleaned_dataset = final_dataset.copy()
    
    # Force garbage collection
    gc.collect()

    # 1. Handle missing weather data with error handling
    weather_columns = ['air_temperature', 'cloud_coverage', 'dew_temperature', 
                      'precip_depth_1_hr', 'sea_level_pressure', 'wind_direction', 'wind_speed']

    print("Handling missing weather data...")
    for col in weather_columns:
        if col in cleaned_dataset.columns:
            try:
                print(f"  Processing {col}...")
                # For large datasets, use chunked processing
                if len(cleaned_dataset) > 1000000:
                    print(f"    Using chunked processing for large dataset...")
                    # Process in chunks to avoid memory issues
                    chunk_size = 100000
                    for i in range(0, len(cleaned_dataset), chunk_size):
                        chunk_end = min(i + chunk_size, len(cleaned_dataset))
                        chunk = cleaned_dataset.iloc[i:chunk_end]
                        chunk[col] = chunk.groupby('site_id')[col].ffill().bfill()
                        cleaned_dataset.iloc[i:chunk_end, cleaned_dataset.columns.get_loc(col)] = chunk[col]
                else:
                    # Standard processing for smaller datasets
                    cleaned_dataset[col] = cleaned_dataset.groupby('site_id')[col].ffill().bfill()
                
                # Fill remaining missing values with median
                missing_count = cleaned_dataset[col].isnull().sum()
                if missing_count > 0:
                    median_val = cleaned_dataset[col].median()
                    cleaned_dataset[col].fillna(median_val, inplace=True)
                    print(f"    {col}: Filled {missing_count} values with median ({median_val:.2f})")
                else:
                    print(f"    {col}: No missing values after forward/backward fill")
                    
            except Exception as e:
                print(f"    Error processing {col}: {e}")
                # Fallback: fill with overall median
                if col in cleaned_dataset.columns:
                    median_val = cleaned_dataset[col].median()
                    if not pd.isna(median_val):
                        cleaned_dataset[col].fillna(median_val, inplace=True)
                        print(f"    {col}: Used fallback median fill")

except Exception as e:
    print(f"Error in weather data cleaning: {e}")
    cleaned_dataset = final_dataset.copy()  # Fallback

# 2. Handle missing building metadata
print("\nHandling missing building metadata...")
try:
    if 'square_feet' in cleaned_dataset.columns and cleaned_dataset['square_feet'].isnull().sum() > 0:
        print("  Processing square_feet...")
        # Fill missing square_feet with median by primary_use
        median_by_use = cleaned_dataset.groupby('primary_use')['square_feet'].median()
        missing_mask = cleaned_dataset['square_feet'].isnull()
        
        for use_type in cleaned_dataset[missing_mask]['primary_use'].unique():
            if pd.notna(use_type) and use_type in median_by_use.index:
                type_mask = (cleaned_dataset['primary_use'] == use_type) & missing_mask
                cleaned_dataset.loc[type_mask, 'square_feet'] = median_by_use[use_type]
        
        # Fill any remaining missing values with overall median
        remaining_missing = cleaned_dataset['square_feet'].isnull().sum()
        if remaining_missing > 0:
            overall_median = cleaned_dataset['square_feet'].median()
            cleaned_dataset['square_feet'].fillna(overall_median, inplace=True)
            
        print(f"    square_feet: Filled missing values with median by building type")

    if 'year_built' in cleaned_dataset.columns and cleaned_dataset['year_built'].isnull().sum() > 0:
        print("  Processing year_built...")
        median_year = cleaned_dataset['year_built'].median()
        cleaned_dataset['year_built'].fillna(median_year, inplace=True)
        print(f"    year_built: Filled missing values with median ({median_year})")

    if 'floor_count' in cleaned_dataset.columns and cleaned_dataset['floor_count'].isnull().sum() > 0:
        print("  Processing floor_count...")
        # Estimate floor count based on square feet (rough approximation)
        if 'square_feet' in cleaned_dataset.columns:
            estimated_floors = np.maximum(1, np.round(cleaned_dataset['square_feet'] / 15000))
            cleaned_dataset['floor_count'] = cleaned_dataset['floor_count'].fillna(estimated_floors)
        else:
            # Fallback to median
            median_floors = cleaned_dataset['floor_count'].median()
            if pd.notna(median_floors):
                cleaned_dataset['floor_count'].fillna(median_floors, inplace=True)
            else:
                cleaned_dataset['floor_count'].fillna(1, inplace=True)
        print(f"    floor_count: Estimated based on available data")

except Exception as e:
    print(f"  Error handling building metadata: {e}")

# 3. Remove or cap extreme outliers
print(f"\nHandling outliers...")
try:
    # Cap meter readings at 99.5th percentile to handle extreme outliers
    q995 = cleaned_dataset['meter_reading'].quantile(0.995)
    outliers_mask = cleaned_dataset['meter_reading'] > q995
    outliers_capped = outliers_mask.sum()
    
    if outliers_capped > 0:
        cleaned_dataset.loc[outliers_mask, 'meter_reading'] = q995
        print(f"  Capped {outliers_capped} extreme meter readings at 99.5th percentile ({q995:.2f})")
    else:
        print(f"  No extreme outliers found to cap")
        
except Exception as e:
    print(f"  Error handling outliers: {e}")

# 4. Remove duplicate rows if any
print(f"\nRemoving duplicates...")
try:
    if duplicate_rows > 0 and len(cleaned_dataset) < 1000000:  # Only for manageable datasets
        before_dedup = len(cleaned_dataset)
        cleaned_dataset.drop_duplicates(inplace=True)
        actual_removed = before_dedup - len(cleaned_dataset)
        print(f"  Removed {actual_removed} duplicate rows")
    else:
        print(f"  Skipping duplicate removal for large dataset or no duplicates found")
        
except Exception as e:
    print(f"  Error removing duplicates: {e}")

# Final cleanup and summary
try:
    print(f"\nCleaned dataset shape: {cleaned_dataset.shape}")
    remaining_nulls = cleaned_dataset.isnull().sum().sum()
    print(f"Remaining missing values: {remaining_nulls}")
    
    # Force garbage collection
    gc.collect()
    
except Exception as e:
    print(f"Error in final summary: {e}")
    # Ensure we have a valid cleaned dataset
    if 'cleaned_dataset' not in locals():
        cleaned_dataset = final_dataset.copy()
        print("Using fallback: original dataset as cleaned dataset")

DATA QUALITY ASSESSMENT
Dataset shape for quality assessment: (20216100, 38)
Checking for duplicates...
Large dataset - using sample for duplicate check...
Duplicate rows in sample: 0
Analyzing missing values...
Duplicate rows in sample: 0
Analyzing missing values...

Missing data summary:
  floor_count: 16709167 (82.7%)
  year_built: 12127645 (60.0%)
  cloud_coverage: 8825365 (43.7%)
  precip_depth_1_hr: 3749023 (18.5%)
  wind_direction: 1449048 (7.2%)
  sea_level_pressure: 1231669 (6.1%)
  wind_speed: 143676 (0.7%)
  dew_temperature: 100140 (0.5%)
  air_temperature: 96658 (0.5%)
  hour_y: 90495 (0.4%)

Meter reading statistics:

Missing data summary:
  floor_count: 16709167 (82.7%)
  year_built: 12127645 (60.0%)
  cloud_coverage: 8825365 (43.7%)
  precip_depth_1_hr: 3749023 (18.5%)
  wind_direction: 1449048 (7.2%)
  sea_level_pressure: 1231669 (6.1%)
  wind_speed: 143676 (0.7%)
  dew_temperature: 100140 (0.5%)
  air_temperature: 96658 (0.5%)
  hour_y: 90495 (0.4%)

Meter reading stat

## 5. Feature Engineering for LGBM Model

In [22]:
# Create additional features for the LGBM model
print("FEATURE ENGINEERING FOR LGBM MODEL")
print("="*50)

# Add memory management and error handling
import gc

try:
    # Start with the cleaned dataset
    feature_dataset = cleaned_dataset.copy()
    print(f"Starting feature engineering on dataset with shape: {feature_dataset.shape}")
    
    # Check memory usage and apply sampling if dataset is too large
    if len(feature_dataset) > 2000000:  # More than 2M rows
        print("Large dataset detected - applying memory-efficient processing...")
        # Use a representative sample for feature engineering if dataset is extremely large
        sample_size = min(500000, len(feature_dataset))
        print(f"Using sample of {sample_size:,} rows for memory efficiency...")
        feature_dataset = feature_dataset.sample(n=sample_size, random_state=42)
        
except Exception as e:
    print(f"Error initializing feature engineering: {e}")
    # Fallback to a smaller sample
    sample_size = min(10000, len(cleaned_dataset))
    print(f"Using fallback sample of {sample_size} rows...")
    feature_dataset = cleaned_dataset.sample(n=sample_size, random_state=42)

# 1. Building-related features
print("Creating building-related features...")

# Building age
if 'year_built' in feature_dataset.columns:
    # Ensure we have a year column (extract from timestamp if needed)
    if 'year' not in feature_dataset.columns and 'timestamp' in feature_dataset.columns:
        feature_dataset['year'] = pd.to_datetime(feature_dataset['timestamp']).dt.year
    
    # Calculate building age
    if 'year' in feature_dataset.columns:
        feature_dataset['building_age'] = feature_dataset['year'] - feature_dataset['year_built']
        feature_dataset['building_age'] = np.maximum(0, feature_dataset['building_age'])  # Ensure non-negative
    else:
        # Fallback: use current year if no year column available
        current_year = datetime.now().year
        feature_dataset['building_age'] = current_year - feature_dataset['year_built']
        feature_dataset['building_age'] = np.maximum(0, feature_dataset['building_age'])

# Log of square feet (to handle skewness)
if 'square_feet' in feature_dataset.columns:
    feature_dataset['log_square_feet'] = np.log1p(feature_dataset['square_feet'])

# Square feet per floor
if 'square_feet' in feature_dataset.columns and 'floor_count' in feature_dataset.columns:
    feature_dataset['sqft_per_floor'] = feature_dataset['square_feet'] / np.maximum(1, feature_dataset['floor_count'])

# Encode categorical variables
if 'primary_use' in feature_dataset.columns:
    # Label encode primary use
    from sklearn.preprocessing import LabelEncoder
    le_primary_use = LabelEncoder()
    feature_dataset['primary_use_encoded'] = le_primary_use.fit_transform(feature_dataset['primary_use'])
    print(f"  Primary use categories: {len(le_primary_use.classes_)}")

# 2. Weather-related features
print("Creating weather-related features...")

# Temperature features
if 'air_temperature' in feature_dataset.columns:
    # Convert to Celsius if in Fahrenheit (assuming Celsius if max < 50)
    if feature_dataset['air_temperature'].max() > 50:
        feature_dataset['temp_celsius'] = (feature_dataset['air_temperature'] - 32) * 5/9
    else:
        feature_dataset['temp_celsius'] = feature_dataset['air_temperature']
    
    # Temperature categories
    feature_dataset['temp_category'] = pd.cut(feature_dataset['temp_celsius'], 
                                            bins=[-50, 0, 10, 20, 30, 100], 
                                            labels=['very_cold', 'cold', 'mild', 'warm', 'hot'])
    
    # Encode temperature category
    le_temp = LabelEncoder()
    feature_dataset['temp_category_encoded'] = le_temp.fit_transform(feature_dataset['temp_category'])

# Weather comfort index (simple combination)
if all(col in feature_dataset.columns for col in ['temp_celsius', 'wind_speed', 'cloud_coverage']):
    feature_dataset['weather_comfort'] = (
        feature_dataset['temp_celsius'] * 0.6 - 
        feature_dataset['wind_speed'] * 0.2 + 
        (10 - feature_dataset['cloud_coverage']) * 0.2
    )

# 3. Time-based features (already created, but let's add more)
print("Creating additional time-based features...")

# Ensure datetime features exist (re-extract if needed)
if 'month' not in feature_dataset.columns and 'timestamp' in feature_dataset.columns:
    print("  Re-extracting datetime features...")
    feature_dataset = extract_datetime_features(feature_dataset, 'timestamp')

# Season based on month
def get_season(month):
    if month in [12, 1, 2]:
        return 'winter'
    elif month in [3, 4, 5]:
        return 'spring'
    elif month in [6, 7, 8]:
        return 'summer'
    else:
        return 'fall'

if 'month' in feature_dataset.columns:
    feature_dataset['season'] = feature_dataset['month'].apply(get_season)
else:
    print("  Warning: No month column available, skipping season feature")
if 'season' in feature_dataset.columns:
    le_season = LabelEncoder()
    feature_dataset['season_encoded'] = le_season.fit_transform(feature_dataset['season'])

# Time of day categories
def get_time_period(hour):
    if 6 <= hour < 12:
        return 'morning'
    elif 12 <= hour < 18:
        return 'afternoon'
    elif 18 <= hour < 22:
        return 'evening'
    else:
        return 'night'

if 'hour' in feature_dataset.columns:
    feature_dataset['time_period'] = feature_dataset['hour'].apply(get_time_period)
    le_time = LabelEncoder()
    feature_dataset['time_period_encoded'] = le_time.fit_transform(feature_dataset['time_period'])

# Working hours indicator
if all(col in feature_dataset.columns for col in ['hour', 'weekday']):
    feature_dataset['is_working_hours'] = ((feature_dataset['hour'] >= 8) & 
                                         (feature_dataset['hour'] <= 18) & 
                                         (feature_dataset['weekday'] < 5)).astype(int)
else:
    print("  Warning: Missing hour or weekday columns, skipping working hours feature")

# 4. Lag features (simplified version)
print("Creating lag features...")
try:
    # Sort by building and timestamp for proper lag calculation
    if 'timestamp' in feature_dataset.columns:
        print("  Sorting data by building and timestamp...")
        feature_dataset = feature_dataset.sort_values(['building_id', 'timestamp'])
        
        # Create lag features with memory-efficient processing
        print("  Creating 1-hour lag features...")
        feature_dataset['meter_reading_lag1h'] = feature_dataset.groupby('building_id')['meter_reading'].shift(1)
        
        # Only create longer lags if dataset is manageable
        if len(feature_dataset) < 1000000:  # Less than 1M rows
            print("  Creating 24-hour lag features...")
            feature_dataset['meter_reading_lag24h'] = feature_dataset.groupby('building_id')['meter_reading'].shift(24)
            
            print("  Creating 168-hour (7-day) lag features...")
            feature_dataset['meter_reading_lag168h'] = feature_dataset.groupby('building_id')['meter_reading'].shift(168)  # 7 days
            
            # Rolling statistics (mean of last 24 hours)
            print("  Creating rolling statistics...")
            feature_dataset['meter_reading_rolling_mean_24h'] = (
                feature_dataset.groupby('building_id')['meter_reading']
                .rolling(window=24, min_periods=1)
                .mean()
                .reset_index(0, drop=True)
            )
        else:
            print("  Skipping longer lag features due to dataset size")
            
        # Force garbage collection
        gc.collect()
        
    else:
        print("  Warning: No timestamp column found, skipping lag features")
        
except Exception as e:
    print(f"  Error creating lag features: {e}")
    print("  Continuing without lag features...")

# 5. Interaction features
print("Creating interaction features...")

# Building size × weather interactions
if all(col in feature_dataset.columns for col in ['log_square_feet', 'temp_celsius']):
    feature_dataset['size_temp_interaction'] = feature_dataset['log_square_feet'] * feature_dataset['temp_celsius']

# Time × building type interactions
if all(col in feature_dataset.columns for col in ['primary_use_encoded', 'hour']):
    feature_dataset['use_hour_interaction'] = feature_dataset['primary_use_encoded'] * feature_dataset['hour']

# Weekend × building type
if all(col in feature_dataset.columns for col in ['primary_use_encoded', 'is_weekend']):
    feature_dataset['use_weekend_interaction'] = feature_dataset['primary_use_encoded'] * feature_dataset['is_weekend']

print("\nFeature engineering completed!")

# 6. Prepare final feature set for modeling
print("\nPreparing final feature set...")

try:
    # Select features for modeling (exclude original timestamp and text columns)
    exclude_columns = ['timestamp', 'primary_use', 'temp_category', 'season', 'time_period']
    model_features = [col for col in feature_dataset.columns if col not in exclude_columns]

    print(f"Total features before filtering: {len(model_features)}")

    # Remove columns with high missing values (>50%)
    missing_threshold = 0.5
    print("Analyzing missing values...")
    
    # Safe missing value calculation
    missing_pct = {}
    for col in model_features:
        try:
            missing_pct[col] = feature_dataset[col].isnull().mean()
        except Exception:
            missing_pct[col] = 1.0  # Mark problematic columns as 100% missing
    
    valid_features = [col for col, pct in missing_pct.items() if pct <= missing_threshold]
    
    print(f"Features after removing high missing (>{missing_threshold*100}%): {len(valid_features)}")
    
    # Ensure we have essential columns
    essential_columns = ['building_id', 'meter_reading']
    for col in essential_columns:
        if col in feature_dataset.columns and col not in valid_features:
            valid_features.append(col)
            print(f"Added essential column: {col}")

    # Create final modeling dataset with error handling
    print("Creating final modeling dataset...")
    final_model_dataset = feature_dataset[valid_features].copy()

    # Fill any remaining missing values safely
    print("Filling remaining missing values...")
    numeric_columns = final_model_dataset.select_dtypes(include=[np.number]).columns
    if len(numeric_columns) > 0:
        medians = final_model_dataset[numeric_columns].median()
        final_model_dataset[numeric_columns] = final_model_dataset[numeric_columns].fillna(medians)
    
    # Handle any remaining non-numeric missing values
    for col in final_model_dataset.columns:
        if final_model_dataset[col].isnull().sum() > 0:
            if final_model_dataset[col].dtype == 'object':
                final_model_dataset[col].fillna('unknown', inplace=True)
            else:
                final_model_dataset[col].fillna(0, inplace=True)

    print(f"\nFinal dataset shape: {final_model_dataset.shape}")
    print(f"Features in final dataset: {len(valid_features)}")
    
    # Only print feature list if it's manageable
    if len(valid_features) <= 50:
        print(f"Final feature list: {valid_features}")
    else:
        print(f"Final features (first 10): {valid_features[:10]}...")
        print(f"                (last 10): {valid_features[-10:]}")
    
    # Force garbage collection
    gc.collect()
    
except Exception as e:
    print(f"Error in final dataset preparation: {e}")
    # Create a minimal fallback dataset
    essential_cols = ['building_id', 'meter_reading']
    available_cols = [col for col in essential_cols if col in feature_dataset.columns]
    
    if available_cols:
        final_model_dataset = feature_dataset[available_cols].copy()
        valid_features = available_cols
        print(f"Created fallback dataset with {len(available_cols)} essential columns")
    else:
        print("Critical error: No essential columns available")
        final_model_dataset = feature_dataset.iloc[:100].copy()  # Very small sample
        valid_features = list(final_model_dataset.columns)

FEATURE ENGINEERING FOR LGBM MODEL
Starting feature engineering on dataset with shape: (20216100, 38)
Large dataset detected - applying memory-efficient processing...
Using sample of 500,000 rows for memory efficiency...
Starting feature engineering on dataset with shape: (20216100, 38)
Large dataset detected - applying memory-efficient processing...
Using sample of 500,000 rows for memory efficiency...
Creating building-related features...
  Primary use categories: 16
Creating weather-related features...
Creating additional time-based features...
  Re-extracting datetime features...
Creating building-related features...
  Primary use categories: 16
Creating weather-related features...
Creating additional time-based features...
  Re-extracting datetime features...
Creating lag features...
  Sorting data by building and timestamp...
  Creating 1-hour lag features...
  Creating 24-hour lag features...
  Creating 168-hour (7-day) lag features...
  Creating rolling statistics...
Creating l

## 6. Dataset Reduction for Efficient Training

To make the dataset more manageable while maintaining representativeness across all buildings (1-1448), we'll apply intelligent sampling strategies.

In [23]:
# Dataset reduction while maintaining all buildings (1-1448)
print("DATASET REDUCTION FOR EFFICIENT TRAINING")
print("="*50)

# Add memory cleanup and error handling
import gc

try:
    # Check current dataset size
    print(f"Current dataset shape: {final_model_dataset.shape}")
    
    # Safe memory usage calculation
    try:
        memory_mb = final_model_dataset.memory_usage(deep=True).sum() / 1024**2
        print(f"Memory usage: {memory_mb:.2f} MB")
    except:
        print("Memory usage: Unable to calculate (large dataset)")

    # Analyze the distribution of data across buildings
    print("Analyzing building distribution...")
    building_counts = final_model_dataset['building_id'].value_counts().sort_index()
    print(f"Buildings range: {building_counts.index.min()} to {building_counts.index.max()}")
    print(f"Unique buildings: {len(building_counts)}")
    print(f"Average records per building: {building_counts.mean():.1f}")
    print(f"Records per building - Min: {building_counts.min()}, Max: {building_counts.max()}")
    
except Exception as e:
    print(f"Error in initial analysis: {e}")
    # Use a smaller sample if there are issues
    sample_size = min(10000, len(final_model_dataset))
    print(f"Using sample of {sample_size} rows for analysis...")
    sample_data = final_model_dataset.sample(n=sample_size, random_state=42)
    building_counts = sample_data['building_id'].value_counts().sort_index()
    print(f"Sample analysis - Unique buildings: {len(building_counts)}")

# Strategy 1: Smart temporal sampling - keep representative time periods
# This ensures we maintain patterns across different seasons, weekdays, and hours
print("\nApplying intelligent sampling strategies...")

# Create a reduced dataset
reduced_dataset = final_model_dataset.copy()

# Add timestamp back if it was excluded from final_model_dataset
print("Preparing dataset for reduction...")
try:
    if 'timestamp' not in reduced_dataset.columns and 'timestamp' in feature_dataset.columns:
        print("  Adding timestamp column back...")
        # Create a smaller timestamp lookup to avoid memory issues
        timestamp_data = feature_dataset[['building_id', 'timestamp', 'meter_reading']].copy()
        
        # Use chunked merge for large datasets
        if len(reduced_dataset) > 100000:
            print("    Using chunked merge for large dataset...")
            chunk_size = 50000
            merged_chunks = []
            
            for i in range(0, len(reduced_dataset), chunk_size):
                chunk = reduced_dataset.iloc[i:i+chunk_size]
                merged_chunk = pd.merge(chunk, timestamp_data, on=['building_id', 'meter_reading'], how='left')
                merged_chunks.append(merged_chunk)
                print(f"    Processed chunk {i//chunk_size + 1}/{(len(reduced_dataset)-1)//chunk_size + 1}")
            
            reduced_dataset = pd.concat(merged_chunks, ignore_index=True)
            del merged_chunks  # Free memory
        else:
            # Direct merge for smaller datasets
            reduced_dataset = pd.merge(reduced_dataset, timestamp_data, on=['building_id', 'meter_reading'], how='left')
        
        del timestamp_data  # Free memory
        gc.collect()  # Force garbage collection
        print(f"  Successfully added timestamp column")
    else:
        print("  Timestamp column already available or not needed")
        
except Exception as e:
    print(f"  Warning: Could not add timestamp column: {e}")
    print("  Proceeding without timestamp-based sampling...")

# Remove rows with excessive missing lag features (they don't add much value)
if 'meter_reading_lag168h' in reduced_dataset.columns:
    # Keep only rows where we have at least 1-hour lag data (more meaningful)
    before_lag_filter = len(reduced_dataset)
    reduced_dataset = reduced_dataset.dropna(subset=['meter_reading_lag1h'])
    print(f"  Removed {before_lag_filter - len(reduced_dataset):,} rows with missing 1-hour lag data")

# Strategy 2: Systematic time-based sampling while preserving all buildings
# Sample every 4th hour to reduce by ~75% while maintaining temporal patterns
def smart_temporal_sample(df, sample_rate=4):
    """Sample every nth hour while ensuring all buildings are represented"""
    
    # Check if timestamp column exists, if not, use index-based sampling
    if 'timestamp' in df.columns:
        # Sort by building and timestamp
        df_sorted = df.sort_values(['building_id', 'timestamp'])
    else:
        print("    Warning: No timestamp column found, using index-based sampling")
        # Sort by building_id only
        df_sorted = df.sort_values(['building_id'])
    
    sampled_rows = []
    
    for building_id in df_sorted['building_id'].unique():
        building_data = df_sorted[df_sorted['building_id'] == building_id]
        
        # For each building, sample systematically but ensure we get diverse patterns
        if len(building_data) > 100:  # If building has many records
            # Take every nth record + some random samples for diversity
            systematic_sample = building_data.iloc[::sample_rate]
            
            # Add some random samples to capture variability (10% of systematic sample)
            additional_samples = min(len(systematic_sample) // 10, 20)
            if additional_samples > 0:
                random_sample = building_data.sample(n=additional_samples, random_state=42)
                combined = pd.concat([systematic_sample, random_sample]).drop_duplicates()
            else:
                combined = systematic_sample
                
        elif len(building_data) > 50:  # Medium amount of records
            # Take every 2nd record
            combined = building_data.iloc[::2]
        else:  # Small number of records - keep all
            combined = building_data
            
        sampled_rows.append(combined)
    
    return pd.concat(sampled_rows, ignore_index=True)

# Apply smart sampling with error handling
print("  Applying systematic temporal sampling...")
try:
    before_sampling = len(reduced_dataset)
    
    # Check dataset size and adjust sampling if too large
    if before_sampling > 1000000:  # More than 1M rows
        print(f"    Large dataset detected ({before_sampling:,} rows), using more aggressive sampling...")
        sample_rate = 8  # Sample every 8th row instead of 4th
    else:
        sample_rate = 4
    
    reduced_dataset = smart_temporal_sample(reduced_dataset, sample_rate=sample_rate)
    print(f"  Reduced from {before_sampling:,} to {len(reduced_dataset):,} rows ({len(reduced_dataset)/before_sampling*100:.1f}% retained)")
    
    # Force garbage collection after sampling
    gc.collect()
    
except Exception as e:
    print(f"  Error during sampling: {e}")
    print("  Applying simple random sampling as fallback...")
    # Fallback to simple random sampling
    max_samples = min(100000, len(reduced_dataset))  # Maximum 100k samples
    reduced_dataset = reduced_dataset.sample(n=max_samples, random_state=42)
    print(f"  Used fallback sampling: {len(reduced_dataset):,} rows")

# Verify all buildings are still represented
buildings_after = reduced_dataset['building_id'].nunique()
buildings_before = final_model_dataset['building_id'].nunique()
print(f"  Buildings retained: {buildings_after}/{buildings_before} ({buildings_after/buildings_before*100:.1f}%)")

if buildings_after < buildings_before:
    print("  Warning: Some buildings were lost during sampling!")
    missing_buildings = set(final_model_dataset['building_id'].unique()) - set(reduced_dataset['building_id'].unique())
    print(f"  Missing buildings: {sorted(list(missing_buildings))[:10]}...")  # Show first 10
    
    # Add back at least one record for each missing building
    for building_id in missing_buildings:
        building_data = final_model_dataset[final_model_dataset['building_id'] == building_id]
        if len(building_data) > 0:
            # Add one representative record (preferably with complete data)
            complete_rows = building_data.dropna()
            if len(complete_rows) > 0:
                sample_row = complete_rows.sample(n=1, random_state=42)
            else:
                sample_row = building_data.sample(n=1, random_state=42)
            reduced_dataset = pd.concat([reduced_dataset, sample_row], ignore_index=True)
    
    print(f"  Final buildings after restoration: {reduced_dataset['building_id'].nunique()}")

# Final cleanup and optimization
print("\nFinal dataset optimization...")

# Remove any remaining duplicates
before_dedup = len(reduced_dataset)
reduced_dataset = reduced_dataset.drop_duplicates()
print(f"  Removed {before_dedup - len(reduced_dataset)} duplicate rows")

# Sort by building and timestamp for better data locality (if timestamp exists)
if 'timestamp' in reduced_dataset.columns:
    reduced_dataset = reduced_dataset.sort_values(['building_id', 'timestamp']).reset_index(drop=True)
else:
    reduced_dataset = reduced_dataset.sort_values(['building_id']).reset_index(drop=True)
    print("  Warning: No timestamp column available for sorting")

# Final statistics
print(f"\nFINAL REDUCED DATASET:")
print(f"- Original size: {final_model_dataset.shape[0]:,} rows")
print(f"- Reduced size: {reduced_dataset.shape[0]:,} rows")
print(f"- Reduction: {(1 - len(reduced_dataset)/len(final_model_dataset))*100:.1f}%")
print(f"- Buildings maintained: {reduced_dataset['building_id'].nunique()}/1448")
print(f"- Features: {reduced_dataset.shape[1]}")
print(f"- Memory usage: {reduced_dataset.memory_usage(deep=True).sum() / 1024**2:.2f} MB")

# Verify data quality is maintained
print(f"\nData quality check:")
if 'timestamp' in reduced_dataset.columns:
    print(f"- Date range maintained: {reduced_dataset['timestamp'].min()} to {reduced_dataset['timestamp'].max()}")
else:
    print("- Date range: Not available (timestamp column missing)")
print(f"- All building types present: {reduced_dataset['primary_use'].nunique() if 'primary_use' in reduced_dataset.columns else 'N/A'}")
print(f"- Target variable range: {reduced_dataset['meter_reading'].min():.2f} to {reduced_dataset['meter_reading'].max():.2f}")

# Update the final model dataset to use the reduced version
final_model_dataset = reduced_dataset.copy()
print(f"\n✓ Dataset successfully reduced while maintaining representativeness!")

DATASET REDUCTION FOR EFFICIENT TRAINING
Current dataset shape: (500000, 64)
Memory usage: 225.07 MB
Analyzing building distribution...
Buildings range: 0 to 1448
Unique buildings: 1449
Average records per building: 345.1
Records per building - Min: 13, Max: 911

Applying intelligent sampling strategies...
Preparing dataset for reduction...
  Adding timestamp column back...
    Using chunked merge for large dataset...
    Processed chunk 1/10
    Processed chunk 2/10
    Processed chunk 3/10
    Processed chunk 4/10
    Processed chunk 5/10
    Processed chunk 2/10
    Processed chunk 3/10
    Processed chunk 4/10
    Processed chunk 5/10
    Processed chunk 6/10
    Processed chunk 7/10
    Processed chunk 6/10
    Processed chunk 7/10
    Processed chunk 8/10
    Processed chunk 9/10
    Processed chunk 10/10
    Processed chunk 8/10
    Processed chunk 9/10
    Processed chunk 10/10
  Successfully added timestamp column
  Successfully added timestamp column
  Removed 0 rows with mis

In [24]:
# Save the processed dataset for model training
print("SAVING PROCESSED DATASET")
print("="*40)

# Save to CSV
output_path = '/Users/saatwik/Documents/Energy-consumption-forecasting/data/processed_dataset.csv'
final_model_dataset.to_csv(output_path, index=False)
print(f"Processed dataset saved to: {output_path}")

# Save feature names for later use
feature_names = [col for col in final_model_dataset.columns if col != 'meter_reading']
feature_names_path = '/Users/saatwik/Documents/Energy-consumption-forecasting/data/feature_names.txt'
with open(feature_names_path, 'w') as f:
    for feature in feature_names:
        f.write(f"{feature}\n")
print(f"Feature names saved to: {feature_names_path}")

# Display final summary
print(f"\nFINAL DATASET SUMMARY:")
print(f"- Shape: {final_model_dataset.shape}")
print(f"- Features: {len(feature_names)}")
print(f"- Target variable: meter_reading")
print(f"- Ready for LGBM training: ✓")

# Display data types
print(f"\nData types summary:")
print(final_model_dataset.dtypes.value_counts())

# Basic statistics
print(f"\nTarget variable (meter_reading) statistics:")
print(final_model_dataset['meter_reading'].describe())

print("\n" + "="*60)
print("PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY!")
print("="*60)
print("Next steps:")
print("1. Use 'processed_dataset.csv' for model training")
print("2. Target variable: 'meter_reading'")
print("3. All features are numeric and ready for LGBM")
print("4. Consider train/validation split based on timestamp")
print("="*60)

SAVING PROCESSED DATASET
Processed dataset saved to: /Users/saatwik/Documents/Energy-consumption-forecasting/data/processed_dataset.csv
Feature names saved to: /Users/saatwik/Documents/Energy-consumption-forecasting/data/feature_names.txt

FINAL DATASET SUMMARY:
- Shape: (1019524, 65)
- Features: 64
- Target variable: meter_reading
- Ready for LGBM training: ✓

Data types summary:
float64           39
int64             13
int32             12
datetime64[ns]     1
Name: count, dtype: int64

Target variable (meter_reading) statistics:
count    1.019524e+06
mean     4.090041e+02
std      1.844360e+03
min      0.000000e+00
25%      0.000000e+00
50%      0.000000e+00
75%      4.170000e-02
max      9.401151e+03
Name: meter_reading, dtype: float64

PREPROCESSING PIPELINE COMPLETED SUCCESSFULLY!
Next steps:
1. Use 'processed_dataset.csv' for model training
2. Target variable: 'meter_reading'
3. All features are numeric and ready for LGBM
4. Consider train/validation split based on timestamp
Pr