# Task 2 - Data Exploration, Analysis, and Preprocessing 

This notebook covers data quality, integration, comprehensive exploration, and preparing the data for the modeling tasks.

## 2.1 Setup and Data Loading

In [78]:
# import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the two primary datasets (Power Generation and Sensor Data for each plant)
plant1_gen = pd.read_csv('Plant_1_Generation_Data.csv')
plant1_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')
plant2_gen = pd.read_csv('Plant_2_Generation_Data.csv')
plant2_weather = pd.read_csv('Plant_2_Weather_Sensor_Data.csv')


## 2.2 Data Quality and Integration

### 2.2.1 Data Quality Assessment

#### A. General checks

In [68]:
## Overall data shape check 

# Generation data

# ---------------------------------------------------------
# Plant 1 
print("\n--- Plant 1 Generation Data ---")

# Identify how many unique inverters there are
p1_inverters = plant1_gen['SOURCE_KEY'].unique()
p1_inverter_count = len(p1_inverters)
print(f"1. Unique Inverters identified: {p1_inverter_count}")

# Define how many readings are needed for 34 days, 15 minutes time-span
# Formula: 34 days * 24 hours * 4 intervals (15 min) * Number of Inverters
DAYS = 34
READINGS_PER_DAY = 24 * 4 # 96 readings
p1_expected_readings = DAYS * READINGS_PER_DAY * p1_inverter_count

print(f"2. Expected Readings (Target):  {p1_expected_readings:,}")
print(f"   (Calculation: {DAYS} days * {READINGS_PER_DAY} readings * {p1_inverter_count} inverters)")

# Check how many actual readings
p1_actual_readings = len(plant1_gen)
print(f"3. Actual Readings (Raw):       {p1_actual_readings:,}")

# Comparison
diff_p1 = p1_actual_readings - p1_expected_readings
print(f"4. Conclusion:                  {diff_p1:+,} Rows")

# ---------------------------------------------------------
# Plant 2 Analysis
print("\n--- Plant 2 Generation Data ---")

# Identify how many unique inverters there are
p2_inverters = plant2_gen['SOURCE_KEY'].unique()
p2_inverter_count = len(p2_inverters)
print(f"1. Unique Inverters identified: {p2_inverter_count}")

# Define how many readings are needed for 34 days, 15 minutes time-span
p2_expected_readings = DAYS * READINGS_PER_DAY * p2_inverter_count
print(f"2. Expected Readings (Target):  {p2_expected_readings:,}")

# Check how many actual readings
p2_actual_readings = len(plant2_gen)
print(f"3. Actual Readings (Raw):       {p2_actual_readings:,}")

# Comparison
diff_p2 = p2_actual_readings - p2_expected_readings
print(f"4. Conclusion:                  {diff_p2:+,} Rows")

#---------------------------------------------------------------------------------------------

# Weather data

import pandas as pd

print("=== WEATHER DATA SHAPE CHECK ===")

# Load Data
p1_weather = pd.read_csv('Plant_1_Weather_Sensor_Data.csv')
p2_weather = pd.read_csv('Plant_2_Weather_Sensor_Data.csv')

# Constants for 34 Days @ 15-min intervals
DAYS = 34
READINGS_PER_DAY = 24 * 4 # 96
EXPECTED_PER_SENSOR = DAYS * READINGS_PER_DAY

# ---------------------------------------------------------
# Plant 1 Analysis

print("\n--- Plant 1 Weather Data ---")

# 1. Identify Unique Sensors
p1_sensors = p1_weather['SOURCE_KEY'].unique()
p1_sensor_count = len(p1_sensors)
print(f"1. Unique Sensors identified:   {p1_sensor_count}")

# 2. Expected Readings
p1_expected = EXPECTED_PER_SENSOR * p1_sensor_count
print(f"2. Expected Readings (Target):  {p1_expected:,}")

# 3. Actual Readings
p1_actual = len(p1_weather)
print(f"3. Actual Readings (Raw):       {p1_actual:,}")

# 4. Conclusion
diff_p1 = p1_actual - p1_expected
if diff_p1 == 0:
    print(f"4. Conclusion:                  Exact Match (Perfect Data)")
else:
    print(f"4. Conclusion:                  {diff_p1:+,} Rows ")


# ---------------------------------------------------------
# Plant 2 Analysis

print("\n--- Plant 2 Weather Data ---")

# 1. Identify Unique Sensors
p2_sensors = p2_weather['SOURCE_KEY'].unique()
p2_sensor_count = len(p2_sensors)
print(f"1. Unique Sensors identified:   {p2_sensor_count}")

# 2. Expected Readings
p2_expected = EXPECTED_PER_SENSOR * p2_sensor_count
print(f"2. Expected Readings (Target):  {p2_expected:,}")

# 3. Actual Readings
p2_actual = len(p2_weather)
print(f"3. Actual Readings (Raw):       {p2_actual:,}")

# 4. Conclusion
diff_p2 = p2_actual - p2_expected
if diff_p2 == 0:
    print(f"4. Conclusion:                  Exact Match (Perfect Data)")
else:
    print(f"4. Conclusion:                  {diff_p2:+,} Rows ")


--- Plant 1 Generation Data ---
1. Unique Inverters identified: 22
2. Expected Readings (Target):  71,808
   (Calculation: 34 days * 96 readings * 22 inverters)
3. Actual Readings (Raw):       1,021,186
4. Conclusion:                  +949,378 Rows

--- Plant 2 Generation Data ---
1. Unique Inverters identified: 22
2. Expected Readings (Target):  71,808
3. Actual Readings (Raw):       1,421,196
4. Conclusion:                  +1,349,388 Rows
=== WEATHER DATA SHAPE CHECK ===

--- Plant 1 Weather Data ---
1. Unique Sensors identified:   1
2. Expected Readings (Target):  3,264
3. Actual Readings (Raw):       3,182
4. Conclusion:                  -82 Rows 

--- Plant 2 Weather Data ---
1. Unique Sensors identified:   1
2. Expected Readings (Target):  3,264
3. Actual Readings (Raw):       3,259
4. Conclusion:                  -5 Rows 


In [66]:
# Slice and do a visual inspection of the data to understand duplications

# Define "Primary Keys" to make a row unique.
key_columns = ['DATE_TIME', 'DC_POWER', 'SOURCE_KEY']

# Find rows with duplicate Keys AND NOT duplicate Data
has_duplicate_keys = plant1_gen.duplicated(subset=key_columns, keep=False)
is_exact_duplicate = plant1_gen.duplicated(keep=False)

# We only want rows where keys match, but data differs
conflict_mask = has_duplicate_keys & ~is_exact_duplicate

# Grab 1 random row to act as search parameter
target_row = plant1_gen[conflict_mask].sample(1)

# Retrieve the target row and its conflicts
result = plant1_gen.merge(target_row[key_columns], on=key_columns)

# Display result
print(result)

             DATE_TIME  PLANT_ID       SOURCE_KEY    DC_POWER    AC_POWER  \
0  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
1  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
2  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
3  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
4  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
5  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
6  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
7  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
8  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
9  2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
10 2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   
11 2020-05-16 09:00:00   4135001  ih0vzX44oOqAx2f  636.357143  623.414286   

**Conclusion:**
- When slicing data, we observe exact time stamps with all variables equal except for "operating_condition". Understanding this in detail and cleaning accordingly will be critical for the "Classification of Operating Conditions" task.

In [67]:
## Check data types
def get_types_report(df):
    """Generate a report of pandas dtypes and unique Python types for each column in the DataFrame."""
    types = {}
    for col in df.columns:
        # Get unique python types in the column (useful for spotting mixed types)
        py_types = df[col].map(lambda x: type(x).__name__).unique().tolist()
        types[col] = py_types

    result_df = pd.DataFrame({
        'column': list(types.keys()),
        'pandas_dtype': [df[col].dtype for col in types.keys()],
        'python_types': [types[col] for col in types.keys()]
    })
    return result_df

# Get unique Python types and pandas dtypes for the four datasets
print("Plant 1 Generation - Data Types Report:")
print(get_types_report(plant1_gen), "\n")

print("Plant 1 Weather - Data Types Report:")
print(get_types_report(plant1_weather), "\n")

print("Plant 2 Generation - Data Types Report:")
print(get_types_report(plant2_gen), "\n")

print("Plant 2 Weather - Data Types Report:")
print(get_types_report(plant2_weather))

Plant 1 Generation - Data Types Report:
                column    pandas_dtype  python_types
0            DATE_TIME  datetime64[ns]   [Timestamp]
1             PLANT_ID           int64         [int]
2           SOURCE_KEY          object         [str]
3             DC_POWER         float64       [float]
4             AC_POWER         float64       [float]
5          DAILY_YIELD         float64       [float]
6          TOTAL_YIELD         float64       [float]
7                  day           int64         [int]
8  Operating_Condition          object  [str, float] 

Plant 1 Weather - Data Types Report:
                column    pandas_dtype python_types
0            DATE_TIME  datetime64[ns]  [Timestamp]
1             PLANT_ID           int64        [int]
2           SOURCE_KEY          object        [str]
3  AMBIENT_TEMPERATURE         float64      [float]
4   MODULE_TEMPERATURE         float64      [float]
5          IRRADIATION         float64      [float] 

Plant 2 Generation - Data

**Conclusion:**
- Date formats: The variable "DATE_TIME" will be used to merge the data sets, however Currently, a type mismatch exists. "Plant 1 - Generation data" is stored in datetime format, while the other three are objects. Standarization will be required. Standardization is required not only to enable merging but to prevent temporal misalignment, specifically avoiding the parsing error where day/month swaps cause the 34-day timeline to erroneously appear as spanning 11 months

- Mixed formats: In "Plant 1 - Generation data" the variable "Operating_Condition" mariable exhibits mixed data types, strings and float. This means there are probably missing values in this column (NaN)

- Integers for categorical information: Variable Plant_ID is currently encoded as an integer across, is shown as an integer. However, it serves as a unique categorical identifier rather than a quantitative metric. This distinction must be explicitly noted during modeling to ensure algorithms do not misinterpret the ID as having numerical magnitude or order

#### B. Missing Values

In [46]:
## Check for missing values in the four data sets

def get_missing_data_report(df):
    """Generate a report of missing data percentages for each column in the DataFrame."""
    missing_data_report = pd.DataFrame({
        'Columns': df.columns,
        'Missing Values': df.isna().sum().values,
        'Percentage Missing': ((df.isna().sum().values / len(df)) * 100).round(2)
    })
    return missing_data_report

# Generate and print missing data report for the datasets
print("Power Generation 1 - Missing Values Report:")
print(get_missing_data_report(plant1_gen),"\n")
print("Weather Sensor 1 - Missing Values Report:")
print(get_missing_data_report(plant1_weather))
print("Power Generation 2 - Missing Values Report:")
print(get_missing_data_report(plant2_gen),"\n")
print("Weather Sensor 2- Missing Values Report:")
print(get_missing_data_report(plant2_weather))

Power Generation 1 - Missing Values Report:
               Columns  Missing Values  Percentage Missing
0            DATE_TIME               0                0.00
1             PLANT_ID               0                0.00
2           SOURCE_KEY               0                0.00
3             DC_POWER               0                0.00
4             AC_POWER               0                0.00
5          DAILY_YIELD               0                0.00
6          TOTAL_YIELD               0                0.00
7                  day               0                0.00
8  Operating_Condition           23098                2.26 

Weather Sensor 1 - Missing Values Report:
               Columns  Missing Values  Percentage Missing
0            DATE_TIME               0                 0.0
1             PLANT_ID               0                 0.0
2           SOURCE_KEY               0                 0.0
3  AMBIENT_TEMPERATURE               0                 0.0
4   MODULE_TEMPERATURE     

In [47]:
## Check for missing data ranges (rows) in the four data sets

# First we fix the Plan #1 - Generation dates

# Convert to datetime (this creates the errors like Jan 6 or May 6)
plant1_gen['DATE_TIME'] = pd.to_datetime(plant1_gen['DATE_TIME'])

# Identify ALL bad dates (Anything before the known start of May 15th to avoid false positives)
mask_bad_dates = plant1_gen['DATE_TIME'] < '2020-05-15'

if mask_bad_dates.sum() > 0:
    print(f"Fixing {mask_bad_dates.sum()} mis-parsed dates in Plant 1 Gen...")
    # Swap Day and Month for these specific rows
    plant1_gen.loc[mask_bad_dates, 'DATE_TIME'] = pd.to_datetime(
        plant1_gen.loc[mask_bad_dates, 'DATE_TIME'].dt.strftime('%Y-%d-%m %H:%M:%S')
    )

# -------------------------------------------------------
# Generate a report
# -------------------------------------------------------
def get_time_gap_report(df):
    """
    Generate a report of missing time intervals in the DataFrame.
    Assumes 'DATE_TIME' column exists.
    """
    # Ensure datetime and sort
    df = df.copy()
    df['DATE_TIME'] = pd.to_datetime(df['DATE_TIME'])
    df = df.sort_values('DATE_TIME')
    
    # Create the expected grid for the intervals (perfect 15 min intervals)
    start = df['DATE_TIME'].min()
    end = df['DATE_TIME'].max()
    expected_range = pd.date_range(start=start, end=end, freq='15T')
    
    # Calculate statistics
    expected_count = len(expected_range)
    actual_count = df['DATE_TIME'].nunique()
    missing_count = expected_count - actual_count
    pct_missing = (missing_count / expected_count) * 100
    
    # Create Report DataFrame
    report = pd.DataFrame({
        'Metric': ['Start Time', 'End Time', 'Expected Intervals', 'Actual Intervals', 'Missing Intervals', '% Missing'],
        'Value': [start, end, expected_count, actual_count, missing_count, round(pct_missing, 2)]
    })
    return report

# Generate and print time gap reports
print("Power Generation 1 - Time Gap Report:")
print(get_time_gap_report(plant1_gen),"\n")

print("Weather Sensor 1 - Time Gap Report:")
print(get_time_gap_report(plant1_weather),"\n")

print("Power Generation 2 - Time Gap Report:")
print(get_time_gap_report(plant2_gen),"\n")

print("Weather Sensor 2 - Time Gap Report:")
print(get_time_gap_report(plant2_weather))

Fixing 10426 mis-parsed dates in Plant 1 Gen...
Power Generation 1 - Time Gap Report:


  expected_range = pd.date_range(start=start, end=end, freq='15T')
  expected_range = pd.date_range(start=start, end=end, freq='15T')


               Metric                Value
0          Start Time  2020-05-15 00:00:00
1            End Time  2020-12-06 23:45:00
2  Expected Intervals                19776
3    Actual Intervals                 3158
4   Missing Intervals                16618
5           % Missing                84.03 

Weather Sensor 1 - Time Gap Report:
               Metric                Value
0          Start Time  2020-05-15 00:00:00
1            End Time  2020-06-17 23:45:00
2  Expected Intervals                 3264
3    Actual Intervals                 3182
4   Missing Intervals                   82
5           % Missing                 2.51 

Power Generation 2 - Time Gap Report:
               Metric                Value
0          Start Time  2020-05-15 00:00:00
1            End Time  2020-06-17 23:45:00
2  Expected Intervals                 3264
3    Actual Intervals                 3259
4   Missing Intervals                    5
5           % Missing                 0.15 

Weather Sensor 2 

  expected_range = pd.date_range(start=start, end=end, freq='15T')
  expected_range = pd.date_range(start=start, end=end, freq='15T')


**Conclusion:** 

Missing values (NaN):
- Plant 1: Generation data exhibits missing values. There are about 2.3% missing values in the Operating_Condition column of the data set. On the contrary, weather data shows 0 missing values (empty cells)
- Plant 2: There are no missing values in the generation or weather data.

Missing temporal dates (entire rows):
- Plant 1: Generation data is missing 106 intervals (3.25%) and weather data is missing 82 intervals (2.51%).
- Plant 2: The data set is nearly perfect, it only misses 5 intervals (0.15%) of information.

#### C. Inconsistencies & Anomalies

In [48]:
# SAFETY: Ensure all Time Columns are Datetime Objects ---
# plant1_gen, was already fixed in the previous section
plant1_weather['DATE_TIME'] = pd.to_datetime(plant1_weather['DATE_TIME'])
plant2_gen['DATE_TIME'] = pd.to_datetime(plant2_gen['DATE_TIME'])
plant2_weather['DATE_TIME'] = pd.to_datetime(plant2_weather['DATE_TIME'])

In [49]:
## Inconsistencies are checked for different situations for all data sets
# Grouping for iteration
gen_data = [("Plant 1", plant1_gen), ("Plant 2", plant2_gen)]
weather_data = [("Plant 1", plant1_weather), ("Plant 2", plant2_weather)]


## Part A: Generation data
print("\n--- GENERATION CHECKS ---")

# CHECK 1: Efficiency violation 
# Impossible for AC Output > DC Input (Thermodynamics)

# Plant 1: DC is in 100W units. Divide by 10 to convert to kW. Plant 2 DC is in kW. AC is in kW.
plant1_gen['DC_POWER'] = plant1_gen['DC_POWER'] / 10.0

print("\n[Gen Check 1] Efficiency violation (AC > DC)")
for name, df in gen_data:
    # We allow a tiny buffer (0.1 kW) for sensor timing mismatch
    errors = df[df['AC_POWER'] > df['DC_POWER'] + 0.1]
    print(f"  {name}: {len(errors)} rows failed.")

# CHECK 2: Negative power
# Solar panels cannot consume power (cannot be negative)
print("\n[Gen Check 2] Negative power (AC or DC < 0)")
for name, df in gen_data:
    errors = df[(df['AC_POWER'] < 0) | (df['DC_POWER'] < 0)]
    print(f"  {name}: {len(errors)} rows failed.")

# CHECK 3: Total daily yield logic
# Daily Yield should strictly increase. It only drops if the inverter resets (usually midnight).
print("\n[Gen Check 3] Yield logic break ")
for name, df in gen_data:
    # Vital: Sort by Inverter and Time, otherwise different inverters mix up the data
    df_sorted = df.sort_values(by=['SOURCE_KEY', 'DATE_TIME'])
    
    # Calculate the change in yield from the previous reading OF THE SAME INVERTER
    df_sorted['yield_diff'] = df_sorted.groupby('SOURCE_KEY')['DAILY_YIELD'].diff()
    
    # Check if the day changed (Midnight reset is normal)
    day_changed = df_sorted['DATE_TIME'].dt.date != df_sorted['DATE_TIME'].shift(1).dt.date
    
    # Flag: Yield dropped (negative diff) AND the day did NOT change
    errors = df_sorted[(df_sorted['yield_diff'] < 0) & (~day_changed)]
    print(f"  {name}: {len(errors)} rows failed.")

## Part B: Weather data

print("\n--- WEATHER CHECKS ---")

# CHECK 1: No irradiance at night
# Irradiance > 0 when it is dark (10 PM - 4 AM short range assuming its summer)
print("\n[Weather Check 1] Night irradiance (Irradiation > 0 at Night)")
for name, df in weather_data:
    hour = df['DATE_TIME'].dt.hour
    night_mask = (hour >= 22) | (hour < 4)
    errors = df[night_mask & (df['IRRADIATION'] > 0)]
    print(f"  {name}: {len(errors)} rows failed.")

# CHECK 2: Panel being hot without irradiance at night
# No Sun (0 Irr) but Module is significantly hotter than Ambient
print("\n[Weather Check 2] Hot panel at night (Mod Temp > Amb Temp + 5°C w/ No Sun)")
for name, df in weather_data:
    # If Irradiance is 0, Module shouldn't be hot compared to air
    errors = df[(df['IRRADIATION'] == 0) & 
                (df['MODULE_TEMPERATURE'] > df['AMBIENT_TEMPERATURE'] + 5.0)]
    print(f"  {name}: {len(errors)} rows failed.")

# CHECK 3: Broken sensors
# Ambient Temp stays exactly the same for 4 consecutive readings (1 hour). Testing for a extreme case.
print("\n[Weather Check 3] Broken sensors (Ambient Temp frozen for 1hr)")
for name, df in weather_data:
    df_sorted = df.sort_values(by='DATE_TIME') 
    
    # True if current value equals previous value
    is_frozen = df_sorted['AMBIENT_TEMPERATURE'].diff() == 0
    
    # Rolling sum: if 4 consecutive rows are "True", we have a 1-hour freeze
    frozen_blocks = is_frozen.rolling(4).sum() == 4
    print(f"  {name}: {frozen_blocks.sum()} frozen sequences found.")


--- GENERATION CHECKS ---

[Gen Check 1] Efficiency violation (AC > DC)
  Plant 1: 396 rows failed.
  Plant 2: 396 rows failed.

[Gen Check 2] Negative power (AC or DC < 0)
  Plant 1: 0 rows failed.
  Plant 2: 0 rows failed.

[Gen Check 3] Yield logic break 
  Plant 1: 439 rows failed.
  Plant 2: 1888 rows failed.

--- WEATHER CHECKS ---

[Weather Check 1] Night irradiance (Irradiation > 0 at Night)
  Plant 1: 0 rows failed.
  Plant 2: 24 rows failed.

[Weather Check 2] Hot panel at night (Mod Temp > Amb Temp + 5°C w/ No Sun)
  Plant 1: 0 rows failed.
  Plant 2: 0 rows failed.

[Weather Check 3] Broken sensors (Ambient Temp frozen for 1hr)
  Plant 1: 0 frozen sequences found.
  Plant 2: 0 frozen sequences found.


**Conclusion:** 

Generation data
- Plant 1: There are 396 instances (~0.6%) of efficiency violations (system reports that AC output is higher than DC, which is physically imposible). Also, yield logic fails at 439 instances (~0.6%).
- Plant 2: There are 396 instances (~0.6%) of efficiency violations (system reports that AC output is higher than DC, which is physically imposible). Also, yield logic fails at 1,888 instances (~2.7%). This high failure rate indicates severe data logger instability causing frequent mid-day resets for some.

Weather data
- Plant 1: Passed all consistency checks
- Plant 2: 24 instances (~0.7%) of "Night Sun" (Irradiance > 0 at night) This culd be due to the sunrise/sunset exact timings or callibration

### 2.2.2 Data Handling

From the previous analysis, we would like to execute the following updates:

A. General data type standardization
- Date parsing (converting all information to date_time appropriately)
- Numeric coercion (in preparation for imputation of values)
- Categorical encoding
- Scaling updates (Power Unit Scaling: Normalize AC and DC power units)

B. Temporal alignment & completeness
- Grid re-indexing (creating a master temporal grid of 15 minute frequency to ensure full coverage)

C. Missing values imputation
- Linear interpolation to address the missing data

D. Anomaly correction
- Efficiency Validation ($AC > DC$)
- Cumulative Yield Reconstruction
-Irradiance Clamping ("Night Sun")

In [79]:
# --- Initialize Clean Dataframes ---
plant1_gen_clean = plant1_gen.copy()
plant1_weather_clean = plant1_weather.copy()
plant2_gen_clean = plant2_gen.copy()
plant2_weather_clean = plant2_weather.copy()

print("Data Loaded. '_clean' dataframes initialized.")

Data Loaded. '_clean' dataframes initialized.


In [None]:
# --- 1. General data type standardization ---

# Initial conversion for Plant 1 Gen
plant1_gen_clean['DATE_TIME'] = pd.to_datetime(plant1_gen_clean['DATE_TIME'])

# CUSTOM FIX: Identify mis-parsed dates (e.g. June 1st read as Jan 6th)
# Logic: We know data starts May 15th. Any date before that is a parsing error.
mask_bad_dates = plant1_gen_clean['DATE_TIME'] < '2020-05-15'

if mask_bad_dates.sum() > 0:
    print(f"Fixing {mask_bad_dates.sum()} mis-parsed dates in Plant 1 Gen...")
    # Swap Day and Month for these specific rows
    plant1_gen_clean.loc[mask_bad_dates, 'DATE_TIME'] = pd.to_datetime(
        plant1_gen_clean.loc[mask_bad_dates, 'DATE_TIME'].dt.strftime('%Y-%d-%m %H:%M:%S')
    )

# Standard conversion for others (using dayfirst=True to prevent new errors)
plant1_weather_clean['DATE_TIME'] = pd.to_datetime(plant1_weather_clean['DATE_TIME'], dayfirst=True)
plant2_gen_clean['DATE_TIME'] = pd.to_datetime(plant2_gen_clean['DATE_TIME'], dayfirst=True)
plant2_weather_clean['DATE_TIME'] = pd.to_datetime(plant2_weather_clean['DATE_TIME'], dayfirst=True)

# --- 2. Numeric Coercion ---
# Force 'Operating_Condition' to numeric; 'Suboptimal' strings become NaN
plant1_gen_clean['Operating_Condition'] = pd.to_numeric(plant1_gen_clean['Operating_Condition'], errors='coerce')

# --- 3. Categorical Encoding ---
# Ensure PLANT_ID is a string label
for df in [plant1_gen_clean, plant1_weather_clean, plant2_gen_clean, plant2_weather_clean]:
    df['PLANT_ID'] = df['PLANT_ID'].astype(str)

# --- 4. Power Unit Scaling ---
# Plant 1 DC_POWER is scaled down by 10 to match Plant 2's magnitude (Watts vs kW check)
plant1_gen_clean['DC_POWER'] = plant1_gen_clean['DC_POWER'] / 10

print("Step 1: Standardization & Scaling Complete.")

Fixing 10426 mis-parsed dates in Plant 1 Gen...


  plant1_weather_clean['DATE_TIME'] = pd.to_datetime(plant1_weather_clean['DATE_TIME'], dayfirst=True)
  plant2_gen_clean['DATE_TIME'] = pd.to_datetime(plant2_gen_clean['DATE_TIME'], dayfirst=True)
  plant2_weather_clean['DATE_TIME'] = pd.to_datetime(plant2_weather_clean['DATE_TIME'], dayfirst=True)


Step 1: Standardization & Scaling Complete.


### 2.2.3 Integration

## 2.3 Exploratory Data Analysis

### 2.3.1 Statistical Summary

### 2.3.2 Visualizations

### 2.3.3 Trend Analysis

### 2.3.4 Correlation Analysis

### 2.3.5 Pattern Identification

## 2.4 Feature Engineering

### 2.4.1 Feature Scaling

### 2.4.2 Feature Selection