# Task 2 - Data Exploration, Analysis, and Preprocessing 

This notebook covers data quality, integration, comprehensive exploration, and preparation of the data for the modeling tasks.

## 2.1 Setup and Data Loading

In [1]:
# import necessary libraries
import pandas as pd

# Read the two primary CSV files (power generation and weather-sensor data).
# The paths are relative, so they resolve against the current working directory.
power_generation_df, weather_sensor_df = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Plant_1_Generation_Data.csv',
        '../data/Plant_1_Weather_Sensor_Data.csv',
    )
]

In [7]:
# Count how many rows share each (DATE_TIME, PLANT_ID, SOURCE_KEY, Operating_Condition)
# combination; a count above 1 means that combination occurs on several rows.
# groupby() leaves out rows whose key contains NaN by default, so rows without an
# Operating_Condition value are not represented in this table.
(
    power_generation_df
    .groupby(['DATE_TIME', 'PLANT_ID', 'SOURCE_KEY', 'Operating_Condition'])
    .size()
    .reset_index(name='counts')
)

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,Operating_Condition,counts
0,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,Optimal,2
1,2020-05-15 00:00:00,4135001,1BY6WEcLGh8j5v7,Suboptimal,19
2,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,Optimal,2
3,2020-05-15 00:00:00,4135001,1IF53ai7Xc0U56Y,Suboptimal,19
4,2020-05-15 00:00:00,4135001,3PZuoBAID5Wc2HD,Optimal,2
...,...,...,...,...,...
86969,2020-06-17 23:45:00,4135001,uHbuxQJl8lW7ozc,Suboptimal,22
86970,2020-06-17 23:45:00,4135001,wCURE6d3bPkepu2,Suboptimal,22
86971,2020-06-17 23:45:00,4135001,z9Y9gH1T5YWrNuG,Suboptimal,22
86972,2020-06-17 23:45:00,4135001,zBIq5rxdHJRwDNY,Suboptimal,22


## 2.2 Data Quality and Integration

### 2.2.1 Data Quality Assessment

#### Missing Values

In [9]:
def get_missing_data_report(df):
    """Summarise the missing values in each column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns: 'Columns' (the
        column label), 'Missing Values' (number of entries flagged by
        ``isna``) and 'Percentage Missing' (that number as a percentage of
        ``len(df)``, rounded to 2 decimals). For a frame with zero rows the
        percentage is reported as 0.0 instead of NaN.
    """
    # Count once and reuse the result for both the absolute and relative figures.
    missing_counts = df.isna().sum().values
    row_count = len(df)

    if row_count:
        percentage_missing = ((missing_counts / row_count) * 100).round(2)
    else:
        # An empty frame has nothing missing; this also avoids a 0/0 division
        # (NaN plus a RuntimeWarning).
        percentage_missing = [0.0] * len(missing_counts)

    return pd.DataFrame({
        'Columns': df.columns,
        'Missing Values': missing_counts,
        'Percentage Missing': percentage_missing,
    })

# Build the per-column missing-value summary for each dataset, then print both
power_generation_missing_report = get_missing_data_report(power_generation_df)
weather_sensor_missing_report = get_missing_data_report(weather_sensor_df)

print("Power Generation - Missing Values Report:")
# The extra "\n" argument leaves a blank line between the two reports
print(power_generation_missing_report, "\n")
print("Weather Sensor - Missing Values Report:")
print(weather_sensor_missing_report)

Power Generation - Missing Values Report:
               Columns  Missing Values  Percentage Missing
0            DATE_TIME               0                0.00
1             PLANT_ID               0                0.00
2           SOURCE_KEY               0                0.00
3             DC_POWER               0                0.00
4             AC_POWER               0                0.00
5          DAILY_YIELD               0                0.00
6          TOTAL_YIELD               0                0.00
7                  day               0                0.00
8  Operating_Condition           23098                2.26 

Weather Sensor - Missing Values Report:
               Columns  Missing Values  Percentage Missing
0            DATE_TIME               0                 0.0
1             PLANT_ID               0                 0.0
2           SOURCE_KEY               0                 0.0
3  AMBIENT_TEMPERATURE               0                 0.0
4   MODULE_TEMPERATURE         

- About 2.3% of the rows (23,098) in the __Power Generation__ dataset have a missing value in the _Operating_Condition_ column.
- There are no missing values in the __Weather Sensor__ dataset.

#### Data Types

In [10]:
def get_types_report(df):
    """Report the pandas dtype and the observed Python value types of every column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns 'column' (label),
        'pandas_dtype' (the column's dtype) and 'python_types' (list of the
        distinct ``type(value).__name__`` strings found in the column, in
        order of first appearance). Missing values stored as NaN are Python
        floats, so they show up as 'float'.
    """
    def _type_names(series):
        # Name of the Python type of each element, de-duplicated
        return series.map(lambda value: type(value).__name__).unique().tolist()

    python_types_by_column = {name: _type_names(df[name]) for name in df.columns}
    column_names = list(python_types_by_column)

    return pd.DataFrame({
        'column': column_names,
        'pandas_dtype': [df[name].dtype for name in column_names],
        'python_types': list(python_types_by_column.values()),
    })

# Print the dtype / Python-type report for each dataset under its own heading
for report_header, dataset_df in (
    ("\nPower Generation - Data Types Report:", power_generation_df),
    ("\nWeather Sensor - Data Types Report:", weather_sensor_df),
):
    print(report_header)
    print(get_types_report(dataset_df))


Power Generation - Data Types Report:
                column pandas_dtype  python_types
0            DATE_TIME       object         [str]
1             PLANT_ID        int64         [int]
2           SOURCE_KEY       object         [str]
3             DC_POWER      float64       [float]
4             AC_POWER      float64       [float]
5          DAILY_YIELD      float64       [float]
6          TOTAL_YIELD      float64       [float]
7                  day        int64         [int]
8  Operating_Condition       object  [str, float]

Weather Sensor - Data Types Report:
                column pandas_dtype python_types
0            DATE_TIME       object        [str]
1             PLANT_ID        int64        [int]
2           SOURCE_KEY       object        [str]
3  AMBIENT_TEMPERATURE      float64      [float]
4   MODULE_TEMPERATURE      float64      [float]
5          IRRADIATION      float64      [float]


### 2.2.2 Data Handling

#### Handling Data Types and Sorting

In [11]:
# Parse the DATE_TIME strings into datetime values (explicit 'YYYY-MM-DD HH:MM:SS'
# format), then order each frame chronologically and renumber its index from 0.
power_generation_df = (
    power_generation_df
    .assign(DATE_TIME=lambda frame: pd.to_datetime(frame['DATE_TIME'], format='%Y-%m-%d %H:%M:%S'))
    .sort_values(by='DATE_TIME')
    .reset_index(drop=True)
)

weather_sensor_df = (
    weather_sensor_df
    .assign(DATE_TIME=lambda frame: pd.to_datetime(frame['DATE_TIME'], format='%Y-%m-%d %H:%M:%S'))
    .sort_values(by='DATE_TIME')
    .reset_index(drop=True)
)

#### Addressing Missing Values

In [12]:
# Add the ordinal day of the year, derived from the parsed DATE_TIME column
power_generation_df['day_of_year'] = power_generation_df['DATE_TIME'].dt.dayofyear

# Keep only the rows that have no Operating_Condition value, for closer inspection
missing_operating_condition_df = power_generation_df.loc[
    lambda frame: frame['Operating_Condition'].isna()
]
# NOTE(review): in the displayed rows of this subset, DATE_TIME runs from 2020-01-06
# to 2020-12-06 and `day` is 6, whereas the grouped output earlier spans
# 2020-05-15 .. 2020-06-17 -- possibly day and month are swapped for these rows
# in the source file. TODO confirm before relying on DATE_TIME for them.

In [13]:
# Show the rows that have no Operating_Condition value (bare expression, rendered as the cell output)
missing_operating_condition_df

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,day,Operating_Condition,day_of_year
0,2020-01-06 00:00:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,829.0,6377931.0,6,,6
1,2020-01-06 00:00:00,4135001,zVJPv84UY57bAof,0.0,0.0,0.0,7242024.0,6,,6
2,2020-01-06 00:00:00,4135001,zBIq5rxdHJRwDNY,0.0,0.0,0.0,6463239.0,6,,6
3,2020-01-06 00:00:00,4135001,z9Y9gH1T5YWrNuG,0.0,0.0,1575.0,7133897.0,6,,6
4,2020-01-06 00:00:00,4135001,wCURE6d3bPkepu2,0.0,0.0,0.0,6909405.0,6,,6
...,...,...,...,...,...,...,...,...,...,...
1021181,2020-12-06 23:45:00,4135001,1IF53ai7Xc0U56Y,0.0,0.0,5903.0,6398227.0,6,,341
1021182,2020-12-06 23:45:00,4135001,1BY6WEcLGh8j5v7,0.0,0.0,5441.0,6455679.0,6,,341
1021183,2020-12-06 23:45:00,4135001,zVJPv84UY57bAof,0.0,0.0,5863.0,7328550.0,6,,341
1021184,2020-12-06 23:45:00,4135001,ZoEaEvLYb1n2sOq,0.0,0.0,5753.0,7307385.0,6,,341


#### Data Ranges

## 2.3 Exploratory Data Analysis

### 2.3.1 Statistical Summary

### 2.3.2 Visualizations

### 2.3.3 Trend Analysis

### 2.3.4 Correlation Analysis

### 2.3.5 Pattern Identification

## 2.4 Feature Engineering

### 2.4.1 Feature Scaling

### 2.4.2 Feature Selection