# Data exploration

## Step 0: Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
pd.set_option('display.max_columns', 200)
from src.data.data_fetcher import get_all_features, get_raw_data
from src.features.feature_engineering import create_time_features_from_date

## Step 0.5: Reading data

In [None]:
# Load the raw data once and unpack the 12 returned objects by name.
# The original called get_raw_data() a second time just to populate
# `all_features`, which repeated the whole load for the same result.
raw_data = get_raw_data()
(
    targets_A, targets_B, targets_C,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = raw_data
# `all_features` now refers to the same objects as the names above instead of
# a second, separately loaded copy.
# NOTE(review): get_all_features is imported but never called - confirm
# whether it was the intended source for this variable.
all_features = raw_data

# Step 1: Data Understanding

### Compute statistics for each location

In [None]:
# Summary statistics (Series.describe) of the pv_measurement column,
# computed once per location.
stats_A, stats_B, stats_C = (
    frame['pv_measurement'].describe()
    for frame in (targets_A, targets_B, targets_C)
)

# Print each summary under its own heading, in A, B, C order.
for heading, location_stats in (
    ("Statistics for Location A:\n", stats_A),
    ("\nStatistics for Location B:\n", stats_B),
    ("\nStatistics for Location C:\n", stats_C),
):
    print(heading, location_stats)

## Trend Analysis
The code below plots the actual pv_measurement values together with their rolling-mean trend (computed over a 7-day window, i.e. 24*7 hourly samples) for Locations A, B and C, one figure per location. The shaded region represents the confidence intervals for the moving average. Each figure is produced by calling the plot_moving_average function with the respective location's dataset.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from src.visualization.plotting import plot_moving_average

# Rolling window of one week, expressed as 24 samples per day times 7 days.
WEEKLY_WINDOW = 24 * 7

# Target frame and figure title for each location, plotted in A, B, C order.
trend_inputs = (
    (targets_A, "Moving Average Trend for Location A"),
    (targets_B, "Moving Average Trend for Location B"),
    (targets_C, "Moving Average Trend for Location C"),
)

# One moving-average figure (with intervals) per location.
for location_targets, trend_title in trend_inputs:
    plot_moving_average(
        location_targets['pv_measurement'],
        window=WEEKLY_WINDOW,
        plot_intervals=True,
        title=trend_title,
    )




## Seasonality analysis
Identifying recurring patterns or cycles in the data. For hourly data, you might find daily or monthly seasonality.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf

HOURS_PER_DAY = 24
HOURS_PER_WEEK = 24 * 7

# Work on the targets loaded in "Step 0.5". The original re-read a placeholder
# path ('path_to_A_train_targets.csv'), which fails on a fresh run and would
# overwrite `targets_A` for every later cell.
pv_a = targets_A['pv_measurement']

# Decomposition. The period is passed explicitly (one day of hourly samples)
# so the call does not depend on the index carrying a frequency.
# NOTE(review): seasonal_decompose rejects missing values - confirm pv_a has
# none, or fill/drop them before this step.
result = seasonal_decompose(pv_a, model='additive', period=HOURS_PER_DAY)
result.plot()
plt.show()

# Seasonal plot for daily patterns.
seasonal = result.seasonal
if isinstance(seasonal.index, pd.DatetimeIndex):
    # Adjust dates to pick a representative 2-day period.
    daily_seasonal = seasonal['2022-01-01':'2022-01-02']
else:
    # Without a datetime index, date-string slicing is not available; fall
    # back to the first two days of samples by position.
    daily_seasonal = seasonal.iloc[:2 * HOURS_PER_DAY]
daily_seasonal.plot(figsize=(15,6))
plt.title('Daily Seasonal Pattern')
plt.show()

# Autocorrelation plot to identify seasonality (lags cover one week).
plot_acf(pv_a, lags=HOURS_PER_WEEK)
plt.title('Autocorrelation Plot')
plt.show()


## Cyclicity: 
Unlike seasonality, which happens at fixed known periods, cycles are fluctuations that are not of a fixed frequency.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Use the targets loaded in "Step 0.5". The original re-read a placeholder
# path ('path_to_A_train_targets.csv'), which fails on a fresh run and would
# overwrite `targets_A` for every later cell.
pv_a = targets_A['pv_measurement']

# Number of lags inspected by both correlation plots; raise it to look at
# longer periods.
CYCLE_LAGS = 500

# Visual Inspection
plt.figure(figsize=(15,6))
pv_a.plot()
plt.title('Time Series Plot for Visual Inspection of Cyclicity')
plt.show()

# Autocorrelation plot
plot_acf(pv_a, lags=CYCLE_LAGS)
plt.title('Autocorrelation Plot')
plt.show()

# Partial autocorrelation plot
plot_pacf(pv_a, lags=CYCLE_LAGS)
plt.title('Partial Autocorrelation Plot')
plt.show()


## Autocorrelation: 
It measures the relationship between a variable's current value and its past values. A lag plot or an autocorrelation function (ACF) plot can help in understanding this.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.graphics.tsaplots import plot_acf

# Use the targets loaded in "Step 0.5". The original re-read a placeholder
# path ('path_to_A_train_targets.csv'), which fails on a fresh run and would
# overwrite `targets_A` for every later cell.
pv_a = targets_A['pv_measurement']

# Plot the Autocorrelation Function.
# plot_acf opens its own figure unless it is given an Axes, so the original
# plt.figure(figsize=...) call only produced an extra empty figure and the
# requested size was never applied. Create the Axes here and pass it in.
fig, ax = plt.subplots(figsize=(15, 6))
plot_acf(pv_a, lags=168, ax=ax)  # 168 hours to check for weekly patterns
ax.set_title('Autocorrelation Function (ACF) Plot for Location A')
plt.show()


## Outlier Detection: 
Identifying unusual data points that might be errors or rare events. This can be done visually or with statistical methods.

In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from src.visualization.plotting import detect_outliers

# Column holding the measured values in each location's target frame.
PV_COLUMN = 'pv_measurement'

# Run outlier detection on the already-loaded targets, one call per location.
# The second argument is passed positionally.
# NOTE(review): the saved output of this cell is a TypeError about a `title`
# keyword that this code does not pass - it looks stale; re-run to confirm
# the positional form matches detect_outliers' signature.
outliers_a = detect_outliers(targets_A[PV_COLUMN], "Outliers for Location A")
outliers_b = detect_outliers(targets_B[PV_COLUMN], "Outliers for Location B")
outliers_c = detect_outliers(targets_C[PV_COLUMN], "Outliers for Location C")


TypeError: detect_outliers() got an unexpected keyword argument 'title'

## Distribution Analysis: 
Understanding the distribution of data can provide insights into its nature (e.g., normal vs. skewed, presence of heavy tails).

## Feature Importance: 
If using machine learning models, understanding which features (in this case, weather parameters) are most influential in predicting solar energy production.

## Residual Analysis: 
After fitting a model, analyzing the residuals (difference between predictions and actual values) can give insights into the model's accuracy and potential areas of improvement.

## Correlation with External Factors: 
Understanding how external factors, such as weather parameters, correlate with solar production. A heatmap or correlation matrix can be useful.

## Domain-Specific Insights: 
Since this data deals with solar energy production, domain knowledge about factors affecting solar panel efficiency, degradation over time, and other domain-specific considerations can be invaluable.


## Missing Data Analysis: 
Understanding if there are any missing data points, the reason for their absence, and deciding on strategies to handle them (e.g., interpolation, imputation).

In [None]:
# Frame inspected by the following cells.
df = X_train_observed_a

# `train_a` is only assigned in the last cell of the notebook, so using it
# here raised NameError on Restart & Run All. `targets_A` is the same first
# item of get_raw_data() and is already loaded in "Step 0.5".
dtest = targets_A

# Only a cell's last expression is rendered, so the shape is printed
# explicitly; the original bare `dtest.shape` was silently discarded.
print(dtest.shape)
dtest.head(20)

In [None]:
# Preview the first rows of df (DataFrame.head shows five rows by default).
df.head()

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Only the last expression of a cell is rendered, so the original bare
# `df.dtypes` line produced no output. Print the per-column dtypes explicitly
# and keep the date_forecast dtype as the cell's displayed result.
print(df.dtypes)
df["date_forecast"].dtype


In [None]:
# Descriptive statistics for the columns of df, as reported by DataFrame.describe.
df.describe()

# Step 2: Data preparation
- Dropping irrelevant columns and rows
- Identifying duplicated columns
- Renaming Columns
- Feature Creation

In [None]:
# Columns to display, in the order they should appear.
# NOTE(review): the selection is only displayed, not assigned back to df -
# confirm whether it was meant to drop the columns not listed here.
selected_columns = [
    'date_forecast',
    'absolute_humidity_2m:gm3',
    'air_density_2m:kgm3',
    'ceiling_height_agl:m',
    'clear_sky_energy_1h:J',
    'clear_sky_rad:W',
    'cloud_base_agl:m',
    'dew_or_rime:idx',
    'dew_point_2m:K',
    'diffuse_rad:W',
    'diffuse_rad_1h:J',
    'direct_rad:W',
    'direct_rad_1h:J',
    'effective_cloud_cover:p',
    'elevation:m',
    'fresh_snow_12h:cm',
    'fresh_snow_1h:cm',
    'fresh_snow_24h:cm',
    'fresh_snow_3h:cm',
    'fresh_snow_6h:cm',
    'is_day:idx',
    'is_in_shadow:idx',
    'msl_pressure:hPa',
    'precip_5min:mm',
    'precip_type_5min:idx',
    'pressure_100m:hPa',
    'pressure_50m:hPa',
    'prob_rime:p',
    'rain_water:kgm2',
    'relative_humidity_1000hPa:p',
    'sfc_pressure:hPa',
    'snow_density:kgm3',
    'snow_depth:cm',
    'snow_drift:idx',
    'snow_melt_10min:mm',
    'snow_water:kgm2',
    'sun_azimuth:d',
    'sun_elevation:d',
    'super_cooled_liquid_water:kgm2',
    't_1000hPa:K',
    'total_cloud_cover:p',
    'visibility:m',
    'wind_speed_10m:ms',
    'wind_speed_u_10m:ms',
    'wind_speed_v_10m:ms',
    'wind_speed_w_1000hPa:ms',
]
df.loc[:, selected_columns]

In [None]:
# Count the missing values in each column of df.
df.isna().sum()

In [None]:
# Flag rows whose date_forecast value already appeared in an earlier row
# (DataFrame.duplicated keeps the first occurrence unflagged by default),
# then show only the flagged rows.
is_repeated_forecast = df.duplicated(subset=["date_forecast"])
df.loc[is_repeated_forecast]

Adding month column to dataset:

In [None]:
# Rebind df to the output of create_time_features_from_date. Because the
# result overwrites its own input name, re-running this cell passes the
# already-processed frame back into the helper.
# NOTE(review): presumably adds month/time columns derived from the date
# column, per the heading above - confirm in src.features.feature_engineering.
df = create_time_features_from_date(df)


Adding season column


In [None]:
# Show the first 100 rows of df.
df.head(100)

# Data types, ranges, missing values and outliers
There is much we do not know about the data. We need to find out more about it. We need to know the data types, ranges, missing values and outliers. We will use the describe function to get a summary of the data.

In [None]:
# Reload the raw data once and unpack the 12 returned objects by name.
# The original called get_raw_data() a second time just to populate
# `all_features`, which repeated the whole load for the same result.
raw_data = get_raw_data()
(
    train_a, train_b, train_c,
    X_train_estimated_a, X_train_estimated_b, X_train_estimated_c,
    X_train_observed_a, X_train_observed_b, X_train_observed_c,
    X_test_estimated_a, X_test_estimated_b, X_test_estimated_c,
) = raw_data
# `all_features` now refers to the same objects as the names above instead of
# a second, separately loaded copy.
# NOTE(review): get_all_features is imported but never called - confirm
# whether it was the intended source for this variable.
all_features = raw_data

# Missing-value count per column of the estimated training features,
# one Series per location.
missing_a = X_train_estimated_a.isna().sum()
missing_b = X_train_estimated_b.isna().sum()
missing_c = X_train_estimated_c.isna().sum()
missing_a, missing_b, missing_c
