In [None]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import itertools
import json
import matplotlib.pyplot as plt

## Read Original Data

In [None]:
# Load the raw table (first CSV column becomes the index) and discard the "date" column.
df = pd.read_csv("./data/us_renewable_energy_data.csv", index_col=0).drop(columns=["date"])

# Split out the solar, wind and geo column groups by substring match on the column name.
solar_columns, wind_columns, geo_columns = (
    df.filter(like=pattern) for pattern in ("solar.1", "wind", "geo")
)

# Keep only those three groups, laid out solar -> wind -> geo.
df = pd.concat([solar_columns, wind_columns, geo_columns], axis=1)

# Min-max scale every column independently (scaler defaults).
# The result gets a fresh RangeIndex; only the column labels are carried over.
scaler = MinMaxScaler()
df_normalized = pd.DataFrame(
    scaler.fit_transform(df),
    columns=df.columns,
)

## Make covariance matrices

In [None]:

# Number of columns drawn from each energy source for one matrix.
N = 4

# One lazy iterator of N-column combinations per source.
# These are single-use: once consumed they have to be rebuilt by re-running this cell.
solarNColumns, windNColumns, geoNColumns = (
    itertools.combinations(source.columns, N)
    for source in (solar_columns, wind_columns, geo_columns)
)

def matrix_generator(limit=1000, n=None, data=None, column_groups=None):
    """Yield covariance matrices built from per-source column combinations.

    For every source group the n-column combinations are enumerated in
    ``itertools.combinations`` order, and the groups are advanced in lockstep
    (``zip``): the k-th matrix uses the k-th combination of each group, so
    iteration stops as soon as the group with the fewest combinations runs out.

    The combinations are rebuilt on every call, so calling the generator again
    yields the same matrices instead of continuing from (or finding empty)
    shared, already-consumed iterators.

    Parameters
    ----------
    limit : int or None, default 1000
        Maximum number of matrices to yield; ``None`` means no cap.
    n : int, optional
        Number of columns taken from each group. Defaults to the notebook-level ``N``.
    data : pandas.DataFrame, optional
        Frame the selected columns are read from. Defaults to the notebook-level
        ``df_normalized``.
    column_groups : sequence of sequences of column labels, optional
        One entry per source, in the order the columns should appear in the matrix.
        Defaults to the wind, geo and solar column labels (in that order).

    Yields
    ------
    list[list[float]]
        The covariance matrix of the selected columns as nested lists
        (``len(column_groups) * n`` rows and columns), ready for JSON serialisation.
    """
    if n is None:
        n = N
    if data is None:
        data = df_normalized
    if column_groups is None:
        column_groups = (wind_columns.columns, geo_columns.columns, solar_columns.columns)

    # Fresh iterators per call keep the generator repeatable.
    per_group = [itertools.combinations(labels, n) for labels in column_groups]

    # Process at most `limit` lockstep combinations.
    for picks in itertools.islice(zip(*per_group), limit):
        # n columns from each group, concatenated in group order.
        selected_columns = [label for pick in picks for label in pick]
        # Convert to plain lists immediately so the result is JSON-serialisable.
        yield data[selected_columns].cov().values.tolist()


# Write all generated matrices to the JSON file in a single dump.
with open(f"GeoWindSunEquityMatrices{N*3}by{N*3}.json", "w") as out_file:
    matrices = list(matrix_generator())
    json.dump(matrices, out_file)

## Data visualization

In [None]:

# Pairwise correlation matrices, computed once and reused by every plot below.
solar_corr = solar_columns.corr()
wind_corr = wind_columns.corr()

# NOTE(review): `combined_df` was used below without being defined anywhere in the
# notebook (NameError on a fresh kernel). It is assumed to be the solar and wind
# columns side by side, based on the "Solar + Wind" legend label -- confirm.
combined_df = pd.concat([solar_columns, wind_columns], axis=1)
combined_corr = combined_df.corr()

# Flattened correlation values used by the distribution plots.
solar_corr_values = solar_corr.values.flatten()
wind_corr_values = wind_corr.values.flatten()
combined_corr_values = combined_corr.values.flatten()

# Heatmaps of the correlation matrices
plt.figure(figsize=(10, 8))
sns.heatmap(solar_corr, annot=False, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix Heatmap (Solar)")
plt.show()

plt.figure(figsize=(10, 8))
sns.heatmap(wind_corr, annot=False, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix Heatmap (Wind)")
plt.show()

# Mean of all entries of each correlation matrix.
# The variable names are historical: these hold mean correlations, not covariance sums.
solar_cov_sum = solar_corr.values.mean()
wind_cov_sum = wind_corr.values.mean()

# Plot the means
plt.figure(figsize=(6, 4))
plt.bar(['Solar', 'Wind'], [solar_cov_sum, wind_cov_sum], color=['orange', 'blue'])
plt.ylabel('Mean Correlation')
plt.title('Mean Correlation for Solar and Wind Columns')
plt.show()

# Box plot for the distribution of correlation values
plt.figure(figsize=(10, 6))
sns.boxplot(data=[solar_corr_values, wind_corr_values], palette=['orange', 'blue'])
plt.xticks([0, 1], ['Solar', 'Wind'])
# Correlations are on the y-axis here; limiting x to (-1, 1) would clip the "Wind" box at x=1.
plt.ylim(-1, 1)
plt.title('Box Plot of Correlation Values for Solar and Wind Columns')
plt.show()

# Filled density (PDF) plots of the correlation values for solar, wind and both combined
plt.figure(figsize=(10, 6))
sns.kdeplot(solar_corr_values, color='orange', label='Solar', fill=True)
sns.kdeplot(wind_corr_values, color='blue', label='Wind', fill=True)
sns.kdeplot(combined_corr_values, color="green", label="Solar + Wind", fill=True)

plt.xlim(-1, 1)
plt.title('PDF of Correlation Values for Solar and Wind Columns')
plt.legend()
plt.show()