In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pgscen.engine import GeminiEngine
from pgscen.utils.data_utils import split_actuals_hist_future, split_forecasts_hist_future
from pathlib import Path
from pgscen.pca import PCAGeminiEngine, PCAGeminiModel
from pgscen.spca import SPCAGeminiEngine, SPCAGeminiModel
from pgscen.utils.solar_utils import get_yearly_date_range
import random

In [None]:
from pgscen.spca import spca

In [None]:
!ls ../data/MetaData

In [None]:
def load_solar_data(data_path):
    """Read the NREL solar actuals, day-ahead forecasts and site metadata.

    Parameters
    ----------
    data_path : str or path-like
        Root data directory. Files are looked up under
        ``Solar/NREL/Actual``, ``Solar/NREL/Day-ahead`` and ``MetaData``.

    Returns
    -------
    tuple
        ``(actuals, forecasts, metadata)``: the actuals frame indexed by its
        parsed ``Time`` column, the forecast frame with ``Issue_time`` and
        ``Forecast_time`` parsed as dates, and the metadata sheet as read by
        ``pd.read_excel``.
    """
    nrel_dir = Path(data_path, 'Solar', 'NREL')
    actual_csv = nrel_dir / 'Actual' / 'solar_actual_1h_site_2017_2018_utc.csv'
    forecast_csv = (nrel_dir / 'Day-ahead'
                    / 'solar_day_ahead_forecast_site_2017_2018_utc.csv')
    meta_xlsx = Path(data_path, 'MetaData', 'solar_meta.xlsx')

    actuals = pd.read_csv(actual_csv, index_col='Time', parse_dates=['Time'])
    forecasts = pd.read_csv(forecast_csv,
                            parse_dates=['Issue_time', 'Forecast_time'])
    site_meta = pd.read_excel(meta_xlsx)

    return actuals, forecasts, site_meta

In [None]:
# Run configuration: scenario count and the UTC timestamp at which the
# scenario horizon starts.
nscen = 1000
scen_start_time = pd.to_datetime('2018-02-07 06:00:00', utc=True)

In [None]:
# Load actuals, day-ahead forecasts and site metadata from the
# repository-relative data directory.
(solar_site_actual_df,
 solar_site_forecast_df,
 solar_meta_df) = load_solar_data('../data/')

In [None]:
# 24 hourly timestamps starting at scen_start_time (the scenario horizon).
# The step is passed as a Timedelta rather than the 'H' string alias, which
# recent pandas releases deprecate.
scen_timesteps = pd.date_range(start=scen_start_time, periods=24,
                               freq=pd.Timedelta(hours=1))

# Split the actuals around the scenario horizon; the helper returns the pair
# unpacked below as (hists, futures).
(solar_site_actual_hists,
            solar_site_actual_futures) = split_actuals_hist_future(
                    solar_site_actual_df, scen_timesteps)

# Same split for the day-ahead forecasts.
(solar_site_forecast_hists,
            solar_site_forecast_futures) = split_forecasts_hist_future(
                    solar_site_forecast_df, scen_timesteps)

In [None]:
# Historical dates returned by get_yearly_date_range for the scenario start,
# bounded by the first and last calendar day present in the actual history.
# The dates are sorted and the final entry is dropped.
hist_dates = sorted(get_yearly_date_range(date=scen_start_time,num_of_days=50,
                      start=str(solar_site_actual_hists.index.min().date()),
                      end=str(solar_site_actual_hists.index.max().date())))[:-1]

# Issue times are taken to be 6 hours before each historical date. The offset
# uses the `hours=` keyword instead of unit='H', which recent pandas releases
# deprecate, and is built once rather than per element.
issue_lead = pd.Timedelta(hours=6)
hist_fcst_issue_times = [t - issue_lead for t in hist_dates]

In [None]:
# Keep only the forecasts issued at one of the selected historical issue times.
solar_site_forecast_hists = solar_site_forecast_hists.loc[
    lambda fcst: fcst['Issue_time'].isin(hist_fcst_issue_times)
]

# Trim the actuals to the time span covered by the retained forecasts
# (both ends inclusive).
hist_start = solar_site_forecast_hists['Forecast_time'].min()
hist_end = solar_site_forecast_hists['Forecast_time'].max()
solar_site_actual_hists = solar_site_actual_hists.loc[
    lambda act: (act.index >= hist_start) & (act.index <= hist_end)
]

In [None]:
# Build the PCA-based engine from the filtered history and fit it.
pge = PCAGeminiEngine(
    solar_site_actual_hists,
    solar_site_forecast_hists,
    scen_start_time,
    solar_meta_df,
)
dist = pge.asset_distance().values
# The second argument is the distance matrix divided by ten times its maximum.
# NOTE(review): the meaning of the 10 and 5e-2 arguments is defined by
# PCAGeminiEngine.fit - confirm there.
pge.fit(10, dist / (10 * dist.max()), 5e-2)

In [None]:
# Generate scenarios for the forecast horizon. The count comes from the
# notebook-level `nscen` setting rather than a repeated literal, so changing
# the configuration cell changes every run.
pge.create_scenario(nscen, solar_site_forecast_futures)
# Display the model's scenario frame rounded to two decimals.
pge.model.scen_df.round(2)

In [None]:
# pge.model.asset_cov.to_csv('/Users/xy3134/Research/PERFORM/notebooks/NREL/solar_pca/data/asset_cov.csv',index=False)

In [None]:
# Write the scenarios, together with the actuals for the horizon, to CSV.
# NOTE(review): machine-specific absolute output directory - adjust before
# running elsewhere.
save_dir = '/Users/xy3134/Research/PERFORM/Data/Outputs/PGscen/PCA'

pge.write_to_csv(
    save_dir,
    solar_site_actual_futures,
    write_forecasts=True,
)

In [None]:
# Read back one site's output file from the run written above.
# The day folder is derived from scen_start_time instead of a hard-coded
# '20180102', which did not match the configured start date (2018-02-07) and
# so pointed at a different run.
# NOTE(review): assumes write_to_csv lays files out as
# <save_dir>/<YYYYMMDD of scen_start_time>/solar/<site>.csv - confirm.
datadir = str(Path(save_dir, scen_start_time.strftime('%Y%m%d'), 'solar')) + '/'
pd.read_csv(datadir+'Adamstown_Solar.csv')

In [None]:
# Display the engine's scenario table stored under the 'solar' key.
pge.scenarios['solar']

In [None]:
# Plot one randomly chosen scenario for four sites.
# The row label is drawn from the actual number of rows in the scenario table
# rather than a hard-coded 1000, so it stays valid if the scenario count
# changes. Rows are assumed to be labelled 0..n-1, as the original lookup did.
idx = random.sample(range(len(pge.scenarios['solar'])), 1)[0]

asset_list = ['solar288','solar289','solar290','solar291']


plt.figure(figsize=(16,8))
for (i,asset) in enumerate(asset_list):
    # Scenario columns are keyed by (asset, timestamp) pairs.
    cols = [(asset,ts) for ts in pge.scen_timesteps]
    df = pge.scenarios['solar'][cols]
    plt.subplot(2,2,i+1)
    plt.plot(df.loc[idx].values)
    # Label each panel so the figure can be read on its own.
    plt.title('{} (scenario {})'.format(asset, idx))

In [None]:
# Show the sampled scenario row. `df` is whatever the plotting loop assigned
# last, i.e. the columns of the final asset in asset_list.
df.loc[idx]

In [None]:
# Display the fitted model's `horizon_cov` attribute.
pge.model.horizon_cov

In [None]:
# Turn the fitted asset covariance into a correlation matrix:
# corr[i, j] = cov[i, j] / (std[i] * std[j]).
cov = pge.model.asset_cov.values
# Scale rows and columns by broadcasting the inverse standard deviations,
# instead of building two dense diagonal matrices and doing two matrix
# products.
inv_std = 1 / np.sqrt(np.diag(cov))
corr = inv_std[:, None] * cov * inv_std[None, :]
# The covariance frame's column labels are reused for both axes.
corr_df = pd.DataFrame(data=corr,index=pge.model.asset_cov.columns,columns=pge.model.asset_cov.columns)

In [None]:
# Display the asset correlation matrix.
corr_df

In [None]:
# Correlation sub-matrix restricted to the plotted assets (rows and columns).
corr_df.loc[asset_list, asset_list]

In [None]:
# Plot one row of the fitted PCA components for each selected asset.
# `ncomp` is used as a zero-based row index into components_.
ncomp = 3

plt.figure(figsize=(16, 8))
for panel, asset in enumerate(asset_list, start=1):
    plt.subplot(2, 2, panel)
    asset_pca = pge.model.pca_dict[asset]['pca']
    plt.plot(asset_pca.components_[ncomp, :])

# Test SparsePCA

In [None]:
from sklearn.decomposition import SparsePCA

In [None]:
# Stack the historical deviations into one sample matrix with 24 columns:
# each consecutive group of 24 columns in hist_dev_df contributes a block of
# n rows, and the blocks are placed one under another.
df = pge.model.hist_dev_df

n = df.shape[0]
n_horizon = 24
# Derive the number of 24-column groups from the frame width instead of
# hard-coding 226, and fail loudly if the width is not a whole number of
# groups.
n_assets, leftover = divmod(df.shape[1], n_horizon)
if leftover:
    raise ValueError(
        'hist_dev_df has {} columns, which is not a multiple of {}'.format(
            df.shape[1], n_horizon))

# reshape -> (row, group, step); transpose -> (group, row, step); the final
# reshape puts group i's rows at arr[i*n:(i+1)*n], matching the slice-by-slice
# copy this replaces.
arr = (np.asarray(df.values, dtype=float)
       .reshape(n, n_assets, n_horizon)
       .transpose(1, 0, 2)
       .reshape(n_assets * n, n_horizon))

# Centre every column.
arr = arr-np.mean(arr,axis=0)

In [None]:
# Fit a 10-component sparse PCA on the stacked matrix and keep the
# transformed samples. ridge_alpha=0. turns off the ridge shrinkage that
# SparsePCA applies in its transform step; random_state pins the solver.
pca = SparsePCA(
    n_components=10,
    ridge_alpha=0.,
    random_state=0,
)
Y = pca.fit_transform(arr)

In [None]:
# Shape of the transformed samples returned by fit_transform.
Y.shape

In [None]:
# One figure per sparse component (each row of components_).
for component in pca.components_:
    plt.figure()
    plt.plot(component)

In [None]:
# Inner products of the first centred sample with each component row.
pca.components_@arr[0,:]

In [None]:
# pca.components_@pca.components_.T

In [None]:
# Solve (C C^T) x = Y[0] with C = pca.components_. np.linalg.solve is used
# rather than forming the explicit inverse and multiplying, which is less
# accurate and does more work for the same result.
gram = pca.components_ @ pca.components_.T
np.linalg.solve(gram, Y[0, :])

In [None]:
# Least-squares coefficients expressing the first centred sample in terms of
# the component rows. rcond=None selects NumPy's current default cutoff
# explicitly and avoids the FutureWarning that NumPy < 2 emits when rcond is
# omitted.
np.linalg.lstsq(pca.components_.T, arr[0,:], rcond=None)

In [None]:
# Inverse of the Gram matrix of the component rows.
np.linalg.inv(pca.components_ @ pca.components_.T)

In [None]:
# Linear combination of the component rows weighted by the first sample's
# transformed values; compare with arr[0, :].
pca.components_.T@Y[0,:]

In [None]:
# First row of the centred sample matrix.
arr[0,:]

In [None]:
# Dump the centred sample matrix to CSV without the row index.
# NOTE(review): machine-specific absolute path - adjust before running
# elsewhere.
arr_frame = pd.DataFrame(arr)
arr_frame.to_csv('/Users/xy3134/Desktop/temp_files/test.csv', index=False)

# Sparse PCA

In [None]:
# Build the sparse-PCA engine from the same filtered history and fit it with
# the same arguments as the PCA engine.
sge = SPCAGeminiEngine(
    solar_site_actual_hists,
    solar_site_forecast_hists,
    scen_start_time,
    solar_meta_df,
)
dist = sge.asset_distance().values
# The second argument is the distance matrix divided by ten times its maximum.
# NOTE(review): the meaning of the 10 and 5e-2 arguments is defined by
# SPCAGeminiEngine.fit - confirm there.
sge.fit(10, dist / (10 * dist.max()), 5e-2)

In [None]:
# Generate scenarios with the sparse-PCA engine. The count comes from the
# notebook-level `nscen` setting rather than a repeated literal.
sge.create_scenario(nscen, solar_site_forecast_futures)
# Display the model's scenario frame rounded to two decimals.
sge.model.scen_df.round(2)

In [None]:
# Scatter row i of the PCA components against column i of the sparse-PCA
# loadings, one figure per index 0..9.
# NOTE(review): this reads `pge.model.pca`, whereas the per-asset component
# plot reads `pge.model.pca_dict[asset]['pca']` - confirm which attribute the
# fitted model exposes.
# NOTE(review): assumes sge.model.pca is a mapping whose 'loadings' entry has
# one column per component - confirm.
for i in range(10):
    plt.figure()
    plt.plot(pge.model.pca.components_[i,:],sge.model.pca['loadings'][:,i],'o')