# Research Thesis 

## Preprocessing of PV output data

Neil Leiser - Nowcasting solar PV using satellite images

In [None]:
import xarray as xr
import matplotlib.pyplot as plt
import numpy as np
from itertools import product
import pandas as pd
import os
import time
from datetime import timedelta
import datetime
import copy

## Load data + metadata

In [None]:
# PV power readings: the NetCDF file is opened with xarray and converted to a
# pandas DataFrame.
pv_data = (
    xr.open_dataset('/Users/SeanNassimiha/Desktop/MSc CSML/MSc Project/Data/passiv.netcdf')
    .to_dataframe()
)

# Per-system metadata, indexed by the CSV's 'ss_id' column.
pv_metadata = pd.read_csv(
    '/Users/SeanNassimiha/Desktop/MSc CSML/MSc Project/Data/system_metadata_location_rounded.csv',
    index_col='ss_id',
)

# Discard every system that lacks either of its rounded coordinates.
pv_metadata.dropna(how='any', subset=['latitude_rounded', 'longitude_rounded'], inplace=True)

# Cell output: shape of the metadata index once NaN labels are excluded.
pv_metadata.index.dropna().shape



In [None]:
# Line plot of the first column, restricted to the rows labelled 2018 through
# 2019 (both years included).
pv_data.loc['2018':'2019'].iloc[:, 0].plot()
plt.title(label='PV Output in kW for one Farm over Years 2018-2019')
plt.ylabel(ylabel='Kilowatts production')

## Preprocessing

Split:

- Train: 2018-01-01 06:05:00 to 2020-01-01 06:05:00
- Validation: 2020-01-01 06:10:00 to 2021-01-01 06:05:00
- Test: 2021-01-01 06:10:00 to 2021-10-27 23:55:00


Comments:

I need to prepare three datasets: training, validation, and test. Each of the three must contain the same solar farms (or, at least, the set of solar farms in the training dataset must contain those of the other two). The validation and test datasets must also be scaled with the same scaling factors as the training data.

In [None]:
# SPLIT: three consecutive, non-overlapping time windows selected by row label
# (both ends of each slice are included).
pv_power_selected = pv_data.loc['2018-01-01 06:05:00':'2020-01-01 06:05:00']
pv_power_validation = pv_data.loc['2020-01-01 06:10:00':'2021-01-01 06:05:00']
pv_power_test = pv_data.loc['2021-01-01 06:10:00':'2021-10-27 23:55:00']

# Report the share of all rows that falls into each window, in percent.
print(f'Thus the split is {100 * len(pv_power_selected) / len(pv_data)} - {100 * len(pv_power_validation) / len(pv_data)} - {100 * len(pv_power_test) / len(pv_data)}')
            

### Clip values

In [None]:
# Remove the columns that hold no reading at all in the training window, then
# bound every remaining value to the interval [0, 5E7].
pv_power_df = (
    pv_power_selected
    .dropna(how='all', axis='columns')
    .clip(lower=0, upper=5E7)
)

# Convert the column labels to numpy int64 so that they can be matched against
# the metadata index later on.
pv_power_df.columns = list(map(np.int64, pv_power_df.columns))

### Align PV data with metadata

In [None]:
# Only pick PV systems for which we have good metadata
def align_pv_system_ids(pv_metadata, pv_power_df):
    """Restrict both tables to the PV system ids they have in common.

    Args:
        pv_metadata: DataFrame whose index holds PV system ids.
        pv_power_df: DataFrame whose columns hold PV system ids.

    Returns:
        A ``(pv_metadata, pv_power_df)`` tuple in which the metadata rows and
        the power columns cover the same ids, sorted in ascending order.
    """
    # Ids that appear both as a metadata row label and as a power column.
    shared_ids = np.sort(pv_metadata.index.intersection(pv_power_df.columns))
    return pv_metadata.loc[shared_ids], pv_power_df[shared_ids]
    
# Keep only the systems that appear both in the metadata index and among the
# power columns; both names are rebound to the aligned tables.
pv_metadata, pv_power_df = align_pv_system_ids(pv_metadata, pv_power_df)

# Cell output: display the aligned power table.
pv_power_df

### Rescaling to 0-1

In [None]:
# Per-system extrema of the training data. The maxima are reused further down
# to scale the validation set with the same factors.
pv_power_max = pv_power_df.max(axis=0)
pv_power_min = pv_power_df.min(axis=0)

# Scale towards the range [0, 1]: each column is divided, in place, by its own
# maximum, so a column with a positive maximum peaks at exactly 1.
pv_power_df /= pv_power_max

### Drop systems which are producing over night

In [None]:
# Drop systems which are producing over night
NIGHT_YIELD_THRESHOLD = 0.4
# Hours of the day treated as night: 21, 22, 23, 0, 1, 2 and 3.
night_hours = [*range(21, 24), *range(0, 4)]

# For every system, count the night-time readings above the threshold.
is_night_row = pv_power_df.index.hour.isin(night_hours)
night_exceedances = (pv_power_df.loc[is_night_row] > NIGHT_YIELD_THRESHOLD).sum()

# A system with at least one such reading is considered bad.
bad_systems = pv_power_df.columns[np.flatnonzero(night_exceedances.values)]
print(len(bad_systems), 'bad systems found.')

pv_power_df.drop(columns=bad_systems, inplace=True)

### Aligning again

In [None]:
# Align again, after removing dud PV systems: the metadata is reduced to the
# systems that are still present among the power columns. The results are
# stored under new `_5min` names; `pv_metadata` and `pv_power_df` keep their
# current values.
pv_metadata_5min, pv_power_df_5min = align_pv_system_ids(pv_metadata, pv_power_df)

### Remove night data (Analysis between 8am - 4pm)

In [None]:
day_max = 16
day_min = 8
# The discarded "night" period starts at day_max o'clock and ends at day_min
# o'clock; rows stamped exactly at either boundary are kept.
end_night = datetime.time(day_min, 0)
start_night = datetime.time(day_max, 0)

# Timestamps whose time of day lies within [end_night, start_night].
day_index = [
    stamp
    for stamp in pv_power_df_5min.index
    if end_night <= stamp.time() <= start_night
]
pv_power_df_5day = pv_power_df_5min.loc[day_index]



### Analysis of nans

In [None]:
# Maximum tolerated share of missing readings per system, in percent.
threshold = 20

# A reading counts as missing when `value >= 0` does not hold, which is the
# case for NaN. Counting is done column-wise in one vectorised pass instead of
# a Python loop over every element.
invalid_counts = (~(pv_power_df_5day >= 0)).sum()
count_nan = invalid_counts.tolist()

# Percentage of missing readings per system. With an empty frame the division
# yields NaN (never above the threshold) instead of raising ZeroDivisionError.
invalid_pct = invalid_counts / len(pv_power_df_5day) * 100
few_data_systems = invalid_pct[invalid_pct > threshold].index.tolist()

# Optional diagnostic histogram of the missing-data percentages (disabled):
# import matplotlib as mpl
# %matplotlib inline
# dpi = 120
# mpl.rcParams['figure.dpi']= dpi

# fig = plt.figure(figsize=(10,5))
# plt.hist(np.array(count_nan)/len(pv_power_df_5day)*100,bins=25,color='#0504aa',rwidth=0.85)
# plt.xlabel('Percentage NaN values [%]')
# plt.ylabel('PV systems count')
# plt.grid(axis='y', alpha=0.3)
# plt.show()

# `columns=` replaces the positional `axis` argument (`drop(labels, 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
pv_power_df_5day = pv_power_df_5day.drop(columns=few_data_systems)
pv_power_df_5day


### Replace Nans by interpolation

In [None]:
# Fill the remaining NaNs by linear interpolation down each column;
# limit_direction='both' also fills NaNs at the very start and end of a column.
pv_power_df_5day = pv_power_df_5day.interpolate(
    axis=0,
    method='linear',
    limit_direction='both',
)

# Visual check on the columns at positions 30 to 49.
pv_power_df_5day.iloc[:, 30:50].plot(figsize=(15, 10))

### Convert to csv

In [None]:
# pv_power_df_5day.to_csv('/Users/SeanNassimiha/Desktop/MSc CSML/MSc Project/Data/pv_power_df_5day.csv')

## Now doing the same for the validation set (not the test set yet)

In [None]:
# Validation set: same preprocessing steps as for the training set, restricted
# to the systems that survived the training preprocessing.

# Get the columns kept for training. The labels are converted to strings for
# the lookup in the raw validation frame.
pv_power_val = pv_power_validation[pv_power_df_5day.columns.values.astype('str')]

# Clip values
pv_power_df_val = pv_power_val.dropna(axis='columns', how='all')
pv_power_df_val = pv_power_df_val.clip(lower=0, upper=5E7)
pv_power_df_val.columns = [np.int64(col) for col in pv_power_df_val.columns]

# Align with the metadata
pv_metadata_val, pv_power_df_val = align_pv_system_ids(pv_metadata, pv_power_df_val)

# Rescale by the per-system maxima of the TRAINING data, so that both sets
# share the same scaling (validation values may therefore exceed 1).
pv_power_df_val /= pv_power_max

# Drop systems which are producing over night
bad_systems_val = np.where((pv_power_df_val[pv_power_df_val.index.hour.isin(night_hours)] > NIGHT_YIELD_THRESHOLD).sum())[0]
bad_systems_val = pv_power_df_val.columns[bad_systems_val]
print(len(bad_systems_val), 'bad systems found.')

pv_power_df_val.drop(bad_systems_val, axis='columns', inplace=True)

# Align again. The metadata goes into its own `_val` name: the original
# assigned it to `pv_metadata_5min`, silently overwriting the training
# metadata computed earlier.
pv_metadata_val_5min, pv_power_df_val_5min = align_pv_system_ids(pv_metadata_val, pv_power_df_val)

# Remove night: keep the rows whose time of day lies within
# [end_night, start_night], boundaries included.
day_index = [
    time_period
    for time_period in pv_power_df_val_5min.index
    if end_night <= time_period.time() <= start_night
]
pv_power_df_val_5day = pv_power_df_val_5min.loc[day_index]

# Remove systems with too many missing readings. A reading counts as missing
# when `value >= 0` does not hold, which is the case for NaN; the count is
# vectorised instead of looping over every element in Python.
invalid_counts_val = (~(pv_power_df_val_5day >= 0)).sum()
count_nan = invalid_counts_val.tolist()

# With an empty frame the division yields NaN (never above the threshold)
# instead of raising ZeroDivisionError.
invalid_pct_val = invalid_counts_val / len(pv_power_df_val_5day) * 100
few_data_systems = invalid_pct_val[invalid_pct_val > threshold].index.tolist()

# `columns=` replaces the positional `axis` argument (`drop(labels, 1)`),
# which pandas deprecated in 1.3 and no longer accepts from 2.0 onwards.
pv_power_df_val_5day = pv_power_df_val_5day.drop(columns=few_data_systems)

# Interpolate NaNs linearly down each column, in both directions.
pv_power_df_val_5day = pv_power_df_val_5day.interpolate(method='linear', axis=0, limit_direction='both')
pv_power_df_val_5day.iloc[:, 30:50].plot(figsize=(15, 10))


# pv_power_df_val_5day.to_csv('/Users/SeanNassimiha/Desktop/MSc CSML/MSc Project/Data/pv_power_df_5day_validation.csv')