In [1]:
import os
import pandas as pd
import json
import datetime

In [2]:
def createDayDataframe(filename):
    path = "tongsdata/"
    filedate = filename.split(".")[0].split("-")

    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())

    #  Create X DataFrame
    todayData = {time: data_json[time]['currently'] for time in data_json}
    todayData_DF = pd.DataFrame(todayData).T

    #  Create y DataFrame
    energyBalanceFilename = "energyBalance/Energy_Balance_{}_{}_{}.csv".format(filedate[0], filedate[1], filedate[2]) #change due to json filename
    energyBalance = pd.read_csv(path + energyBalanceFilename, delimiter=";")

    energyBalance_DF = pd.DataFrame(index=energyBalance.iloc[:-1,0].apply(lambda x: x[2:-1]))
    energyBalance_DF['PV Power Generation'] = energyBalance['PV power generation / Mean values [W]  '].values[:-1]
    energyBalance_DF = energyBalance_DF[energyBalance_DF != ' ']
    energyBalance_DF = energyBalance_DF.dropna()
    
    #  Join X y DataFrame
    data = todayData_DF.join(energyBalance_DF)
    return data

def cleanDataframe(masterDF):
    masterData_cleaned = masterDF.reset_index()
    masterData_cleaned = masterData_cleaned.rename(columns={'index': 'time_of_day'})
    masterData_cleaned = masterData_cleaned[masterData_cleaned['PV Power Generation'].notna()]
    masterData_cleaned['month'] = masterData_cleaned['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
    masterData_cleaned['cloudCover'] = masterData_cleaned['cloudCover'].fillna(value=masterData_cleaned['cloudCover'].mean())
    masterData_cleaned['PV Power Generation'] = masterData_cleaned['PV Power Generation'].apply(lambda x: x.replace(",", "")).astype('float')
    return masterData_cleaned

In [3]:
def featureEngineering(masterDF):
    masterDF = masterDF.drop(columns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure'])
    masterDF = pd.get_dummies(masterDF, columns=['time_of_day', 'precipType', 'uvIndex', 'month']) 
    return masterDF

In [4]:
files = os.listdir("tongsdata/")

#files.remove(".DS_Store")
files.remove("energyBalance")

masterData = pd.DataFrame()

for file in files:
    r = createDayDataframe(file).loc['06:30':'18:00']
    print(file, r.shape)
    masterData = pd.concat([masterData, r], axis=0)
    
masterData = cleanDataframe(masterData)
masterData = featureEngineering(masterData)

2018-10-01.json (47, 15)
2018-10-02.json (47, 14)
2018-10-03.json (47, 14)
2018-10-04.json (47, 15)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


2018-10-05.json (47, 15)
2018-10-06.json (47, 15)
2018-10-07.json (47, 15)
2018-10-08.json (47, 15)
2018-10-09.json (47, 15)
2018-10-10.json (47, 15)
2018-10-11.json (47, 15)
2018-10-12.json (47, 15)
2018-10-13.json (47, 14)
2018-10-14.json (47, 15)
2018-10-15.json (47, 15)
2018-10-16.json (47, 15)
2018-10-17.json (47, 14)
2018-10-18.json (47, 14)
2018-10-19.json (47, 15)
2018-10-20.json (47, 15)
2018-10-21.json (47, 15)
2018-10-22.json (47, 15)
2018-10-23.json (47, 15)
2018-10-24.json (47, 14)
2018-10-25.json (47, 15)
2018-10-26.json (47, 15)
2018-10-27.json (47, 15)
2018-10-28.json (47, 15)
2018-10-29.json (47, 15)
2018-10-30.json (47, 15)
2018-10-31.json (47, 15)
2018-11-01.json (47, 15)
2018-11-02.json (47, 13)
2018-11-03.json (47, 15)
2018-11-04.json (47, 14)
2018-11-05.json (47, 14)
2018-11-06.json (47, 15)
2018-11-07.json (47, 14)
2018-11-08.json (47, 15)
2018-11-09.json (47, 14)
2018-11-10.json (47, 15)
2018-11-11.json (47, 14)
2018-11-12.json (47, 15)
2018-11-13.json (47, 14)


In [5]:
features = list(masterData.drop(columns=['PV Power Generation']).columns)

In [6]:
masterData = masterData.dropna()

X = masterData.drop(columns=['PV Power Generation']).astype('float').values
y = masterData['PV Power Generation'].astype('float').values


In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

training_features, testing_features, training_target, testing_target = train_test_split(X, y)

# Average CV score on the training set was:-91867.4890210794
exported_pipeline = RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=1, min_samples_split=7, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
print(mean_squared_error(testing_target, results), 
mean_absolute_error(testing_target, results), 
r2_score(testing_target, results))


105518.81825667394 217.15825978698908 0.8265820660332565


In [9]:
def pvpredict(predict_date) :
    path = "tongsdata/"
    filename = predict_date.replace("_",'-') + '.json'

    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())

    todayData = {time: data_json[time]['currently'] for time in data_json}
    todayData_DF = pd.DataFrame(todayData).T
    todayData_DF.index.names=['time_of_day']
    todayData_DF = todayData_DF.loc['06:30':'18:00']
    todayData_DF = todayData_DF.reset_index()
    todayData_DF['month'] = todayData_DF['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
    todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
    dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure']
    for i in dropcolumns:
        if i in todayData_DF.columns:
            todayData_DF = todayData_DF.drop(columns=[i])
    todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])
    
    masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
    for i in range(len(masterData_for_predict.columns)) : #drop power balance
        if masterData_for_predict.columns[i] not in todayData_DF.columns:
            todayData_DF.insert(i, masterData.columns[i+1], 0)
    missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index
    import math
    for i in missingvalue:
        for j in todayData_DF.columns:
            if math.isnan(todayData_DF.loc[i,j]) :
                todayData_DF.loc[i,j] = (todayData_DF.loc[i-1,j]+todayData_DF.loc[i+1,j])/2
    first_predict = todayData_DF.astype('float').values
    
    results_first_predict = exported_pipeline.predict(first_predict)
    results_first_predict = np.array(results_first_predict).tolist()
    for i in range(25):
        results_first_predict.insert(0,0.0)
    for j in range (24):
        results_first_predict.append(0.0)
    return results_first_predict

In [10]:
pvpredict('2018_11_01')

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 506.2238333333333,
 469.73016666666683,
 508.63799999999986,
 594.4198333333333,
 737.8975000000002,
 986.4568333333333,
 983.5435000000001,
 1223.1914999999997,
 1475.5328333333323,
 1654.0153333333326,
 1877.610833333333,
 2094.1028333333334,
 2149.6083333333336,
 2141.559333333333,
 1993.9694999999995,
 1951.401333333333,
 1660.2973333333327,
 1659.7158333333327,
 1453.4845,
 1208.7473333333337,
 1191.3310000000004,
 1166.2571666666672,
 1173.475666666667,
 1202.3148333333331,
 1211.9375,
 1262.9864999999998,
 1155.5171666666668,
 1243.1585000000005,
 1405.8004999999998,
 1427.713833333333,
 1477.2316666666663,
 1410.5258333333331,
 1394.4878333333334,
 1201.9075000000003,
 1267.255,
 1269.1641666666667,
 1081.9695000000002,
 936.3208333333333,
 1104.1408333333338,
 1008.6811666666669,
 800.328,
 835.8183333333332,
 740.681,
 1286.036

In [11]:
len(todayData_DF.index)

NameError: name 'todayData_DF' is not defined

In [12]:
predict_date = '2018_11_01'
path = "tongsdata/"
filename = predict_date.replace("_",'-') + '.json'

with open(path + filename, 'r', encoding='utf-8') as fin:
    data_json = json.loads(fin.read())

todayData = {time: data_json[time]['currently'] for time in data_json}
todayData_DF = pd.DataFrame(todayData).T
todayData_DF.index.names=['time_of_day']
todayData_DF = todayData_DF.loc['06:30':'18:00']
todayData_DF = todayData_DF.reset_index()
todayData_DF['month'] = todayData_DF['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure']
for i in dropcolumns:
    if i in todayData_DF.columns:
        todayData_DF = todayData_DF.drop(columns=[i])
todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])

masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
for i in range(len(masterData_for_predict.columns)) : #drop power balance
    if masterData_for_predict.columns[i] not in todayData_DF.columns:
        todayData_DF.insert(i, masterData.columns[i+1], 0)
missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index
import math
for i in missingvalue:
    for j in todayData_DF.columns:
        if math.isnan(todayData_DF.loc[i,j]) :
            todayData_DF.loc[i,j] = (todayData_DF.loc[i-1,j]+todayData_DF.loc[i+1,j])/2
first_predict = todayData_DF.astype('float').values

results_first_predict = exported_pipeline.predict(first_predict)
results_first_predict = np.array(results_first_predict).tolist()
for i in range(25):
    results_first_predict.insert(0,0.0)
for j in range (24):
    results_first_predict.append(0.0)

In [14]:
len(todayData_DF.index)-1

46

In [19]:
for i in range(10):
    print(i)

0
1
2
3
4
5
6
7
8
9


9