In [5]:
def predictgen(predict_date):
    import os
    import pandas as pd
    import json
    import datetime

    def createDayDataframe(filename):
        path = "tongsdata/"
        filedate = filename.split(".")[0].split("-")

        with open(path + filename, 'r', encoding='utf-8') as fin:
            data_json = json.loads(fin.read())

        #  Create X DataFrame
        todayData = {time: data_json[time]['currently'] for time in data_json}
        todayData_DF = pd.DataFrame(todayData).T

        #  Create y DataFrame
        energyBalanceFilename = "energyBalance/Energy_Balance_{}_{}_{}.csv".format(filedate[0], filedate[1], filedate[2]) #change due to json filename
        energyBalance = pd.read_csv(path + energyBalanceFilename, delimiter=";")

        energyBalance_DF = pd.DataFrame(index=energyBalance.iloc[:-1,0].apply(lambda x: x[2:-1]))
        energyBalance_DF['PV Power Generation'] = energyBalance['PV power generation / Mean values [W]  '].values[:-1]
        energyBalance_DF = energyBalance_DF[energyBalance_DF != ' ']
        energyBalance_DF = energyBalance_DF.dropna()

        #  Join X y DataFrame
        data = todayData_DF.join(energyBalance_DF)
        return data

    def cleanDataframe(masterDF):
        masterData_cleaned = masterDF.reset_index()
        masterData_cleaned = masterData_cleaned.rename(columns={'index': 'time_of_day'})
        masterData_cleaned = masterData_cleaned[masterData_cleaned['PV Power Generation'].notna()]
        masterData_cleaned['month'] = masterData_cleaned['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
        masterData_cleaned['cloudCover'] = masterData_cleaned['cloudCover'].fillna(value=masterData_cleaned['cloudCover'].mean())
        masterData_cleaned['PV Power Generation'] = masterData_cleaned['PV Power Generation'].apply(lambda x: x.replace(",", "")).astype('float')
        return masterData_cleaned

    def featureEngineering(masterDF):
        masterDF = masterDF.drop(columns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure'])
        masterDF = pd.get_dummies(masterDF, columns=['time_of_day', 'precipType', 'uvIndex', 'month']) 
        return masterDF

    files = os.listdir("tongsdata/")

    #files.remove(".DS_Store")
    files.remove("energyBalance")

    masterData = pd.DataFrame()

    for file in files:
        r = createDayDataframe(file).loc['06:30':'18:00']
        print(file, r.shape)
        masterData = pd.concat([masterData, r], axis=0)

    masterData = cleanDataframe(masterData)
    masterData = featureEngineering(masterData)

    features = list(masterData.drop(columns=['PV Power Generation']).columns)

    masterData = masterData.dropna()

    X = masterData.drop(columns=['PV Power Generation']).astype('float').values
    y = masterData['PV Power Generation'].astype('float').values


    from sklearn.model_selection import train_test_split

    X_train, X_test, y_train, y_test = train_test_split(X, y)

    import numpy as np
    import pandas as pd
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split

    training_features, testing_features, training_target, testing_target = train_test_split(X, y)

    # Average CV score on the training set was:-91867.4890210794
    exported_pipeline = RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=1, min_samples_split=7, n_estimators=100)

    exported_pipeline.fit(training_features, training_target)
    results = exported_pipeline.predict(testing_features)

    from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
    print(mean_squared_error(testing_target, results), 
    mean_absolute_error(testing_target, results), 
    r2_score(testing_target, results))


    def pvpredict(predict_date) :
        path = "tongsdata/"
        filename = predict_date.replace("_",'-') + '.json'

        with open(path + filename, 'r', encoding='utf-8') as fin:
            data_json = json.loads(fin.read())

        todayData = {time: data_json[time]['currently'] for time in data_json}
        todayData_DF = pd.DataFrame(todayData).T
        todayData_DF.index.names=['time_of_day']
        todayData_DF = todayData_DF.loc['06:30':'18:00']
        todayData_DF = todayData_DF.reset_index()
        todayData_DF['month'] = todayData_DF['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
        todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
        dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure']
        for i in dropcolumns:
            if i in todayData_DF.columns:
                todayData_DF = todayData_DF.drop(columns=[i])
        todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])

        masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
        for i in range(len(masterData_for_predict.columns)) : #drop power balance
            if masterData_for_predict.columns[i] not in todayData_DF.columns:
                todayData_DF.insert(i, masterData.columns[i+1], 0)
        missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index
        import math
        for i in missingvalue:
            for j in todayData_DF.columns:
                if math.isnan(todayData_DF.loc[i,j]) :
                    todayData_DF.loc[i,j] = (todayData_DF.loc[i-1,j]+todayData_DF.loc[i+1,j])/2
        first_predict = todayData_DF.astype('float').values

        results_first_predict = exported_pipeline.predict(first_predict)
        results_first_predict = np.array(results_first_predict).tolist()
        for i in range(25):
            results_first_predict.insert(0,0.0)
        for j in range (24):
            results_first_predict.append(0.0)
        return results_first_predict

    Generation=pvpredict(predict_date)
    return Generation

In [7]:
predictgen('2018_11_01')

2018-10-01.json (47, 15)
2018-10-02.json (47, 14)
2018-10-03.json (47, 14)
2018-10-04.json (47, 15)
2018-10-05.json (47, 15)
2018-10-06.json (47, 15)
2018-10-07.json (47, 15)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




2018-10-08.json (47, 15)
2018-10-09.json (47, 15)
2018-10-10.json (47, 15)
2018-10-11.json (47, 15)
2018-10-12.json (47, 15)
2018-10-13.json (47, 14)
2018-10-14.json (47, 15)
2018-10-15.json (47, 15)
2018-10-16.json (47, 15)
2018-10-17.json (47, 14)
2018-10-18.json (47, 14)
2018-10-19.json (47, 15)
2018-10-20.json (47, 15)
2018-10-21.json (47, 15)
2018-10-22.json (47, 15)
2018-10-23.json (47, 15)
2018-10-24.json (47, 14)
2018-10-25.json (47, 15)
2018-10-26.json (47, 15)
2018-10-27.json (47, 15)
2018-10-28.json (47, 15)
2018-10-29.json (47, 15)
2018-10-30.json (47, 15)
2018-10-31.json (47, 15)
2018-11-01.json (47, 15)
2018-11-02.json (47, 13)
2018-11-03.json (47, 15)
2018-11-04.json (47, 14)
2018-11-05.json (47, 14)
2018-11-06.json (47, 15)
2018-11-07.json (47, 14)
2018-11-08.json (47, 15)
2018-11-09.json (47, 14)
2018-11-10.json (47, 15)
2018-11-11.json (47, 14)
2018-11-12.json (47, 15)
2018-11-13.json (47, 14)
2018-11-14.json (47, 15)
2018-11-15.json (47, 15)
2018-11-16.json (47, 15)


[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 453.67983333333353,
 396.7713333333333,
 501.6111666666669,
 513.8160000000003,
 686.3701666666666,
 799.9110000000001,
 878.4550000000002,
 1179.5031666666664,
 1315.507166666666,
 1566.7366666666667,
 1815.2428333333332,
 1999.7715000000005,
 1860.8580000000002,
 2158.687833333334,
 2121.081833333334,
 2017.3568333333328,
 1668.253,
 1681.5848333333333,
 1480.2691666666663,
 1552.7003333333334,
 1440.1,
 1183.701666666666,
 1215.7923333333329,
 1246.213333333333,
 1240.2039999999995,
 1251.865333333333,
 1211.3884999999996,
 1216.4503333333332,
 1254.1133333333332,
 1421.9410000000003,
 1461.5411666666669,
 1435.3824999999997,
 1434.7746666666665,
 1305.722833333333,
 1365.297833333333,
 1208.6601666666666,
 1091.2603333333336,
 950.3745,
 1058.3944999999999,
 1006.2441666666664,
 1037.9176666666663,
 1101.5104999999994,
 756.168333333

In [6]:
datelist=[]
for i in range(1,31):
    datelist.append('2018_11_'+str(i).zfill(2))
datelist    

['2018_11_01',
 '2018_11_02',
 '2018_11_03',
 '2018_11_04',
 '2018_11_05',
 '2018_11_06',
 '2018_11_07',
 '2018_11_08',
 '2018_11_09',
 '2018_11_10',
 '2018_11_11',
 '2018_11_12',
 '2018_11_13',
 '2018_11_14',
 '2018_11_15',
 '2018_11_16',
 '2018_11_17',
 '2018_11_18',
 '2018_11_19',
 '2018_11_20',
 '2018_11_21',
 '2018_11_22',
 '2018_11_23',
 '2018_11_24',
 '2018_11_25',
 '2018_11_26',
 '2018_11_27',
 '2018_11_28',
 '2018_11_29',
 '2018_11_30']

In [8]:
genlist=[]
for i in datelist:
    genlist.append(predictgen(i))

2018-10-01.json (47, 15)
2018-10-02.json (47, 14)
2018-10-03.json (47, 14)
2018-10-04.json (47, 15)
2018-10-05.json (47, 15)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




2018-10-06.json (47, 15)
2018-10-07.json (47, 15)
2018-10-08.json (47, 15)
2018-10-09.json (47, 15)
2018-10-10.json (47, 15)
2018-10-11.json (47, 15)
2018-10-12.json (47, 15)
2018-10-13.json (47, 14)
2018-10-14.json (47, 15)
2018-10-15.json (47, 15)
2018-10-16.json (47, 15)
2018-10-17.json (47, 14)
2018-10-18.json (47, 14)
2018-10-19.json (47, 15)
2018-10-20.json (47, 15)
2018-10-21.json (47, 15)
2018-10-22.json (47, 15)
2018-10-23.json (47, 15)
2018-10-24.json (47, 14)
2018-10-25.json (47, 15)
2018-10-26.json (47, 15)
2018-10-27.json (47, 15)
2018-10-28.json (47, 15)
2018-10-29.json (47, 15)
2018-10-30.json (47, 15)
2018-10-31.json (47, 15)
2018-11-01.json (47, 15)
2018-11-02.json (47, 13)
2018-11-03.json (47, 15)
2018-11-04.json (47, 14)
2018-11-05.json (47, 14)
2018-11-06.json (47, 15)
2018-11-07.json (47, 14)
2018-11-08.json (47, 15)
2018-11-09.json (47, 14)
2018-11-10.json (47, 15)
2018-11-11.json (47, 14)
2018-11-12.json (47, 15)
2018-11-13.json (47, 14)
2018-11-14.json (47, 15)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




2018-10-08.json (47, 15)
2018-10-09.json (47, 15)
2018-10-10.json (47, 15)
2018-10-11.json (47, 15)
2018-10-12.json (47, 15)
2018-10-13.json (47, 14)
2018-10-14.json (47, 15)
2018-10-15.json (47, 15)
2018-10-16.json (47, 15)
2018-10-17.json (47, 14)
2018-10-18.json (47, 14)
2018-10-19.json (47, 15)
2018-10-20.json (47, 15)
2018-10-21.json (47, 15)
2018-10-22.json (47, 15)
2018-10-23.json (47, 15)
2018-10-24.json (47, 14)
2018-10-25.json (47, 15)
2018-10-26.json (47, 15)
2018-10-27.json (47, 15)
2018-10-28.json (47, 15)
2018-10-29.json (47, 15)
2018-10-30.json (47, 15)
2018-10-31.json (47, 15)
2018-11-01.json (47, 15)
2018-11-02.json (47, 13)
2018-11-03.json (47, 15)
2018-11-04.json (47, 14)
2018-11-05.json (47, 14)
2018-11-06.json (47, 15)
2018-11-07.json (47, 14)
2018-11-08.json (47, 15)
2018-11-09.json (47, 14)
2018-11-10.json (47, 15)
2018-11-11.json (47, 14)
2018-11-12.json (47, 15)
2018-11-13.json (47, 14)
2018-11-14.json (47, 15)
2018-11-15.json (47, 15)
2018-11-16.json (47, 15)


KeyError: 'cloudCover'