In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import requests
import json
import os
from sqlalchemy import create_engine
import pymysql
%matplotlib inline
%config InlineBackend.figure_format='retina'

In [153]:
def createDayDataframe(filename):
    path = "tongsdata/"
    filedate = filename.split(".")[0].split("-")

    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())

    #  Create X DataFrame
    todayData = {time: data_json[time]['currently'] for time in data_json}
    todayData_DF = pd.DataFrame(todayData).T

    #  Create y DataFrame
    energyBalanceFilename = "energyBalance/Energy_Balance_{}_{}_{}.csv".format(filedate[0], filedate[1], filedate[2]) #change due to json filename
    energyBalance = pd.read_csv(path + energyBalanceFilename, delimiter=";")

    energyBalance_DF = pd.DataFrame(index=energyBalance.iloc[:-1,0].apply(lambda x: x[2:-1]))
    energyBalance_DF['PV Power Generation'] = energyBalance['PV power generation / Mean values [W]  '].values[:-1]
    energyBalance_DF = energyBalance_DF[energyBalance_DF != ' ']
    energyBalance_DF = energyBalance_DF.dropna()

    #  Join X y DataFrame
    data = todayData_DF.join(energyBalance_DF)
    return data

def cleanDataframe(masterDF):
    masterData_cleaned = masterDF.reset_index()
    masterData_cleaned = masterData_cleaned.rename(columns={'index': 'time_of_day'})
    masterData_cleaned = masterData_cleaned[masterData_cleaned['PV Power Generation'].notna()]
    #masterData_cleaned['month'] = masterData_cleaned['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
    masterData_cleaned['cloudCover'] = masterData_cleaned['cloudCover'].fillna(value=masterData_cleaned['cloudCover'].mean())
    masterData_cleaned['PV Power Generation'] = masterData_cleaned['PV Power Generation'].apply(lambda x: x.replace(",", "")).astype('float')
    return masterData_cleaned

def featureEngineering(masterDF):
    #masterDF = masterDF.drop(columns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure', 'precipType', 'uvIndex'])
    masterDF = masterDF.drop(columns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure', 'precipType'])
    
    #masterDF = pd.get_dummies(masterDF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])
    masterDF = pd.get_dummies(masterDF, columns=['time_of_day'])
    return masterDF

    

Darksky key : 2b423d0c90e52bb805ccf6af1f305d7c ,bf11a6cc1e3fb105f612b086f6cefe19

In [3]:
def callApi(date):
    key = "bf11a6cc1e3fb105f612b086f6cefe19"
    latitude = "13.8282"
    longitude = "100.614"
    
    url = "https://api.darksky.net/forecast/{}/{},{}".format(key, latitude, longitude) +"?exclude=currently,minutely,daily"
    
    response = requests.get(url=url)
    return response

In [4]:
def one_month(month, year):
    date = datetime.datetime(year=year, month=month, day=1) #change day here
    nextMonth = date.month + 1
    while(date.month != nextMonth):
        yield date
        date = date + datetime.timedelta(days=1)   
        
def one_day(date):
    tomorrow = date + datetime.timedelta(days=1)
    while(date.day != tomorrow.day):
        yield date
        date = date + datetime.timedelta(minutes=15)   

In [5]:
def write_json(filename, data):
    with open("Testdate/" + filename + ".json", "w", encoding='utf-8') as fout:
        fout.write(json.dumps(data))

In [6]:
def loadjson(predict_date):
    date = pd.to_datetime(predict_date)
    filename = date.strftime("%Y-%m-%d") #change to Y-m-d because file will arrange better
    result = callApi(date)
    d = json.loads(result.text)
    df = pd.DataFrame.from_dict(d['hourly']['data'])
    df['time'] = pd.to_datetime(df['time'],unit='s')
    df.set_index('time',inplace=True)
    df1 = df[filename]
    newdf = pd.DataFrame(np.repeat(df1.values,4,axis=0))
    newdf.columns = df1.columns
    
    maketime = []
    date = predict_date.replace("_","-")
    date = pd.to_datetime(date)
    tomorrow = date + datetime.timedelta(days=1)
    while(date.day != tomorrow.day):
        a = date.strftime("%H:%M")
        maketime.append(a)
        date = date + datetime.timedelta(minutes=15)
    newdf["time"] = maketime
    newdf.set_index('time',inplace=True)
    savetojson = newdf.to_json(r'C:\Users\User\Documents\Senior-Project\Testdate/' + filename + '.json')

In [7]:
def clean1(date):
    date = date.split("-")
    path = "tongsdata/"
    filename = "energyBalance/Energy_Balance_{}_{}_{}.csv".format(date[0], date[1], date[2])
    rawdata=pd.read_csv(path + filename, delimiter=";" )
    deleteindex=rawdata.loc[rawdata.loc[pd.IndexSlice[:,'Battery discharging / Mean values [W] ']]==' '].index
    rawdata.drop(deleteindex,inplace=True)
    rawdata.rename(columns={' ':'Time'},inplace=True)
    rawdata['Time']=rawdata['Time'].str.replace('[^\d]','')
    rawdata['Time']=date[0]+date[1]+date[2]+rawdata['Time']
    rawdata['Time']=pd.to_datetime(rawdata.Time)
    rawdata.set_index('Time', inplace=True)
    for i in rawdata.columns:
        if i != 'Direct consumption / Mean values [W] ' and i != 'Battery discharging / Mean values [W] ' and i!='External energy supply / Mean values [W] ' and i != 'PV power generation / Mean values [W]  ':
            rawdata.drop([i],axis=1,inplace=True)
    for i in rawdata.columns:
        rawdata[i].fillna('0',inplace=True)
        rawdata[i]=rawdata[i].apply(lambda x: x.replace(' ', '0'))
        rawdata[i]=rawdata[i].str.replace('\D','').astype(int)
    rawdata['Consumption']=rawdata['Direct consumption / Mean values [W] ']+rawdata['Battery discharging / Mean values [W] ']+rawdata['External energy supply / Mean values [W] ']
    rawdata.rename(columns={'PV power generation / Mean values [W]  ':'Generation'},inplace=True)
    rawdata.drop(['Direct consumption / Mean values [W] ', 'Battery discharging / Mean values [W] ','External energy supply / Mean values [W] '],axis=1,inplace=True)
    return rawdata

In [145]:
def predictgen(predict_date):
    path = "Testdate/" #from load into file above
    filename = predict_date.replace("_",'-') + '.json'
    
    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())
    
    todayData_DF = pd.DataFrame.from_dict(data_json)
    todayData_DF.index.names=['time_of_day']
    todayData_DF = todayData_DF.loc['06:30':'18:00']
    todayData_DF = todayData_DF.reset_index()
    todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
    dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure','precipType','uvIndex']
    for i in dropcolumns:
        if i in todayData_DF.columns:
            todayData_DF = todayData_DF.drop(columns=[i])
    todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day'])

    masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
    for i in range(len(masterData_for_predict.columns)) : #drop power balance
        if masterData_for_predict.columns[i] not in todayData_DF.columns:
            todayData_DF.insert(i, masterData.columns[i+1], 0)
    missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index
    
    col = ['apparentTemperature','cloudCover','dewPoint','humidity','temperature','visibility','windSpeed']
    for col in todayData_DF:
        todayData_DF[col] = pd.to_numeric(todayData_DF[col], errors='coerce')
    #if todayData_DF.isna().loc[0,'uvIndex']:
    #    todayData_DF.loc[0,'uvIndex'] = 0
    todayData_DF=todayData_DF.interpolate(limit_direction='both')
    
    first_predict = todayData_DF.astype('float').values
    
    results_first_predict = exported_pipeline.predict(first_predict)
    results_first_predict = np.array(results_first_predict).tolist()
    for i in range(25):
        results_first_predict.insert(0,0.0)
    for j in range (24):
        results_first_predict.append(0.0)
    return results_first_predict

แก้โดยใส่uvIndexเอง

In [155]:
def predictgen(predict_date):
    path = "tongsdata/"
    filename = predict_date.replace("_",'-') + '.json'
    
    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())
    
    todayData = {time: data_json[time]['currently'] for time in data_json}
    todayData_DF = pd.DataFrame(todayData).T
    
    todayData_DF.index.names=['time_of_day']
    todayData_DF = todayData_DF.loc['06:30':'18:00']
    todayData_DF = todayData_DF.reset_index()
    #todayData_DF['month'] = todayData_DF['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
    
    todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
    #todayData_DF['cloudCover'] = 0.19
    dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure','precipType']
    for i in dropcolumns:
        if i in todayData_DF.columns:
            todayData_DF = todayData_DF.drop(columns=[i])
    #todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])
    todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day'])

    masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
    for i in range(len(masterData_for_predict.columns)) : #drop power balance
        if masterData_for_predict.columns[i] not in todayData_DF.columns:
            todayData_DF.insert(i, masterData.columns[i+1], 0)
    missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index
    
    col = ['apparentTemperature','cloudCover','dewPoint','humidity','temperature','visibility','windSpeed']
    for col in todayData_DF:
        todayData_DF[col] = pd.to_numeric(todayData_DF[col], errors='coerce')
    if todayData_DF.isna().loc[0,'uvIndex']:
        todayData_DF.loc[0,'uvIndex'] = 0
    todayData_DF=todayData_DF.interpolate(limit_direction='both')
    
    first_predict = todayData_DF.astype('float').values
    
    results_first_predict = exported_pipeline.predict(first_predict)
    results_first_predict = np.array(results_first_predict).tolist()
    for i in range(25):
        results_first_predict.insert(0,0.0)
    for j in range (24):
        results_first_predict.append(0.0)
    return results_first_predict

In [129]:
def predictgen(predict_date):
    path = "Testdate/" #from load into file above
    filename = predict_date.replace("_",'-') + '.json'
    
    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())
    
    todayData_DF = pd.DataFrame.from_dict(data_json)
    todayData_DF.index.names=['time_of_day']
    todayData_DF = todayData_DF.loc['06:30':'18:00']
    todayData_DF = todayData_DF.reset_index()
    todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
    dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure','precipType']
    for i in dropcolumns:
        if i in todayData_DF.columns:
            todayData_DF = todayData_DF.drop(columns=[i])
    todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day'])
    
    masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
    for i in range(len(masterData_for_predict.columns)) : #drop power balancea
        if masterData_for_predict.columns[i] not in todayData_DF.columns:
            todayData_DF.insert(i, masterData.columns[i+1], 0)
    missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index

    col = ['apparentTemperature','cloudCover','dewPoint','humidity','temperature','visibility','windSpeed']
    for col in todayData_DF:
        todayData_DF[col] = pd.to_numeric(todayData_DF[col], errors='coerce')
    
    if todayData_DF.isna().loc[0,'uvIndex']:
        todayData_DF.loc[0,'uvIndex'] = 0
    if todayData_DF.loc[0,'uvIndex'] > 0:
        todayData_DF['uvIndex'] = np.nan
        todayData_DF.loc[0,'uvIndex'] = 0
        todayData_DF.loc[46,'uvIndex'] = 0
        todayData_DF.loc[23,'uvIndex'] = 10
    
    todayData_DF=todayData_DF.interpolate(limit_direction='both')
    first_predict = todayData_DF.astype('float').values
    
    results_first_predict = exported_pipeline.predict(first_predict)
    results_first_predict = np.array(results_first_predict).tolist()
    for i in range(25):
        results_first_predict.insert(0,0.0)
    for j in range (24):
        results_first_predict.append(0.0)
    return results_first_predict

### รันแค่ครั้งเดียวเพื่อ fit model

In [154]:
import os
import pandas as pd
import json
import datetime

files = os.listdir("tongsdata/")

#files.remove(".DS_Store")
files.remove("energyBalance")

masterData = pd.DataFrame()

for file in files:
    r = createDayDataframe(file).loc['06:30':'18:00']
    #print(file, r.shape)
    masterData = pd.concat([masterData, r], axis=0)

masterData = cleanDataframe(masterData)
masterData = featureEngineering(masterData)

features = list(masterData.drop(columns=['PV Power Generation']).columns)

masterData = masterData.dropna()

X = masterData.drop(columns=['PV Power Generation']).astype('float').values
y = masterData['PV Power Generation'].astype('float').values


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

training_features, testing_features, training_target, testing_target = train_test_split(X, y)

# Average CV score on the training set was:-91867.4890210794
exported_pipeline = RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=1, min_samples_split=7, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

#from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
#print(mean_squared_error(testing_target, results), 
#mean_absolute_error(testing_target, results), 
#r2_score(testing_target, results))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  app.launch_new_instance()


In [156]:
predict_date="2019-04-29"

In [76]:
loadjson(predict_date)

In [157]:
generation=predictgen(predict_date)

In [159]:
generation

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 71.96800000000002,
 134.66583333333338,
 296.7756666666666,
 537.4023333333333,
 662.8915000000002,
 702.9911666666667,
 728.4471666666666,
 1050.253333333333,
 946.1121666666669,
 693.319666666667,
 707.3696666666667,
 971.7358333333334,
 1274.6071666666667,
 1148.5955000000001,
 2305.726666666667,
 2071.1476666666667,
 1614.0868333333344,
 1576.1793333333342,
 1618.3253333333344,
 1650.7545000000007,
 1693.3195000000007,
 1513.0550000000003,
 1589.9498333333333,
 1600.006833333334,
 2131.434666666667,
 1620.3670000000004,
 1362.566,
 1384.1875,
 1544.8105000000003,
 1966.1743333333334,
 2655.651,
 2496.8419999999996,
 2466.763666666667,
 2481.2038333333335,
 2298.4158333333335,
 2167.912333333333,
 1923.6443333333327,
 1770.4380000000006,
 1489.575666666667,
 1297.6463333333334,
 1168.4656666666667,
 838.4598333333336,
 745.32316666666

In [160]:
df2=clean1(predict_date)

df2["Forecasted Generation"] = generation

df2

Unnamed: 0_level_0,Generation,Consumption,Forecasted Generation
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2019-04-29 00:15:00,0,3628,0.0
2019-04-29 00:30:00,0,2851,0.0
2019-04-29 00:45:00,0,2911,0.0
2019-04-29 01:00:00,0,2594,0.0
2019-04-29 01:15:00,0,2732,0.0
2019-04-29 01:30:00,0,2578,0.0
2019-04-29 01:45:00,0,2805,0.0
2019-04-29 02:00:00,0,2762,0.0
2019-04-29 02:15:00,0,2347,0.0
2019-04-29 02:30:00,0,2209,0.0


In [161]:
def maape(actual: np.ndarray, predicted: np.ndarray):
    """
    Mean Arctangent Absolute Percentage Error
    Note: result is NOT multiplied by 100
    """
    EPSILON = 1e-10
    return np.mean(np.arctan(np.abs((actual - predicted) / (actual + EPSILON))))*100

In [163]:
df2["AAPE"] = 0

for index, row in df2.iterrows():
    error = maape(row["Generation"], row["Forecasted Generation"])
    df2.loc[index,'AAPE'] = error

In [164]:
df2

Unnamed: 0_level_0,Generation,Consumption,Forecasted Generation,AAPE
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-04-29 00:15:00,0,3628,0.0,0.0
2019-04-29 00:30:00,0,2851,0.0,0.0
2019-04-29 00:45:00,0,2911,0.0,0.0
2019-04-29 01:00:00,0,2594,0.0,0.0
2019-04-29 01:15:00,0,2732,0.0,0.0
2019-04-29 01:30:00,0,2578,0.0,0.0
2019-04-29 01:45:00,0,2805,0.0,0.0
2019-04-29 02:00:00,0,2762,0.0,0.0
2019-04-29 02:15:00,0,2347,0.0,0.0
2019-04-29 02:30:00,0,2209,0.0,0.0


In [165]:
df2.loc[predict_date +' 06:30:00':predict_date +' 18:00:00','AAPE'].mean()

10.048564756244287

In [144]:
from sklearn.feature_selection import SelectFromModel
featuresImportance = pd.DataFrame()
featuresImportance['Feature Name'] = features
featuresImportance['Importance'] = exported_pipeline.feature_importances_

featuresImportance.sort_values('Importance', ascending=False).head()

Unnamed: 0,Feature Name,Importance
4,temperature,0.127848
3,humidity,0.115238
0,apparentTemperature,0.066648
5,visibility,0.057304
2,dewPoint,0.05449


In [149]:
path = "Testdate/" #from load into file above
filename = predict_date.replace("_",'-') + '.json'

with open(path + filename, 'r', encoding='utf-8') as fin:
    data_json = json.loads(fin.read())

todayData_DF = pd.DataFrame.from_dict(data_json)
todayData_DF.index.names=['time_of_day']
todayData_DF = todayData_DF.loc['06:30':'18:00']
todayData_DF = todayData_DF.reset_index()
todayData_DF['cloudCover'] = todayData_DF['cloudCover'].fillna(value=todayData_DF['cloudCover'].mean())
dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure','precipType']
for i in dropcolumns:
    if i in todayData_DF.columns:
        todayData_DF = todayData_DF.drop(columns=[i])
todayData_DF = pd.get_dummies(todayData_DF, columns=['time_of_day'])

masterData_for_predict = masterData.drop(columns=['PV Power Generation'])
for i in range(len(masterData_for_predict.columns)) : #drop power balance
    if masterData_for_predict.columns[i] not in todayData_DF.columns:
        todayData_DF.insert(i, masterData.columns[i+1], 0)
missingvalue = todayData_DF[todayData_DF.isna().any(axis=1)].index

col = ['apparentTemperature','cloudCover','dewPoint','humidity','temperature','visibility','windSpeed']
for col in todayData_DF:
    todayData_DF[col] = pd.to_numeric(todayData_DF[col], errors='coerce')

In [127]:
if todayData_DF.isna().loc[0,'uvIndex']:
    todayData_DF.loc[0,'uvIndex'] = 0
if todayData_DF.loc[0,'uvIndex'] > 0:
    todayData_DF['uvIndex'] = np.nan
    todayData_DF.loc[0,'uvIndex'] = 0
    todayData_DF.loc[46,'uvIndex'] = 0
    todayData_DF.loc[23,'uvIndex'] = 10

todayData_DF=todayData_DF.interpolate(limit_direction='both')

In [150]:
todayData_DF

Unnamed: 0,apparentTemperature,cloudCover,dewPoint,humidity,temperature,uvIndex,visibility,windSpeed,time_of_day_06:30,time_of_day_06:45,...,time_of_day_15:45,time_of_day_16:00,time_of_day_16:15,time_of_day_16:30,time_of_day_16:45,time_of_day_17:00,time_of_day_17:15,time_of_day_17:30,time_of_day_17:45,time_of_day_18:00
0,103.38,0.48,71.8,0.47,94.96,9,10.0,11.58,1,0,...,0,0,0,0,0,0,0,0,0,0
1,103.38,0.48,71.8,0.47,94.96,9,10.0,11.58,0,1,...,0,0,0,0,0,0,0,0,0,0
2,104.37,0.54,71.0,0.44,96.4,7,10.0,12.13,0,0,...,0,0,0,0,0,0,0,0,0,0
3,104.37,0.54,71.0,0.44,96.4,7,10.0,12.13,0,0,...,0,0,0,0,0,0,0,0,0,0
4,104.37,0.54,71.0,0.44,96.4,7,10.0,12.13,0,0,...,0,0,0,0,0,0,0,0,0,0
5,104.37,0.54,71.0,0.44,96.4,7,10.0,12.13,0,0,...,0,0,0,0,0,0,0,0,0,0
6,103.83,0.61,70.28,0.43,96.54,5,10.0,12.48,0,0,...,0,0,0,0,0,0,0,0,0,0
7,103.83,0.61,70.28,0.43,96.54,5,10.0,12.48,0,0,...,0,0,0,0,0,0,0,0,0,0
8,103.83,0.61,70.28,0.43,96.54,5,10.0,12.48,0,0,...,0,0,0,0,0,0,0,0,0,0
9,103.83,0.61,70.28,0.43,96.54,5,10.0,12.48,0,0,...,0,0,0,0,0,0,0,0,0,0


In [103]:
path = "tongsdata/"
filedate = filename.split(".")[0].split("-")

with open(path + filename, 'r', encoding='utf-8') as fin:
    data_json = json.loads(fin.read())

#  Create X DataFrame
todayData = {time: data_json[time]['currently'] for time in data_json}
todayData_DF = pd.DataFrame(todayData).T
dropcolumns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure','precipType']
for i in dropcolumns:
    if i in todayData_DF.columns:
        todayData_DF = todayData_DF.drop(columns=[i])
todayData_DF.loc['06:30':'18:00'].to_csv("Realweather.csv")