In [1]:
import datetime
import requests
import json

In [2]:
def callApi(date):
    key = "2b423d0c90e52bb805ccf6af1f305d7c"
    latitude = "13.8282"
    longitude = "100.614"
    time = str(date).replace(" ", "T")
    url = "https://api.darksky.net/forecast/{}/{},{},{}".format(key, latitude, longitude, time)
    
    response = requests.get(url=url)
    return response

In [3]:
def one_month(month, year):
    date = datetime.datetime(year=year, month=month, day=29)
    nextMonth = date.month + 1
    while(date.month != nextMonth):
        yield date
        date = date + datetime.timedelta(days=1)   
        
def one_day(date):
    tomorrow = date + datetime.timedelta(days=1)
    while(date.day != tomorrow.day):
        yield date
        date = date + datetime.timedelta(minutes=15)   

In [4]:
def write_json(filename, data):
    with open("tongsdata/" + filename + ".json", "w", encoding='utf-8') as fout:
        fout.write(json.dumps(data))

___

In [18]:
year=2018
month=11
date = datetime.datetime(year=year, month=month, day=30)
nextMonth = date.month + 1
while(date.month != nextMonth):
    date = date + datetime.timedelta(days=1)  
    print(date)

2018-12-01 00:00:00


In [5]:
#  LOAD INTO FILES

for day in one_month(11, 2018):
    day_result = {}
    filename = day.strftime("%d-%m-%Y")
    for minute in one_day(day):
        result = callApi(minute)
        day_result.update({ minute.strftime("%H:%M"): result.json()})  #  key is time of the day
    write_json(filename=filename, data=day_result)  #  filename is date    

___

In [1]:
import os
import pandas as pd
import json
import datetime

In [2]:
def createDayDataframe(filename):
    path = "tongsdata/"
    filedate = filename.split(".")[0].split("-")

    with open(path + filename, 'r', encoding='utf-8') as fin:
        data_json = json.loads(fin.read())

    #  Create X DataFrame
    todayData = {time: data_json[time]['currently'] for time in data_json}
    todayData_DF = pd.DataFrame(todayData).T

    #  Create y DataFrame
    energyBalanceFilename = "energyBalance/Energy_Balance_{}_{}_{}.csv".format(filedate[2], filedate[1], filedate[0])
    energyBalance = pd.read_csv(path + energyBalanceFilename, delimiter=";")

    energyBalance_DF = pd.DataFrame(index=energyBalance.iloc[:-1,0].apply(lambda x: x[2:-1]))
    energyBalance_DF['PV Power Generation'] = energyBalance['PV power generation / Mean values [W]  '].values[:-1]
    energyBalance_DF = energyBalance_DF[energyBalance_DF != ' ']
    energyBalance_DF = energyBalance_DF.dropna()
    
    #  Join X y DataFrame
    data = todayData_DF.join(energyBalance_DF)
    return data

def cleanDataframe(masterDF):
    masterData_cleaned = masterDF.reset_index()
    masterData_cleaned = masterData_cleaned.rename(columns={'index': 'time_of_day'})
    masterData_cleaned = masterData_cleaned[masterData_cleaned['PV Power Generation'].notna()]
    masterData_cleaned['month'] = masterData_cleaned['time'].apply(datetime.datetime.fromtimestamp).apply(lambda x: x.month)
    masterData_cleaned['cloudCover'] = masterData_cleaned['cloudCover'].fillna(value=masterData_cleaned['cloudCover'].mean())
    masterData_cleaned['PV Power Generation'] = masterData_cleaned['PV Power Generation'].apply(lambda x: x.replace(",", "")).astype('float')
    return masterData_cleaned

In [3]:
def featureEngineering(masterDF):
    masterDF = masterDF.drop(columns=['icon', 'summary', 'time', 'windBearing', 'windGust', 'ozone', 'precipIntensity', 'precipProbability','pressure'])
    masterDF = pd.get_dummies(masterDF, columns=['time_of_day', 'precipType', 'uvIndex', 'month'])
    return masterDF

In [4]:
files = os.listdir("tongsdata/")

#files.remove(".DS_Store")
files.remove("energyBalance")

masterData = pd.DataFrame()

for file in files:
    r = createDayDataframe(file).loc['06:30':'18:00']
    print(file, r.shape)
    masterData = pd.concat([masterData, r], axis=0)
    
masterData = cleanDataframe(masterData)
masterData = featureEngineering(masterData)

01-11-2018.json (47, 15)
02-11-2018.json (47, 13)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  # This is added back by InteractiveShellApp.init_path()


03-11-2018.json (47, 15)
04-11-2018.json (47, 14)
05-11-2018.json (47, 14)
06-11-2018.json (47, 15)
07-11-2018.json (47, 14)
08-11-2018.json (47, 15)
09-11-2018.json (47, 14)
10-11-2018.json (47, 15)
11-11-2018.json (47, 14)
12-11-2018.json (47, 15)
13-11-2018.json (47, 14)
14-11-2018.json (47, 15)
15-11-2018.json (47, 15)
16-11-2018.json (47, 15)
17-11-2018.json (47, 14)
18-11-2018.json (47, 15)
19-11-2018.json (47, 15)
20-11-2018.json (47, 14)
21-11-2018.json (47, 15)
22-11-2018.json (47, 15)
23-11-2018.json (47, 15)
24-11-2018.json (47, 19)
25-11-2018.json (47, 19)
26-11-2018.json (47, 19)
27-11-2018.json (47, 19)
28-11-2018.json (47, 19)
29-11-2018.json (47, 15)
30-11-2018.json (47, 14)


In [5]:
masterData.shape

(1393, 67)

In [6]:
pd.set_option('display.max_columns', 100)
masterData.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1393 entries, 0 to 1409
Data columns (total 67 columns):
PV Power Generation    1393 non-null float64
apparentTemperature    1389 non-null object
cloudCover             1393 non-null float64
dewPoint               1389 non-null object
humidity               1389 non-null object
temperature            1389 non-null object
visibility             1388 non-null object
windSpeed              1388 non-null object
time_of_day_06:30      1393 non-null uint8
time_of_day_06:45      1393 non-null uint8
time_of_day_07:00      1393 non-null uint8
time_of_day_07:15      1393 non-null uint8
time_of_day_07:30      1393 non-null uint8
time_of_day_07:45      1393 non-null uint8
time_of_day_08:00      1393 non-null uint8
time_of_day_08:15      1393 non-null uint8
time_of_day_08:30      1393 non-null uint8
time_of_day_08:45      1393 non-null uint8
time_of_day_09:00      1393 non-null uint8
time_of_day_09:15      1393 non-null uint8
time_of_day_09:30      

Y
    * 'PV Power Generation'

X
    * 'time_of_day'         ->  one-hot
    * 'apparentTemperature' ->  OK
    * 'cloudCover'          ->  OK 
    * 'dewPoint'            ->  OK
    * 'humidity'            ->  OK
    * 'icon'                ->  drop
    * 'precipType'          ->  one-hot
    * 'summary'             ->  drop
    * 'temperature'         ->  OK
    * 'time'                ->  drop
    * 'uvIndex'             ->  one-hot
    * 'visibility'          ->  OK
    * 'windBearing'         ->  drop (NN NE EE ...)
    * 'windGust'            ->  drop (what is wind gust?)
    * 'windSpeed'           ->  OK
    * 'month'               ->  one-hot (Should do season or quarter?)
    * 'ozone'               ->  drop (why less data)
    * 'precipIntensity'     ->  drop (why less data)
    * 'precipProbability'   ->  drop (why less data)
    * 'pressure'            ->  drop (why less data)

___

In [5]:
features = list(masterData.drop(columns=['PV Power Generation']).columns)

In [6]:
masterData = masterData.dropna()

X = masterData.drop(columns=['PV Power Generation']).astype('float').values
y = masterData['PV Power Generation'].astype('float').values


In [7]:
masterData.to_csv("masterData_11.csv")

In [10]:
X.shape

(1388, 66)

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
from sklearn.linear_model import LinearRegression

LR = LinearRegression()
LR = LR.fit(X_train, y_train)

y_hat = LR.predict(X_test)

In [13]:
from sklearn.metrics import mean_squared_error, r2_score
import math

math.sqrt(mean_squared_error(y_test, y_hat))

367.6810834221374

In [13]:
result = pd.DataFrame()
result['Actual'] = y_test
result['Prediction'] = y_hat

In [17]:
result
result.to_csv("result2.csv")

In [10]:
from tpot import TPOTRegressor

tpot = TPOTRegressor(generations=10, population_size=30, verbosity=2) # more generation, pop size will make it closer,but it will take more time
tpot.fit(X_train, y_train)
print(tpot.score(X_test, y_test))
tpot.export('best_model_30days_5.py')



HBox(children=(IntProgress(value=0, description='Optimization Progress', max=330, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: -105775.4419997663
Generation 2 - Current best internal CV score: -104715.5640267997
Generation 3 - Current best internal CV score: -102922.85849748194
Generation 4 - Current best internal CV score: -97264.4468309271
Generation 5 - Current best internal CV score: -96071.53541158461
Generation 6 - Current best internal CV score: -95969.96946055714
Generation 7 - Current best internal CV score: -90565.14997834575
Generation 8 - Current best internal CV score: -90565.14997834575
Generation 9 - Current best internal CV score: -90554.09064087698
Generation 10 - Current best internal CV score: -90554.09064087698

Best pipeline: ExtraTreesRegressor(ExtraTreesRegressor(input_matrix, bootstrap=False, max_features=0.55, min_samples_leaf=4, min_samples_split=14, n_estimators=100), bootstrap=False, max_features=0.8, min_samples_leaf=6, min_samples_split=7, n_estimators=100)
-98727.50753663958


True

model 3 RandomForestRegressor(only 1 estimator)

In [14]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

training_features, testing_features, training_target, testing_target = train_test_split(X, y)

# Average CV score on the training set was:-91867.4890210794
exported_pipeline = RandomForestRegressor(bootstrap=False, max_features=0.3, min_samples_leaf=1, min_samples_split=7, n_estimators=100)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
print(mean_squared_error(results, testing_target), 
mean_absolute_error(results, testing_target), 
r2_score(results, testing_target))

71197.40161996237 192.84444956772336 0.8393134764406354


model 4 StackingEstimator(2 estimators) better mse, mae, r2 score

In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator, ZeroCount

training_features, testing_features, training_target, testing_target = train_test_split(X, y)

# Average CV score on the training set was:-87019.59662711709
exported_pipeline = make_pipeline(
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.5, min_samples_leaf=3, min_samples_split=8, n_estimators=100)),
    ZeroCount(),
    RandomForestRegressor(bootstrap=False, max_features=0.15000000000000002, min_samples_leaf=2, min_samples_split=6, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)

from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
print(mean_squared_error(results, testing_target), 
mean_absolute_error(results, testing_target), 
r2_score(results, testing_target))

82411.68873688129 185.46956724303556 0.8431508099074713


In [12]:
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

training_features, testing_features, training_target, testing_target = train_test_split(X, y)

# Average CV score on the training set was:-90554.09064087698
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ExtraTreesRegressor(bootstrap=False, max_features=0.55, min_samples_leaf=4, min_samples_split=14, n_estimators=100)),
    ExtraTreesRegressor(bootstrap=False, max_features=0.8, min_samples_leaf=6, min_samples_split=7, n_estimators=100)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
print(mean_squared_error(results, testing_target), 
mean_absolute_error(results, testing_target), 
r2_score(results, testing_target))

87369.62423989213 198.87958467069757 0.8162386602211584


In [19]:
from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error
mean_squared_error(results, testing_target)
mean_absolute_error(results, testing_target)
#r2_score(results, testing_target)
#mean_absolute_percentage_error(testing_target, results)

213.94625504322767

In [15]:
#from sklearn.feature_selection import SelectFromModel
featuresImportance = pd.DataFrame()
featuresImportance['Feature Name'] = features
featuresImportance['Importance'] = exported_pipeline.feature_importances_

featuresImportance.sort_values('Importance', ascending=False).head()

Unnamed: 0,Feature Name,Importance
55,uvIndex_0,0.315219
3,humidity,0.099506
4,temperature,0.087068
56,uvIndex_1,0.082377
0,apparentTemperature,0.058746


In [16]:
result = pd.DataFrame()
result['Actual'] = testing_target
result['Prediction'] = results
result['Error'] = np.abs(testing_target - results)
result.to_csv("result_tpot_30days_3.csv")

___

In [394]:
edaData = masterData

In [397]:
edaData['windSpeed']

0      3.21
1      3.21
2      3.25
3      3.25
4      3.24
5      3.25
6      3.25
7      2.93
8      3.28
9      4.13
10     5.25
11     5.44
12     5.62
13     5.81
14        6
15     6.73
16     7.46
17     8.19
18     8.92
19     8.75
20     8.65
21     8.61
22     8.66
23      8.7
24     8.78
25     8.89
26     9.04
27     8.78
28     8.52
29     8.27
       ... 
439    6.56
440    6.63
441    6.73
442    6.29
443    5.88
444    5.49
445    5.13
446    5.41
447     5.7
448       6
449     6.3
450    6.89
451    7.56
452    8.27
453    9.03
454    8.41
455    7.81
456    7.26
457    6.75
458    6.99
459    7.34
460    7.77
461    8.27
462    7.85
463    7.45
464    7.08
465    6.75
466    6.73
467    6.72
468    6.73
Name: windSpeed, Length: 465, dtype: object

TEST

In [21]:
def mean_absolute_percentage_error(y_true, y_pred): 
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100