In [178]:
import sys
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import seaborn as sns


from sklearn import preprocessing
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from pandas.plotting import scatter_matrix


In [179]:

import warnings
warnings.filterwarnings("ignore")

pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)
pd.options.display.precision=3


In [180]:

generation_data = pd.read_csv('./Plant_2_Generation_Data.csv')
weather_data = pd.read_csv('./Plant_2_Weather_Sensor_Data.csv')
generation_data.sample(5).style.set_properties(
    **{
        'background-color': 'OliveDrab',
        'color': 'white',
        'border-color': 'darkblack'
    })
weather_data.sample(5).style.set_properties(
    **{
        'background-color': 'pink',
        'color': 'Black',
        'border-color': 'darkblack'
    })

Unnamed: 0,DATE_TIME,PLANT_ID,SOURCE_KEY,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
116,2020-05-16 05:15:00,4136001,iq8k7ZNt4Mwm3w0,23.608173,21.290092,0.0
1023,2020-05-25 16:15:00,4136001,iq8k7ZNt4Mwm3w0,38.036724,50.688926,0.442961
854,2020-05-23 22:00:00,4136001,iq8k7ZNt4Mwm3w0,28.255185,26.978491,0.0
86,2020-05-15 21:30:00,4136001,iq8k7ZNt4Mwm3w0,28.427109,27.413928,0.0
2328,2020-06-08 07:15:00,4136001,iq8k7ZNt4Mwm3w0,25.630998,29.729707,0.213617


In [181]:
generation_data['DATE_TIME'] = pd.to_datetime(generation_data['DATE_TIME'],format = '%Y-%m-%d %H:%M')
weather_data['DATE_TIME'] = pd.to_datetime(weather_data['DATE_TIME'],format = '%Y-%m-%d %H:%M:%S')

In [182]:
df_solar = pd.merge(generation_data.drop(columns = ['PLANT_ID']), weather_data.drop(columns = ['PLANT_ID', 'SOURCE_KEY']), on='DATE_TIME')
df_solar.sample(5).style.background_gradient(cmap='cool')

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION
35316,2020-06-02 15:45:00,9kRcWv60rDACzjR,542.546667,531.506667,6389.733333,2247849353.733334,31.931493,43.152596,0.335381
67271,2020-06-17 19:00:00,WcxssY2VbP4hApt,0.0,0.0,4331.0,181911918.0,23.547952,22.599791,0.0
33533,2020-06-01 19:30:00,81aHJ1q11NBPMrL,0.0,0.0,4195.0,1215395180.0,23.448935,23.228959,0.0
2816,2020-05-16 08:15:00,4UPUqMRk7TRMgml,643.8,630.678571,586.428571,2433798.428571,29.11319,42.300316,0.489959
5981,2020-05-17 20:00:00,vOuJvMaM2sgwLmb,0.0,0.0,5358.0,2231792.0,30.300569,29.353965,0.0


In [183]:
df_solar["DATE"] = pd.to_datetime(df_solar["DATE_TIME"]).dt.date
df_solar["TIME"] = pd.to_datetime(df_solar["DATE_TIME"]).dt.time
df_solar['DAY'] = pd.to_datetime(df_solar['DATE_TIME']).dt.day
df_solar['MONTH'] = pd.to_datetime(df_solar['DATE_TIME']).dt.month
df_solar['WEEK'] = pd.to_datetime(df_solar['DATE_TIME']).dt.week


# add hours and minutes for ml models
df_solar['HOURS'] = pd.to_datetime(df_solar['TIME'],format='%H:%M:%S').dt.hour
df_solar['MINUTES'] = pd.to_datetime(df_solar['TIME'],format='%H:%M:%S').dt.minute
df_solar['TOTAL MINUTES PASS'] = df_solar['MINUTES'] + df_solar['HOURS']*60

# add date as string column
df_solar["DATE_STRING"] = df_solar["DATE"].astype(str) # add column with date as string
df_solar["HOURS"] = df_solar["HOURS"].astype(str)
df_solar["TIME"] = df_solar["TIME"].astype(str)

df_solar.tail(1)

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DATE,TIME,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,DATE_STRING
67697,2020-06-17 23:45:00,xoJJ8DcxJEcupym,0.0,0.0,4316.0,209300000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17


In [184]:
df_solar.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67698 entries, 0 to 67697
Data columns (total 18 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   DATE_TIME            67698 non-null  datetime64[ns]
 1   SOURCE_KEY           67698 non-null  object        
 2   DC_POWER             67698 non-null  float64       
 3   AC_POWER             67698 non-null  float64       
 4   DAILY_YIELD          67698 non-null  float64       
 5   TOTAL_YIELD          67698 non-null  float64       
 6   AMBIENT_TEMPERATURE  67698 non-null  float64       
 7   MODULE_TEMPERATURE   67698 non-null  float64       
 8   IRRADIATION          67698 non-null  float64       
 9   DATE                 67698 non-null  object        
 10  TIME                 67698 non-null  object        
 11  DAY                  67698 non-null  int64         
 12  MONTH                67698 non-null  int64         
 13  WEEK                 67698 non-

In [185]:
df_solar.isnull().sum()

DATE_TIME              0
SOURCE_KEY             0
DC_POWER               0
AC_POWER               0
DAILY_YIELD            0
TOTAL_YIELD            0
AMBIENT_TEMPERATURE    0
MODULE_TEMPERATURE     0
IRRADIATION            0
DATE                   0
TIME                   0
DAY                    0
MONTH                  0
WEEK                   0
HOURS                  0
MINUTES                0
TOTAL MINUTES PASS     0
DATE_STRING            0
dtype: int64

In [186]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df_solar['SOURCE_KEY_NUMBER'] = encoder.fit_transform(df_solar['SOURCE_KEY'])
df_solar.tail(22)


Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DATE,TIME,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,DATE_STRING,SOURCE_KEY_NUMBER
67676,2020-06-17 23:45:00,4UPUqMRk7TRMgml,0.0,0.0,4446.0,2653000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,0
67677,2020-06-17 23:45:00,81aHJ1q11NBPMrL,0.0,0.0,4306.0,1215000000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,1
67678,2020-06-17 23:45:00,9kRcWv60rDACzjR,0.0,0.0,4197.0,2248000000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,2
67679,2020-06-17 23:45:00,Et9kgGMDl729KT4,0.0,0.0,4020.0,1854000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,3
67680,2020-06-17 23:45:00,IQ2d7wF4YD8zU1Q,0.0,0.0,4251.0,20190000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,4
67681,2020-06-17 23:45:00,LYwnQax7tkwH5Cb,0.0,0.0,3968.0,1795000000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,5
67682,2020-06-17 23:45:00,LlT2YUhhzqhg5Sw,0.0,0.0,4121.0,282800000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,6
67683,2020-06-17 23:45:00,Mx2yZCDsyf6DPfv,0.0,0.0,4441.0,2683000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,7
67684,2020-06-17 23:45:00,NgDl19wMapZy17u,0.0,0.0,4239.0,111700000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,8
67685,2020-06-17 23:45:00,PeE6FRyGXUgsRhN,0.0,0.0,4258.0,1349000000.0,23.203,22.536,0.0,2020-06-17,23:45:00,17,6,25,23,45,1425,2020-06-17,9


In [187]:


# Convert 'DATE_TIME' column to datetime dtype
df_solar['DATE_TIME'] = pd.to_datetime(df_solar['DATE_TIME'])

# Create a mask for the missing values (0) in DC_POWER and AC_POWER columns
mask = (df_solar['DC_POWER'] == 0) & (df_solar['AC_POWER'] == 0)

# Calculate group-wise means for DC_POWER and AC_POWER based on DATE and TIME
group_means = df_solar[~mask].groupby(['DATE', 'TIME'])[['DC_POWER', 'AC_POWER']].mean()

# Update the missing values based on the group means using 'fillna' method
df_solar[['DC_POWER', 'AC_POWER']] = df_solar[['DC_POWER', 'AC_POWER']].mask(mask).fillna(
    df_solar.groupby(['DATE', 'TIME'])[['DC_POWER', 'AC_POWER']].transform('mean')
)

df_solar.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DATE,TIME,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,DATE_STRING,SOURCE_KEY_NUMBER
0,2020-05-15,4UPUqMRk7TRMgml,0.0,0.0,9425.0,2429000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,0
1,2020-05-15,81aHJ1q11NBPMrL,0.0,0.0,0.0,1215000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,1
2,2020-05-15,9kRcWv60rDACzjR,0.0,0.0,3075.333,2248000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,2
3,2020-05-15,Et9kgGMDl729KT4,0.0,0.0,269.933,1704000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,3
4,2020-05-15,IQ2d7wF4YD8zU1Q,0.0,0.0,3177.0,19940000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,4


In [188]:
# Read the dataset

# Convert 'DATE_TIME' column to datetime format
df_solar['DATE_TIME'] = pd.to_datetime(df_solar['DATE_TIME'])

# Extract the time component from 'DATE_TIME'
df_solar['TIME'] = df_solar['DATE_TIME'].dt.time

# Calculate the average AC_POWER for each unique SOURCE_KEY_NUMBER and time
df_solar['AVERAGE_AC_POWER'] = df_solar.groupby(['SOURCE_KEY_NUMBER', 'TIME','WEEK'])['AC_POWER'].transform('mean')

# Print the updated dataset with the average AC_POWER column
df_solar.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DATE,TIME,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,DATE_STRING,SOURCE_KEY_NUMBER,AVERAGE_AC_POWER
0,2020-05-15,4UPUqMRk7TRMgml,0.0,0.0,9425.0,2429000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,0,0.0
1,2020-05-15,81aHJ1q11NBPMrL,0.0,0.0,0.0,1215000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,1,0.0
2,2020-05-15,9kRcWv60rDACzjR,0.0,0.0,3075.333,2248000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,2,0.0
3,2020-05-15,Et9kgGMDl729KT4,0.0,0.0,269.933,1704000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,3,0.0
4,2020-05-15,IQ2d7wF4YD8zU1Q,0.0,0.0,3177.0,19940000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,4,0.0


In [189]:
df_solar['PRICE'] = df_solar.apply(
    lambda row: 0 if row['AC_POWER'] == 0 or row['AVERAGE_AC_POWER'] == 0
    else (row['AVERAGE_AC_POWER'] - row['AC_POWER']),axis=1
)

df_solar.head()

Unnamed: 0,DATE_TIME,SOURCE_KEY,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DATE,TIME,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,DATE_STRING,SOURCE_KEY_NUMBER,AVERAGE_AC_POWER,PRICE
0,2020-05-15,4UPUqMRk7TRMgml,0.0,0.0,9425.0,2429000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,0,0.0,0.0
1,2020-05-15,81aHJ1q11NBPMrL,0.0,0.0,0.0,1215000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,1,0.0,0.0
2,2020-05-15,9kRcWv60rDACzjR,0.0,0.0,3075.333,2248000000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,2,0.0,0.0
3,2020-05-15,Et9kgGMDl729KT4,0.0,0.0,269.933,1704000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,3,0.0,0.0
4,2020-05-15,IQ2d7wF4YD8zU1Q,0.0,0.0,3177.0,19940000.0,27.005,25.061,0.0,2020-05-15,00:00:00,15,5,20,0,0,0,2020-05-15,4,0.0,0.0


In [190]:
columns_to_drop = ['DATE_TIME', 'SOURCE_KEY','DATE','TIME','DATE_STRING']
df_solar.drop(columns_to_drop, axis=1, inplace=True)

X = df_solar[['SOURCE_KEY_NUMBER','DAY','MONTH','WEEK','HOURS','MINUTES']]
y = df_solar['PRICE']
df_solar.head()

Unnamed: 0,DC_POWER,AC_POWER,DAILY_YIELD,TOTAL_YIELD,AMBIENT_TEMPERATURE,MODULE_TEMPERATURE,IRRADIATION,DAY,MONTH,WEEK,HOURS,MINUTES,TOTAL MINUTES PASS,SOURCE_KEY_NUMBER,AVERAGE_AC_POWER,PRICE
0,0.0,0.0,9425.0,2429000.0,27.005,25.061,0.0,15,5,20,0,0,0,0,0.0,0.0
1,0.0,0.0,0.0,1215000000.0,27.005,25.061,0.0,15,5,20,0,0,0,1,0.0,0.0
2,0.0,0.0,3075.333,2248000000.0,27.005,25.061,0.0,15,5,20,0,0,0,2,0.0,0.0
3,0.0,0.0,269.933,1704000.0,27.005,25.061,0.0,15,5,20,0,0,0,3,0.0,0.0
4,0.0,0.0,3177.0,19940000.0,27.005,25.061,0.0,15,5,20,0,0,0,4,0.0,0.0


In [191]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
y_train = y_train.astype(np.int64)
y_test = y_test.astype(np.int64)


print(y_train.dtype)
print(y_train)


int64
39196    -45
30871    -70
51676   -365
12992    -67
52090    192
22652      0
12402      0
51025      0
61217      0
36000      0
42339      0
49158      0
51808      0
49389   -222
16647    -61
42258      0
66116      5
13288     -2
2792      31
25971      0
43735   -227
57232      0
2005       0
63166      0
53291      0
5181      36
32077      0
261        0
8386       0
67661      0
14565    183
66350    271
33257    205
39469   -118
43997     -6
26526    -69
36168      0
67280      0
3445     109
19217      0
66886     27
25162   -103
59436      0
31971      0
43570   -215
36319      0
48973      0
54106    305
35321    -58
66471    219
52358    135
39667     49
58760    -13
43551    287
9094    -218
17951    310
55792    209
65751      0
34718     36
17439      0
44720      0
1495    -101
46854      0
44305      0
38349      0
61094      0
25192    -71
30500    -52
9999       0
31604      0
26677   -107
25088   -117
62248   -337
62358   -298
9822     517
61055      0
54001 

In [192]:
seed = 8
scoring = 'accuracy'

In [193]:
from sklearn.linear_model import LinearRegression
models = []
models.append(('KNN', KNeighborsClassifier(n_neighbors=5)))
#models.append(('LR',LinearRegression))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))


# Evaluate each model in turn
results = []
names = []

for name, model in models:
    kfold = KFold(n_splits=10, random_state=seed, shuffle=True)
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)


KNN: 0.480982 (0.006021)
CART: 0.505502 (0.005945)
NB: 0.300972 (0.032833)


In [194]:
for name, model in models:
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(name)
    print(accuracy_score(y_test, predictions))
    print(classification_report(y_test, predictions))

KNN
0.48301329394387
              precision    recall  f1-score   support

        -777       0.00      0.00      0.00         0
        -720       0.00      0.00      0.00         0
        -707       0.00      0.00      0.00         0
        -699       0.00      0.00      0.00         0
        -692       0.00      0.00      0.00         0
        -679       0.00      0.00      0.00         0
        -676       0.00      0.00      0.00         0
        -675       0.00      0.00      0.00         1
        -674       0.00      0.00      0.00         0
        -666       0.00      0.00      0.00         1
        -664       0.00      0.00      0.00         1
        -656       0.00      0.00      0.00         0
        -653       0.00      0.00      0.00         1
        -647       0.00      0.00      0.00         0
        -646       0.00      0.00      0.00         1
        -638       0.00      0.00      0.00         0
        -637       0.00      0.00      0.00         1
      

In [195]:
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

0.5053175775480059


In [196]:
example = np.array([[7,17,5,20,12,30]])
example = example.reshape(len(example), -1)
prediction = clf.predict(example)
print(prediction)

[174]
