In [181]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.stats import chi2
from scipy.stats import chi2_contingency
np.random.seed(10)

In [159]:
df1 = pd.read_csv('price_demand_data.csv') # Read CSV file "price_demand_data.csv"
df2 = pd.read_csv('weather_data.csv')
display(df1)
df1.info()
display(df2)
df2.info()

Unnamed: 0,REGION,SETTLEMENTDATE,TOTALDEMAND,PRICECATEGORY
0,VIC1,1/01/2021 0:30,4179.21,LOW
1,VIC1,1/01/2021 1:00,4047.76,LOW
2,VIC1,1/01/2021 1:30,3934.70,LOW
3,VIC1,1/01/2021 2:00,3766.45,LOW
4,VIC1,1/01/2021 2:30,3590.37,LOW
...,...,...,...,...
11659,VIC1,31/08/2021 22:00,4861.91,MEDIUM
11660,VIC1,31/08/2021 22:30,4748.74,MEDIUM
11661,VIC1,31/08/2021 23:00,4620.09,MEDIUM
11662,VIC1,31/08/2021 23:30,4834.00,MEDIUM


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11664 entries, 0 to 11663
Data columns (total 4 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   REGION          11664 non-null  object 
 1   SETTLEMENTDATE  11664 non-null  object 
 2   TOTALDEMAND     11664 non-null  float64
 3   PRICECATEGORY   11664 non-null  object 
dtypes: float64(1), object(3)
memory usage: 364.6+ KB


Unnamed: 0,Date,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,Speed of maximum wind gust (km/h),Time of maximum wind gust,9am Temperature (°C),...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
0,1/01/2021,15.6,29.9,0.0,2.8,9.3,NNE,31.0,13:14,19.2,...,6,N,2,1018.8,28.1,43,5.0,E,13,1015.3
1,2/01/2021,18.4,29.0,0.0,9.4,1.3,NNW,30.0,8:22,23.3,...,7,NNW,17,1013.3,28.7,38,7.0,SW,4,1008.5
2,3/01/2021,17.0,26.2,12.6,4.8,7.1,WSW,33.0,17:55,18.3,...,8,WSW,4,1007.7,23.5,59,4.0,SSW,2,1005.2
3,4/01/2021,16.0,18.6,2.6,3.8,0.0,SSE,41.0,16:03,16.2,...,8,SSE,11,1010.0,18.2,82,8.0,SSW,17,1011.0
4,5/01/2021,15.9,19.1,11.2,1.0,0.0,SSE,35.0,11:02,17.2,...,8,SSE,13,1012.5,18.2,82,8.0,SSE,19,1013.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,27/08/2021,4.6,13.6,0.0,1.2,3.8,SSW,15.0,12:18,7.7,...,7,,Calm,1020.0,12.8,65,7.0,SSE,7,1017.4
239,28/08/2021,5.3,17.8,0.0,1.6,9.6,N,39.0,13:14,9.1,...,1,N,7,1018.6,17.4,31,3.0,NNW,24,1013.5
240,29/08/2021,9.1,16.2,0.6,6.4,4.3,NNE,33.0,1:50,10.6,...,7,N,13,1011.4,12.8,84,7.0,S,6,1010.4
241,30/08/2021,6.4,17.6,4.0,1.4,7.4,NNW,50.0,14:04,11.1,...,7,N,15,1016.1,16.8,45,1.0,NNW,28,1013.2


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Date                               243 non-null    object 
 1   Minimum temperature (°C)           242 non-null    float64
 2   Maximum temperature (°C)           242 non-null    float64
 3   Rainfall (mm)                      241 non-null    float64
 4   Evaporation (mm)                   243 non-null    float64
 5   Sunshine (hours)                   243 non-null    float64
 6   Direction of maximum wind gust     240 non-null    object 
 7   Speed of maximum wind gust (km/h)  240 non-null    float64
 8   Time of maximum wind gust          240 non-null    object 
 9   9am Temperature (°C)               242 non-null    float64
 10  9am relative humidity (%)          242 non-null    float64
 11  9am cloud amount (oktas)           243 non-null    int64  

In [160]:
'''
Display whole row for max TOTALDEMAND
'''
# Find max value per each 48 row,ie 48 time slots per day
df1['part'] = df1.index // 48
maxdemand = df1.groupby('part')['TOTALDEMAND'].transform(max) == df1['TOTALDEMAND']
df1 = df1[maxdemand].reset_index()
print(df1)

     index REGION    SETTLEMENTDATE  TOTALDEMAND PRICECATEGORY  part
0       34   VIC1   1/01/2021 17:30      5019.64           LOW     0
1       81   VIC1   2/01/2021 17:00      4964.35           LOW     1
2      132   VIC1   3/01/2021 18:30      4503.31           LOW     2
3      180   VIC1   4/01/2021 18:30      4764.18           LOW     3
4      225   VIC1   5/01/2021 17:00      4800.64           LOW     4
..     ...    ...               ...          ...           ...   ...
238  11461   VIC1  27/08/2021 19:00      6769.89          HIGH   238
239  11509   VIC1  28/08/2021 19:00      5716.32        MEDIUM   239
240  11557   VIC1  29/08/2021 19:00      6227.89       EXTREME   240
241  11604   VIC1  30/08/2021 18:30      6072.91        MEDIUM   241
242  11653   VIC1  31/08/2021 19:00      5779.56        MEDIUM   242

[243 rows x 6 columns]


In [170]:
'''
Extract date out of timestamp column, named new column of DataFrame as "DATE"
https://stackoverflow.com/questions/65775172/merging-pandas-dataframe-by-datetime-and-date-column
https://stackoverflow.com/questions/39662149/pandas-extract-date-and-time-from-timestamp
'''
df1['SETTLEMENTDATE'] = df1['SETTLEMENTDATE'].apply(lambda x : pd.to_datetime(str(x)))
df1['SETTLEMENTDATE'] = pd.to_datetime(df1['SETTLEMENTDATE'])
df1['Date'] = df1['SETTLEMENTDATE'].dt.date
df1['Time'] = df1['SETTLEMENTDATE'].dt.time
df2['Date'] = pd.to_datetime(df2['Date']).dt.date
df = df2.merge(df1, on='Date', how='left').drop('SETTLEMENTDATE', axis=1)

df = df.drop(columns = ['REGION','index','part'])

# Rearrage order of features
cols = df.columns.tolist()
cols = cols[:1] + cols[-3:] + cols[1:-3]
df = df[cols] 
df



Unnamed: 0,Date,TOTALDEMAND,PRICECATEGORY,Time,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
0,2021-01-01,5019.64,LOW,17:30:00,15.6,29.9,0.0,2.8,9.3,NNE,...,6,N,2,1018.8,28.1,43,5.0,E,13,1015.3
1,2021-02-01,4964.35,LOW,17:00:00,18.4,29.0,0.0,9.4,1.3,NNW,...,7,NNW,17,1013.3,28.7,38,7.0,SW,4,1008.5
2,2021-03-01,4503.31,LOW,18:30:00,17.0,26.2,12.6,4.8,7.1,WSW,...,8,WSW,4,1007.7,23.5,59,4.0,SSW,2,1005.2
3,2021-04-01,4764.18,LOW,18:30:00,16.0,18.6,2.6,3.8,0.0,SSE,...,8,SSE,11,1010.0,18.2,82,8.0,SSW,17,1011.0
4,2021-05-01,4800.64,LOW,17:00:00,15.9,19.1,11.2,1.0,0.0,SSE,...,8,SSE,13,1012.5,18.2,82,8.0,SSE,19,1013.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,2021-08-27,6769.89,HIGH,19:00:00,4.6,13.6,0.0,1.2,3.8,SSW,...,7,,Calm,1020.0,12.8,65,7.0,SSE,7,1017.4
239,2021-08-28,5716.32,MEDIUM,19:00:00,5.3,17.8,0.0,1.6,9.6,N,...,1,N,7,1018.6,17.4,31,3.0,NNW,24,1013.5
240,2021-08-29,6227.89,EXTREME,19:00:00,9.1,16.2,0.6,6.4,4.3,NNE,...,7,N,13,1011.4,12.8,84,7.0,S,6,1010.4
241,2021-08-30,6072.91,MEDIUM,18:30:00,6.4,17.6,4.0,1.4,7.4,NNW,...,7,N,15,1016.1,16.8,45,1.0,NNW,28,1013.2


In [175]:
'''
export df 'result' file to csv
'''
#df.to_csv('MAX_DAILY_DEMAND.csv')

Unnamed: 0,Date,TOTALDEMAND,PRICECATEGORY,Time,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
79,2021-03-21,4958.01,LOW,17:30:00,16.7,21.7,1.2,3.0,0.0,ESE,...,8,S,2,1022.6,21.5,86,8.0,,Calm,1020.0


In [176]:
# DATA imputation: cleaning 'Calm' 
df = df.replace(to_replace='Calm', value=0)
df.loc[[79]]

Unnamed: 0,Date,TOTALDEMAND,PRICECATEGORY,Time,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
79,2021-03-21,4958.01,LOW,17:30:00,16.7,21.7,1.2,3.0,0.0,ESE,...,8,S,2,1022.6,21.5,86,8.0,,0,1020.0


In [184]:
# DATA imputation: replacing null value with mean value
miss_mean_imputer = SimpleImputer(missing_values='NaN', strategy='mean')

miss_mean_imputer = miss_mean_imputer.fit(df[1:])

imputed_df = miss_mean_imputer.transform(df.values)

TypeError: float() argument must be a string or a number, not 'datetime.date'

In [166]:


print(df['PRICECATEGORY'].value_counts())

data = df[['Minimum temperature (°C)','Maximum temperature (°C)','Rainfall (mm)','Evaporation (mm)','Sunshine (hours)','Speed of maximum wind gust (km/h)',\
              '9am Temperature (°C)','9am wind speed (km/h)','3pm Temperature (°C)','3pm wind speed (km/h)']].astype(float)

classlabel = df['PRICECATEGORY']

chisqt = pd.crosstab()


MEDIUM     80
LOW        75
HIGH       50
EXTREME    38
Name: PRICECATEGORY, dtype: int64


TypeError: crosstab() missing 1 required positional argument: 'columns'

In [185]:
##randomly select 80% of the instances to be training and the rest to be testing

classlabel = df['PRICECATEGORY']
X_train, X_test, y_train, y_test = train_test_split(data,classlabel, train_size=0.8, random_state=42)

#normalise the data to have 0 mean and unit variance using the library functions.  This will help for later
#computation of distances between instances
scaler = preprocessing.StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [168]:
y_pred=knn.predict(X_test)
print(accuracy_score(y_test, y_pred))

NotFittedError: This KNeighborsClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.