# Load the two dataframes

In [162]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from sklearn import model_selection
from sklearn import metrics
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import normalized_mutual_info_score as nmi
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from scipy.stats import chi2
from scipy.stats import chi2_contingency
np.random.seed(10)

# Load the date in timedate format
price_demand = pd.read_csv('price_demand_data.csv', parse_dates=['SETTLEMENTDATE'], dayfirst=True)
weather = pd.read_csv('weather_data.csv', parse_dates=['Date'], dayfirst=True)
display(price_demand)
display(weather)

Unnamed: 0,REGION,SETTLEMENTDATE,TOTALDEMAND,PRICECATEGORY
0,VIC1,2021-01-01 00:30:00,4179.21,LOW
1,VIC1,2021-01-01 01:00:00,4047.76,LOW
2,VIC1,2021-01-01 01:30:00,3934.70,LOW
3,VIC1,2021-01-01 02:00:00,3766.45,LOW
4,VIC1,2021-01-01 02:30:00,3590.37,LOW
...,...,...,...,...
11659,VIC1,2021-08-31 22:00:00,4861.91,MEDIUM
11660,VIC1,2021-08-31 22:30:00,4748.74,MEDIUM
11661,VIC1,2021-08-31 23:00:00,4620.09,MEDIUM
11662,VIC1,2021-08-31 23:30:00,4834.00,MEDIUM


Unnamed: 0,Date,Minimum temperature (°C),Maximum temperature (°C),Rainfall (mm),Evaporation (mm),Sunshine (hours),Direction of maximum wind gust,Speed of maximum wind gust (km/h),Time of maximum wind gust,9am Temperature (°C),...,9am cloud amount (oktas),9am wind direction,9am wind speed (km/h),9am MSL pressure (hPa),3pm Temperature (°C),3pm relative humidity (%),3pm cloud amount (oktas),3pm wind direction,3pm wind speed (km/h),3pm MSL pressure (hPa)
0,2021-01-01,15.6,29.9,0.0,2.8,9.3,NNE,31.0,13:14,19.2,...,6,N,2,1018.8,28.1,43,5.0,E,13,1015.3
1,2021-01-02,18.4,29.0,0.0,9.4,1.3,NNW,30.0,8:22,23.3,...,7,NNW,17,1013.3,28.7,38,7.0,SW,4,1008.5
2,2021-01-03,17.0,26.2,12.6,4.8,7.1,WSW,33.0,17:55,18.3,...,8,WSW,4,1007.7,23.5,59,4.0,SSW,2,1005.2
3,2021-01-04,16.0,18.6,2.6,3.8,0.0,SSE,41.0,16:03,16.2,...,8,SSE,11,1010.0,18.2,82,8.0,SSW,17,1011.0
4,2021-01-05,15.9,19.1,11.2,1.0,0.0,SSE,35.0,11:02,17.2,...,8,SSE,13,1012.5,18.2,82,8.0,SSE,19,1013.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
238,2021-08-27,4.6,13.6,0.0,1.2,3.8,SSW,15.0,12:18,7.7,...,7,,Calm,1020.0,12.8,65,7.0,SSE,7,1017.4
239,2021-08-28,5.3,17.8,0.0,1.6,9.6,N,39.0,13:14,9.1,...,1,N,7,1018.6,17.4,31,3.0,NNW,24,1013.5
240,2021-08-29,9.1,16.2,0.6,6.4,4.3,NNE,33.0,1:50,10.6,...,7,N,13,1011.4,12.8,84,7.0,S,6,1010.4
241,2021-08-30,6.4,17.6,4.0,1.4,7.4,NNW,50.0,14:04,11.1,...,7,N,15,1016.1,16.8,45,1.0,NNW,28,1013.2


# Data preparation

# Data mining

In [163]:
# Replace PRICECATEGORY values with Numeric values in new column PRICE_NUMERIC
price_demand['PRICECATEGORY'].replace( {'LOW' : 0, 'MEDIUM' : 1, 'HIGH' : 2, 'EXTREME': 3 }, inplace=True)

# Extract Date and Time seperately from SETTLEMENTDATE in date time format
price_demand['Date'] = pd.to_datetime(price_demand['SETTLEMENTDATE'].dt.date)
price_demand['Time'] = price_demand['SETTLEMENTDATE'].dt.time

# Due to the special definition of a day in raw dataset, a day invloving two dates. 
# Each ordinal day involves every 48 rows of 30-minute time slots
price_demand['Ordinal_Day'] = price_demand.index // 48

## Finding the maximum daily energy usage

In [164]:
demand = price_demand

max_demand = demand.groupby('Ordinal_Day')['TOTALDEMAND'].transform(max) == demand['TOTALDEMAND']
max_demand = demand[max_demand].reset_index()
max_demand['Max_TOTALDEMAND'] = max_demand['TOTALDEMAND']
max_demand.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 243 entries, 0 to 242
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   index            243 non-null    int64         
 1   REGION           243 non-null    object        
 2   SETTLEMENTDATE   243 non-null    datetime64[ns]
 3   TOTALDEMAND      243 non-null    float64       
 4   PRICECATEGORY    243 non-null    int64         
 5   Date             243 non-null    datetime64[ns]
 6   Time             243 non-null    object        
 7   Ordinal_Day      243 non-null    int64         
 8   Max_TOTALDEMAND  243 non-null    float64       
dtypes: datetime64[ns](2), float64(2), int64(3), object(2)
memory usage: 17.2+ KB


## Finding the maximum daily price category

In [165]:
price = price_demand

# Have to group by ordinal day, otherwise the first day of following month will show up.
max_price = price.groupby('Ordinal_Day')['PRICECATEGORY'].max()
max_price = max_price.reset_index()
max_price['Max_PRICECATEGORY'] = max_price['PRICECATEGORY']
max_price = max_price.drop(columns = ['PRICECATEGORY'])

max_price


Unnamed: 0,Ordinal_Day,Max_PRICECATEGORY
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
238,238,3
239,239,1
240,240,3
241,241,2


# Merge all datasets

In [166]:
weather.set_index('Date')
max_demand.set_index('Ordinal_Day')
max_price.set_index('Ordinal_Day')

max_price_demand = pd.merge(max_demand, max_price, how='outer', on='Ordinal_Day')
all_dataset = pd.merge(max_price_demand,weather, how='outer', on='Date')

In [167]:
print(list(all_dataset))
#list(all_dataset)


['index', 'REGION', 'SETTLEMENTDATE', 'TOTALDEMAND', 'PRICECATEGORY', 'Date', 'Time', 'Ordinal_Day', 'Max_TOTALDEMAND', 'Max_PRICECATEGORY', 'Minimum temperature (°C)', 'Maximum temperature (°C)', 'Rainfall (mm)', 'Evaporation (mm)', 'Sunshine (hours)', 'Direction of maximum wind gust ', 'Speed of maximum wind gust (km/h)', 'Time of maximum wind gust', '9am Temperature (°C)', '9am relative humidity (%)', '9am cloud amount (oktas)', '9am wind direction', '9am wind speed (km/h)', '9am MSL pressure (hPa)', '3pm Temperature (°C)', '3pm relative humidity (%)', '3pm cloud amount (oktas)', '3pm wind direction', '3pm wind speed (km/h)', '3pm MSL pressure (hPa)']


In [168]:
dataset = all_dataset[['Date','Max_TOTALDEMAND', 'Max_PRICECATEGORY',\
                    'Minimum temperature (°C)', 'Maximum temperature (°C)',\
                    '3pm MSL pressure (hPa)', '3pm Temperature (°C)', '3pm cloud amount (oktas)', '3pm relative humidity (%)', '3pm wind speed (km/h)', \
                    '9am MSL pressure (hPa)', '9am Temperature (°C)', '9am cloud amount (oktas)', '9am relative humidity (%)', '9am wind speed (km/h)', \
                    'Evaporation (mm)', 'Rainfall (mm)', 'Speed of maximum wind gust (km/h)','Sunshine (hours)']]
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 242
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Date                               243 non-null    datetime64[ns]
 1   Max_TOTALDEMAND                    243 non-null    float64       
 2   Max_PRICECATEGORY                  243 non-null    int64         
 3   Minimum temperature (°C)           242 non-null    float64       
 4   Maximum temperature (°C)           242 non-null    float64       
 5   3pm MSL pressure (hPa)             242 non-null    float64       
 6   3pm Temperature (°C)               243 non-null    float64       
 7   3pm cloud amount (oktas)           242 non-null    float64       
 8   3pm relative humidity (%)          243 non-null    int64         
 9   3pm wind speed (km/h)              243 non-null    object        
 10  9am MSL pressure (hPa)             241

### Data Cleaning

In [169]:
# DATA imputation: convert 'Calm' to 0 for wind speed columns
dataset = dataset.replace(to_replace='Calm', value=0)

# Convert object to float for certain columns
dataset['9am wind speed (km/h)'] = dataset['9am wind speed (km/h)'].astype(np.float64)
dataset['3pm wind speed (km/h)'] = dataset['3pm wind speed (km/h)'].astype(np.float64)

In [170]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 242
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Date                               243 non-null    datetime64[ns]
 1   Max_TOTALDEMAND                    243 non-null    float64       
 2   Max_PRICECATEGORY                  243 non-null    int64         
 3   Minimum temperature (°C)           242 non-null    float64       
 4   Maximum temperature (°C)           242 non-null    float64       
 5   3pm MSL pressure (hPa)             242 non-null    float64       
 6   3pm Temperature (°C)               243 non-null    float64       
 7   3pm cloud amount (oktas)           242 non-null    float64       
 8   3pm relative humidity (%)          243 non-null    int64         
 9   3pm wind speed (km/h)              243 non-null    float64       
 10  9am MSL pressure (hPa)             241

In [171]:
dataset.describe()

Unnamed: 0,Max_TOTALDEMAND,Max_PRICECATEGORY,Minimum temperature (°C),Maximum temperature (°C),3pm MSL pressure (hPa),3pm Temperature (°C),3pm cloud amount (oktas),3pm relative humidity (%),3pm wind speed (km/h),9am MSL pressure (hPa),9am Temperature (°C),9am cloud amount (oktas),9am relative humidity (%),9am wind speed (km/h),Evaporation (mm),Rainfall (mm),Speed of maximum wind gust (km/h),Sunshine (hours)
count,243.0,243.0,242.0,242.0,242.0,243.0,242.0,243.0,243.0,241.0,242.0,243.0,242.0,242.0,243.0,241.0,240.0,243.0
mean,6137.080165,1.596708,11.050826,19.445868,1015.824793,18.040329,5.301653,56.930041,13.139918,1017.740664,13.720661,5.164609,74.454545,9.842975,3.902469,1.576763,34.4125,5.349383
std,818.153258,1.06499,3.870242,5.354085,7.435859,4.963547,2.392051,14.017376,6.248055,7.683402,4.306618,2.562778,14.177593,5.237129,2.702141,4.498754,10.909319,3.604902
min,4473.05,0.0,1.7,10.6,989.0,8.6,0.0,21.0,0.0,989.7,3.0,0.0,25.0,0.0,0.0,0.0,15.0,0.0
25%,5507.765,1.0,8.1,15.5,1011.0,14.4,3.0,48.0,9.0,1012.8,10.925,3.0,65.0,7.0,1.9,0.0,28.0,2.15
50%,6234.67,2.0,10.9,18.3,1015.75,17.1,7.0,56.0,13.0,1018.1,13.4,7.0,75.0,9.0,3.2,0.0,33.0,4.9
75%,6763.515,3.0,13.8,21.8,1021.6,20.15,7.0,66.0,17.0,1023.7,16.4,7.0,84.0,13.0,5.6,0.6,41.0,8.35
max,8196.83,3.0,22.2,39.2,1032.4,35.2,8.0,98.0,37.0,1034.2,30.9,8.0,100.0,30.0,13.8,43.2,67.0,13.1


In [172]:
dataset.isna().sum()

Date                                 0
Max_TOTALDEMAND                      0
Max_PRICECATEGORY                    0
Minimum temperature (°C)             1
Maximum temperature (°C)             1
3pm MSL pressure (hPa)               1
3pm Temperature (°C)                 0
3pm cloud amount (oktas)             1
3pm relative humidity (%)            0
3pm wind speed (km/h)                0
9am MSL pressure (hPa)               2
9am Temperature (°C)                 1
9am cloud amount (oktas)             0
9am relative humidity (%)            1
9am wind speed (km/h)                1
Evaporation (mm)                     0
Rainfall (mm)                        2
Speed of maximum wind gust (km/h)    3
Sunshine (hours)                     0
dtype: int64

In [173]:
# Fill NaN with mean value for numeric cells
dataset = dataset.fillna(round(dataset.mean(),1))

  dataset = dataset.fillna(round(dataset.mean(),1))


In [174]:
dataset.isna().sum()

Date                                 0
Max_TOTALDEMAND                      0
Max_PRICECATEGORY                    0
Minimum temperature (°C)             0
Maximum temperature (°C)             0
3pm MSL pressure (hPa)               0
3pm Temperature (°C)                 0
3pm cloud amount (oktas)             0
3pm relative humidity (%)            0
3pm wind speed (km/h)                0
9am MSL pressure (hPa)               0
9am Temperature (°C)                 0
9am cloud amount (oktas)             0
9am relative humidity (%)            0
9am wind speed (km/h)                0
Evaporation (mm)                     0
Rainfall (mm)                        0
Speed of maximum wind gust (km/h)    0
Sunshine (hours)                     0
dtype: int64

In [175]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 243 entries, 0 to 242
Data columns (total 19 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   Date                               243 non-null    datetime64[ns]
 1   Max_TOTALDEMAND                    243 non-null    float64       
 2   Max_PRICECATEGORY                  243 non-null    int64         
 3   Minimum temperature (°C)           243 non-null    float64       
 4   Maximum temperature (°C)           243 non-null    float64       
 5   3pm MSL pressure (hPa)             243 non-null    float64       
 6   3pm Temperature (°C)               243 non-null    float64       
 7   3pm cloud amount (oktas)           243 non-null    float64       
 8   3pm relative humidity (%)          243 non-null    int64         
 9   3pm wind speed (km/h)              243 non-null    float64       
 10  9am MSL pressure (hPa)             243

In [176]:
# Pairwise pearson r correlation matrix between different variables
corr = dataset[['Max_TOTALDEMAND', 'Max_PRICECATEGORY',\
                    'Minimum temperature (°C)', 'Maximum temperature (°C)',\
                    '3pm MSL pressure (hPa)', '3pm Temperature (°C)', '3pm cloud amount (oktas)', '3pm relative humidity (%)', '3pm wind speed (km/h)', \
                    '9am MSL pressure (hPa)', '9am Temperature (°C)', '9am cloud amount (oktas)', '9am relative humidity (%)', '9am wind speed (km/h)', \
                    'Evaporation (mm)', 'Rainfall (mm)', 'Speed of maximum wind gust (km/h)','Sunshine (hours)']].corr(method='pearson')

# corr = dataset[['Minimum temperature (°C)', 'Maximum temperature (°C)', 'Rainfall (mm)', \
#                 'Evaporation (mm)', 'Sunshine (hours)', 'Speed of maximum wind gust (km/h)' \
#                 ]].corr(method='pearson')
# corr = dataset[['Minimum temperature (°C)', 'Maximum temperature (°C)', 'Evaporation (mm)', \
#                   '9am Temperature (°C)', '3pm Temperature (°C)' ]].corr(method='pearson')

corr

Unnamed: 0,Max_TOTALDEMAND,Max_PRICECATEGORY,Minimum temperature (°C),Maximum temperature (°C),3pm MSL pressure (hPa),3pm Temperature (°C),3pm cloud amount (oktas),3pm relative humidity (%),3pm wind speed (km/h),9am MSL pressure (hPa),9am Temperature (°C),9am cloud amount (oktas),9am relative humidity (%),9am wind speed (km/h),Evaporation (mm),Rainfall (mm),Speed of maximum wind gust (km/h),Sunshine (hours)
Max_TOTALDEMAND,1.0,0.68976,-0.489533,-0.289752,-0.002709,-0.324949,0.06917,0.063297,-0.041129,0.055153,-0.391435,-0.168528,0.101348,0.115612,-0.266111,-0.074812,0.080592,-0.137979
Max_PRICECATEGORY,0.68976,1.0,-0.598147,-0.510314,0.186315,-0.511981,0.077795,0.20266,-0.223119,0.217823,-0.563436,-0.123949,0.331852,-0.031631,-0.487292,0.014153,-0.187746,-0.25042
Minimum temperature (°C),-0.489533,-0.598147,1.0,0.707525,-0.278096,0.661579,0.062062,-0.055011,-0.022712,-0.32602,0.915726,0.206426,-0.333646,-0.003147,0.655091,0.04335,0.055604,0.081852
Maximum temperature (°C),-0.289752,-0.510314,0.707525,1.0,-0.17845,0.965177,-0.238622,-0.456635,-0.028451,-0.101658,0.820029,-0.17009,-0.309292,-0.073599,0.620727,-0.124851,-0.05615,0.469026
3pm MSL pressure (hPa),-0.002709,0.186315,-0.278096,-0.17845,1.0,-0.130976,-0.225539,-0.031038,-0.19454,0.953812,-0.277609,-0.121659,0.138686,-0.260846,-0.254218,-0.056866,-0.426542,0.132126
3pm Temperature (°C),-0.324949,-0.511981,0.661579,0.965177,-0.130976,1.0,-0.27448,-0.543138,0.006479,-0.039179,0.761603,-0.181595,-0.261369,-0.087414,0.560214,-0.126947,-0.097792,0.487546
3pm cloud amount (oktas),0.06917,0.077795,0.062062,-0.238622,-0.225539,-0.27448,1.0,0.342132,-0.056507,-0.287912,-0.00327,0.392265,0.03122,0.079501,-0.095794,0.111438,0.135828,-0.722236
3pm relative humidity (%),0.063297,0.20266,-0.055011,-0.456635,-0.031038,-0.543138,0.342132,1.0,-0.197626,-0.143348,-0.15713,0.424203,0.402234,-0.18858,-0.208511,0.194572,-0.083754,-0.550315
3pm wind speed (km/h),-0.041129,-0.223119,-0.022712,-0.028451,-0.19454,0.006479,-0.056507,-0.197626,1.0,-0.136046,0.02296,0.002685,-0.23155,0.396611,0.022986,0.047473,0.674301,0.09135
9am MSL pressure (hPa),0.055153,0.217823,-0.32602,-0.101658,0.953812,-0.039179,-0.287912,-0.143348,-0.136046,1.0,-0.285476,-0.198445,0.142855,-0.253368,-0.272172,-0.13861,-0.414861,0.206701


### 1st model: Predict max daily energy demand on weather
### LINEAR REGRESSION MODEL

In [177]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import linear_model

# Choose the features (input)
X = dataset[['Minimum temperature (°C)', 'Maximum temperature (°C)', 'Evaporation (mm)', '9am Temperature (°C)', '3pm Temperature (°C)']]

# What we have to predict (output)
Y = dataset['Max_TOTALDEMAND']

# Splitting the data into training set and testing set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.8, random_state = 1)

# Generate the regression model (import .linear_model to use .LinearRegression() func)
lm = linear_model.LinearRegression()

# Create/ Produce the model: Fit training data into model
model = lm.fit(X_train, Y_train)
display(model)

# Predict coefficient & intercept from linear regression model(lm)
print(lm.coef_, lm.intercept_)

[-179.74679919  138.97436703   -7.95064206   67.8489649  -137.16496524] 7061.4147822456125


In [178]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


X= dataset[['Minimum temperature (°C)','Maximum temperature (°C)','Evaporation (mm)','Sunshine (hours)','Speed of maximum wind gust (km/h)',\
          '9am Temperature (°C)','9am wind speed (km/h)','9am MSL pressure (hPa)','3pm Temperature (°C)',\
          '3pm wind speed (km/h)','3pm MSL pressure (hPa)']].astype(float)

y = dataset['Max_TOTALDEMAND']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# partial code here...
lm = linear_model.LinearRegression()
model = lm.fit(X_train, y_train)
y_test_predictions = lm.predict(X_test)
print('actual TOTALDEMAND values of the first 5 test data:')
print(y_test[0:5])
print('')
print('predicted TOTALDEMAND values of the first 5 test data:')
print(y_test_predictions[0:5])
print('')

# coefficients

print('Coefficients: ', end = ' ')
print(lm.coef_)
print('')

# intercept:
print('Intercept: ', end = ' ')
print(lm.intercept_)
print('')

# R^2
r2_test = lm.score(X_test, y_test)
r2_train = lm.score(X_train, y_train)

print('Coefficient of determination (test): {0:.2f}'.format(r2_test))
print('Coefficient of determination (training): {0:.2f}'.format(r2_train))

actual TOTALDEMAND values of the first 5 test data:
46     6816.18
225    6049.06
180    6648.79
116    6487.76
124    6389.08
Name: Max_TOTALDEMAND, dtype: float64

predicted TOTALDEMAND values of the first 5 test data:
[5550.47820681 6624.94305158 6643.5714578  5903.74863225 6283.67700031]

Coefficients:  [-141.56394485  108.99722      16.42491137  -27.37004129    4.89750583
    0.49282419   14.34326117   51.76216512 -101.01358225  -18.63543029
  -56.665091  ]

Intercept:  12268.297803043512

Coefficient of determination (test): 0.10
Coefficient of determination (training): 0.40


#### Access how regression model is doing

In [179]:
from sklearn.metrics import mean_squared_error, r2_score
r2_test = lm.score(X_test, Y_test) #on testing data
print(r2_test)

-0.4207872123686436


### 2nd model: Predicts Max Daily Price: CLASSIFICATION: KNN

In [180]:
import pandas as pd
from sklearn.model_selection import train_test_split # For splitting
from sklearn.metrics import accuracy_score # To check accuracy of the prediction
from sklearn import neighbors # To produce/generate KNeighborsClassifier
from sklearn import preprocessing # To scale/normalise the features

# Select features (input)
features = dataset[['Minimum temperature (°C)', 'Maximum temperature (°C)', 'Evaporation (mm)', 'Sunshine (hours)','9am Temperature (°C)', '3pm Temperature (°C)' ]]

# What we want to predict (output)
classlabel = dataset['Max_PRICECATEGORY']

# Splitting
features_train, features_test, class_train, class_test = train_test_split(features, classlabel, train_size = 0.8, random_state = 1)

# Scale/Normalize the features
scaler = preprocessing.StandardScaler().fit(features_train) 
features_train = scaler.transform(features_train) 
features_test = scaler.transform(features_test)

# Generating KNN classifier model & import neighbors from sklearn library
knn = neighbors.KNeighborsClassifier(n_neighbors = 5)

# Creating model: Fitting features & classlabel in training data set
knn.fit(features_train, class_train)

# Produce predictions & check its .accuracy_score() on testing data set
predictions = knn.predict(features_test)
print(accuracy_score(class_test, predictions))


0.5714285714285714


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### 2nd model: Predict max daily price: CLASSIFICATION: DECISION TREES

In [181]:
import pandas as pd
from sklearn.model_selection import train_test_split # For splitting
from sklearn.tree import DecisionTreeClassifier # For generating the model
from sklearn.metrics import accuracy_score # To check accuracy of the prediction
from sklearn import preprocessing

# Select features (input)
features = dataset[['Minimum temperature (°C)', 'Maximum temperature (°C)', 'Evaporation (mm)', 'Sunshine (hours)','9am Temperature (°C)', '3pm Temperature (°C)']]

# What we want to predict (output)
classlabel = dataset['Max_PRICECATEGORY']

# Splitting
features_train, features_test, class_train, class_test = train_test_split(features, classlabel, train_size = 0.8, random_state = 1)

# scaling/Normalizing the values
scaler = preprocessing.StandardScaler().fit(features_train)
features_train = scaler.transform(features_train)
features_test = scaler.transform(features_test)

#  Generating the decision tree model
dt = DecisionTreeClassifier(criterion = 'entropy', random_state = 1)

# Create tree = Fitting features and classlabel into the model
dt.fit(features_train, class_train)

# Check the accuracy_score of the prediction
predictions = dt.predict(features_test) # predictions based on testing set
print(accuracy_score(class_test, predictions)) # report how well these predictiosn based on acc_score

0.3877551020408163


## Chi Squared

In [182]:
import scipy.stats as stats

data = pd.DataFrame(np.array([[1,1,1],[1,0,1],[0,1,0],[0,0,0]]), \
            columns=['Minimum temperature (°C)','Maximum temperature (°C)','Max_PRICECATEGORY'])
features=data[['Minimum temperature (°C)','Maximum temperature (°C)']]
class_label = data['Max_PRICECATEGORY']
cont_table = pd.crosstab(class_label,features['Maximum temperature (°C)'])
chi2_val, p, dof, expected = stats.chi2_contingency(cont_table.values, correction=False)
print('Chi2 value: ',chi2_val)
if(p<0.05) : 
    print('Null hypothesis rejected, p value: ', p)
else :
    print('Null hypothesis accepted, p value: ', p)

Chi2 value:  0.0
Null hypothesis accepted, p value:  1.0


In [183]:
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif

dt = dataset[['Minimum temperature (°C)','Maximum temperature (°C)','Rainfall (mm)',\
            'Evaporation (mm)','Sunshine (hours)','Speed of maximum wind gust (km/h)',\
            '9am Temperature (°C)','9am relative humidity (%)'\
           ,'9am cloud amount (oktas)','9am wind speed (km/h)','9am MSL pressure (hPa)',\
            '3pm Temperature (°C)','3pm relative humidity (%)','3pm cloud amount (oktas)',\
            '3pm wind speed (km/h)','3pm MSL pressure (hPa)']]
cl = dataset['Max_PRICECATEGORY']

X_train, X_test, y_train, y_test = train_test_split(dt,cl, train_size =0.66, random_state = 42)


# Instantiate
feature_selector = SelectKBest(chi2, k=3)

# Perform selection
X_train = feature_selector.fit_transform(X_train, y_train)
X_test = feature_selector.transform(X_test)


#Scale the data
scaler = preprocessing.StandardScaler().fit(X_train)
X_train=scaler.transform(X_train)
X_test=scaler.transform(X_test)

#Impute missing values via mean imputation
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)
    
#Train k-nn classifier
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
dt = DecisionTreeClassifier(criterion="entropy", max_depth = 5)
    
# STEP 2: Fitting data / Training
knn.fit(X_train, y_train)
dt.fit(X_train, y_train)

# STEP 3: Prediction / Test
y_pred=knn.predict(X_test)
y_pred_dt = dt.predict(X_test)
    
# STEP 4: Eval
acc_score.append(accuracy_score(y_test, y_pred))
acc_score_dt.append(accuracy_score(y_test, y_pred_dt))
    
print(acc_score)
#Display average of accuracy scores
avg_acc_score = sum(acc_score)/k
print(avg_acc_score)



print (acc_score_dt)
avg_acc_score_dt = sum(acc_score_dt)/k
print(avg_acc_score_dt)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


NameError: name 'acc_score' is not defined

## PCA

In [None]:
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier

data = dt

classlabel = dataset['Max_TOTALDEMAND']

# if too much k-fold, e.g. k=100, too much computational power 
k=5

# Initiation
kf = KFold(n_splits=k, shuffle=True, random_state=200) # change random_state = 125 and run next cell sum(...)/k

acc_score = []
acc_score_dt = []

for train_index, test_index in kf.split(data):
    #Perform the split for this fold
    X_train, X_test = data.iloc[train_index, :], data.iloc[test_index, :]
    y_train, y_test = classlabel[train_index], classlabel[test_index]
    
    
    # STEP 0
    scaler = preprocessing.StandardScaler()
    X_train=scaler.fit_transform(X_train)
    X_test=scaler.transform(X_test)

    imputer = SimpleImputer()
    X_train = imputer.fit_transform(X_train)
    X_test = imputer.transform(X_test)
    
    # Instantiate
    # Option 1: SelectKBest method
    feature_selector = SelectKBest(mutual_info_classif, k=15)
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)

    # Option 2: PCA
    pca = PCA(n_components=15)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    # STEP 1
    knn = neighbors.KNeighborsClassifier(n_neighbors=5) #  try n_neighbors 5 vs 3
    dt = DecisionTreeClassifier(criterion="entropy", max_depth=5)

    # STEP 2: Fitting data / Training
    knn.fit(X_train, y_train)
    dt.fit(X_train, y_train)

    # STEP 3: Prediction / Test
    y_pred=knn.predict(X_test)
    y_pred_dt = dt.predict(X_test)
    
    # STEP 4: Eval
    acc_score.append(accuracy_score(y_test, y_pred))
    acc_score_dt.append(accuracy_score(y_test, y_pred_dt))