# SVM implementation for Number of Persons with KNMI data (per day, based on consumption)
# Model score: 36.1%

This script implements a Support Vector Machine (SVM) for 2018, where the number of persons living in a dwelling is predicted (target), based on the total consumption per day and the daily mean KNMI weather data (features). Production is also summed per day, but only consumption is used as a feature.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import date, datetime
from datetime import timedelta
from pandas.plotting import register_matplotlib_converters

# Let matplotlib handle pandas datetime values and switch on seaborn's default theme
register_matplotlib_converters()
sns.set()

# Input files (paths are relative to this notebook)
# Alternative energy file, currently not used: '../Data/Final_data.cleaned.csv'
file_name = '../../Data/Final_data_cleaned_missing_houses.csv'
file_name_2 = '../../Emilio Caba/houses_info.csv'
file_name_3 = '../../Data/KNMI_Voorschoten_20170711_20190601.csv'

# Energy measurements: the "ID-nummer" column is renamed to "Index",
# used as the row index and parsed into datetimes
df = (
    pd.read_csv(file_name)
    .rename(columns={"ID-nummer" : "Index"})
    .set_index('Index')
)
df.index = pd.to_datetime(df.index)

# House metadata and KNMI weather observations are loaded unmodified here
df_houses_info = pd.read_csv(file_name_2)
KNMI_dataset = pd.read_csv(file_name_3)

df.head()

Unnamed: 0_level_0,H01_prod,H02_prod,H03_prod,H04_prod,H06_prod,H07_prod,H08_prod,H09_prod,H11_prod,H13_prod,...,H23_cons,H24_cons,H25_cons,H26_cons,H27_cons,H28_cons,H29_cons,H31_cons,H32_cons,H33_cons
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-09-12 00:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.044,0.07,0.021,0.298,0.096,0.025,0.024,0.299,0.027,0.022
2017-09-12 00:30:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.033,0.061,0.025,0.706,0.152,0.018,0.028,0.325,0.021,0.042
2017-09-12 00:45:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038,0.067,0.025,0.211,0.265,0.022,0.015,0.341,0.029,0.035
2017-09-12 01:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.038,0.062,0.025,0.042,0.062,0.019,0.031,0.35,0.029,0.038
2017-09-12 01:15:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.207,0.059,0.024,0.048,0.219,0.021,0.279,0.397,0.022,0.026


In [2]:
# Filter the dataset on 2018 data only (because we have all the data for 2018)
import datetime as dt
df_2018 = df[df.index.year == 2018]

# Sum the production and consumption per house per calendar day.
# The houses are derived from the '<house>_cons' columns instead of being typed out by hand,
# so this cell keeps working when houses are added to or removed from the input file.
house_ids = [column[:-len('_cons')] for column in df_2018.columns if column.endswith('_cons')]

# Interleave the columns as <house>_prod, <house>_cons (same layout as the original daily table).
# A house without a matching '<house>_prod' column raises a KeyError below.
daily_columns = [house_id + suffix for house_id in house_ids for suffix in ('_prod', '_cons')]

df_per_day_2018 = df_2018.groupby(df_2018.index.date)[daily_columns].sum()

df_per_day_2018.head()

Unnamed: 0,H01_prod,H01_cons,H02_prod,H02_cons,H03_prod,H03_cons,H04_prod,H04_cons,H06_prod,H06_cons,...,H28_prod,H28_cons,H29_prod,H29_cons,H31_prod,H31_cons,H32_prod,H32_cons,H33_prod,H33_cons
2018-01-04,0.214,11.276,0.077,26.411,0.111,9.274,0.015,9.319,0.139,8.125,...,0.279,35.109,0.006,37.625,0.145,10.887,0.09,48.863,0.138,11.057
2018-01-05,0.728,17.483,0.244,24.131,0.065,9.31,0.26,10.056,0.103,8.942,...,0.38,28.879,0.558,11.688,0.139,16.713,0.116,26.957,0.048,10.153
2018-01-06,1.155,15.07,0.373,34.901,0.318,9.694,0.349,9.592,0.323,12.124,...,0.934,35.355,0.0,34.097,0.231,16.668,0.287,37.575,0.235,9.887
2018-01-07,3.029,23.743,0.894,28.495,0.722,12.415,0.743,11.135,1.396,27.633,...,1.132,60.706,1.687,18.647,2.027,10.467,1.86,32.995,1.289,11.329
2018-01-08,0.867,35.529,0.186,35.917,0.876,10.059,0.538,13.932,0.649,9.214,...,0.757,58.092,0.952,19.796,1.476,15.804,1.427,41.457,0.6,8.658


In [3]:
# Use the observation timestamp as a datetime row index
KNMI_dataset = KNMI_dataset.set_index(['Date_and_time'])
KNMI_dataset.index = pd.to_datetime(KNMI_dataset.index)

# Force column 'N' to numeric; entries that cannot be parsed become NaN
KNMI_dataset = KNMI_dataset.assign(N=pd.to_numeric(KNMI_dataset['N'], errors='coerce'))
KNMI_dataset.head()

Unnamed: 0_level_0,T,SQ,Q,N
Date_and_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-07-11 01:00:00,16.6,0,0,8.0
2017-07-11 02:00:00,16.7,0,0,6.0
2017-07-11 03:00:00,16.9,0,0,7.0
2017-07-11 04:00:00,16.7,0,2,7.0
2017-07-11 05:00:00,17.0,2,13,6.0


In [4]:
# Keep only the 2018 weather observations
KNMI_dataset_2018 = KNMI_dataset[pd.to_datetime(KNMI_dataset.index).year == 2018]

import datetime as dt
# Daily mean of every weather variable, indexed by calendar date
KNMI_dataset_per_day_2018 = KNMI_dataset_2018.groupby(pd.to_datetime(KNMI_dataset_2018.index).date).agg(
     {'T': 'mean',
      'SQ': 'mean',
      'Q': 'mean',
      'N': 'mean'})

# Keep only the days that are also present in the energy dataset.
# A single boolean mask replaces the row-by-row drop: it does not mutate the frame while
# iterating over it, and it compares the date labels directly instead of converting them
# to Timestamps first.
has_energy_data = KNMI_dataset_per_day_2018.index.isin(df_per_day_2018.index)
KNMI_dataset_per_day_2018 = KNMI_dataset_per_day_2018[has_energy_data]

print('Size of df_per_day_2018: ' + str(len(df_per_day_2018)))
print('Size of KNMI_dataset_per_day_2018: ' + str(len(KNMI_dataset_per_day_2018)))

Size of df_per_day_2018: 348
Size of KNMI_dataset_per_day_2018: 348


In [5]:
# Give the three metadata rows readable labels instead of their positions 0, 1 and 2
row_labels = {0:'Heating system', 1:'Solar panels', 2:'Persons'}
df_houses_info = df_houses_info.rename(index=row_labels)
df_houses_info.head()

Unnamed: 0,H01,H02,H03,H04,H05,H06,H07,H08,H09,H10,...,H24,H25,H26,H27,H28,H29,H30,H31,H32,H33
Heating system,E,E,WP,WP,WP,E,E,E,E,Zon,...,WP,Zon,WP,WP,Zon,E,E,WP,Zon,WP
Solar panels,17,14,9,11,12,14,13,9,14,13,...,11,13,12,9,13,14,14,15,12,10
Persons,4,2,4,1,4,4,2,4,1,3,...,2,2,2,3,2,1,4,2,4,1


In [7]:
# Build one long table in "house-major" order: all days of the first house, then all days
# of the second house, and so on. Each result is wrapped in a one-element list because the
# next cell reads cons_list[0], T_list[0], etc.
cons_columns = [column for column in df_per_day_2018.columns if '_cons' in column]
number_of_days = len(df_per_day_2018)
number_of_houses = len(cons_columns)

# The weather rows are repeated per house, so they must line up with the energy rows
if len(KNMI_dataset_per_day_2018) != number_of_days:
    raise ValueError('KNMI_dataset_per_day_2018 and df_per_day_2018 must cover the same days')

# Daily consumption of every house, one house after the other
cons_list = [np.concatenate([df_per_day_2018[column].values for column in cons_columns], axis=None)]

# Row position 2 of df_houses_info holds the number of persons; the house id is the column
# name without its '_cons' suffix. Each value is repeated once per day of that house.
persons_per_house = [df_houses_info[column[:-5]].iloc[2] for column in cons_columns]
persons_column = np.repeat(np.array(persons_per_house, dtype=object), number_of_days).reshape(-1, 1)
number_of_persons_list = [persons_column]

# The same daily weather values apply to every house, so repeat the full series per house
T_list = [np.tile(KNMI_dataset_per_day_2018['T'].values, number_of_houses)]
SQ_list = [np.tile(KNMI_dataset_per_day_2018['SQ'].values, number_of_houses)]
Q_list = [np.tile(KNMI_dataset_per_day_2018['Q'].values, number_of_houses)]
N_list = [np.tile(KNMI_dataset_per_day_2018['N'].values, number_of_houses)]

print('Length of cons_list: ' + str(len(cons_list[0])))
print('Length of number_of_persons_list: ' + str(len(number_of_persons_list[0])))
print('Length of T_list: ' + str(len(T_list[0])))
print('Length of SQ_list: ' + str(len(SQ_list[0])))
print('Length of Q_list: ' + str(len(Q_list[0])))
print('Length of N_list: ' + str(len(N_list[0])))

# Flat views of the persons column: first as one-element rows, then as plain values
flattened_number_of_persons_list = list(persons_column)
flattened2_number_of_persons_list = persons_column.ravel().tolist()

Length of cons_list: 9744
Length of number_of_persons_list: 9744
Length of T_list: 9744
Length of SQ_list: 9744
Length of Q_list: 9744
Length of N_list: 9744


In [8]:
# Assemble the model table: one row per house per day.
# The mapping is deliberately not named `dict`, which would shadow the builtin for the rest of the session.
svm_columns = {
    'Consumption': cons_list[0],
    'Number_of_Persons': flattened2_number_of_persons_list,
    'T': T_list[0],
    'SQ': SQ_list[0],
    'Q': Q_list[0],
    'N': N_list[0],
}
df_for_SVM = pd.DataFrame(svm_columns)
print(df_for_SVM.head())

   Consumption Number_of_Persons         T        SQ          Q         N
0       11.276                 4  8.233333  0.083333   5.083333  6.791667
1       17.483                 4  6.187500  0.083333   4.541667  6.333333
2       15.070                 4  4.654167  0.291667   8.291667  8.000000
3       23.743                 4  1.587500  2.708333  18.333333  2.083333
4       35.529                 4  0.720833  2.375000  15.166667  3.083333


In [9]:
# Support Vector Machine that predicts the number of persons in a dwelling (target)
# from the daily consumption and the daily mean KNMI weather variables (features)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Split the data into a training set (80%) and a test set (20%); the seed keeps the split reproducible
X = df_for_SVM.drop(['Number_of_Persons'], axis='columns')
y = df_for_SVM['Number_of_Persons']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

# Train the model
# NOTE(review): the features are on different scales and are fed to the SVM unscaled;
# consider standardising them first. Left unchanged so the reported score stays valid.
consumption_SVM = SVC(gamma='auto')
consumption_SVM.fit(X_train, y_train)

# Predict the test set once and reuse the predictions for both the score and the report
y_pred = consumption_SVM.predict(X_test)

# Print the score of the model (accuracy on the test set)
print('Score of the model: ' + str(round(accuracy_score(y_test, y_pred) * 100, 1)) + '%')

print() #Print blank line

# Print out a classification report
print(classification_report(y_test, y_pred))

Score of the model: 36.1%

              precision    recall  f1-score   support

           1       0.33      0.28      0.30       492
           2       0.36      0.44      0.40       696
           3       0.00      0.00      0.00       113
           4       0.38      0.40      0.39       648

   micro avg       0.36      0.36      0.36      1949
   macro avg       0.27      0.28      0.27      1949
weighted avg       0.34      0.36      0.35      1949



  'precision', 'predicted', average, warn_for)
