# Import Libraries

In [3]:
# import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

# Table of Contents

* [Import Datasets](#importdata)
* [Indexing](#index)
* [Data Exploration](#explore)
* [Clean Data](#clean)
* [List of Customers](#column)


# Import Datasets <a class="anchor" id="importdata"></a>

The two main datasets are "Levering" and "Teruglevering", which are respectively consumption and production.

In [90]:
# Load the raw CSV exports (both are semicolon-separated).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the
# DtypeWarning that comes with them.
consumption = pd.read_csv("Raw Data/Levering.csv", sep=';', low_memory=False)
# For production the first line of the file is skipped and the first remaining
# column is used as the row index.
production = pd.read_csv("Raw Data/Teruglevering.csv", sep=';', index_col=0,
                         skiprows=1, low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


# Indexing <a class="anchor" id="index"></a>
The index should show the date and time. Each row is one measurement, taken every 15 minutes.

In [91]:
# Parse the 'datetime' column into pandas datetime values.
# Bracket indexing is used because attribute-style column assignment is
# fragile: it only works for columns that already exist and whose name does
# not clash with a DataFrame attribute or method.
consumption['datetime'] = pd.to_datetime(consumption['datetime'])


In [126]:
# Make the timestamps the row index.
# The guard keeps this cell safe to re-run: once 'datetime' is the index the
# column no longer exists, and calling set_index again would raise a KeyError.
# A genuinely missing column still raises, because the guard only skips the
# call when the index is already named 'datetime'.
if consumption.index.name != 'datetime':
    consumption = consumption.set_index('datetime')

In [92]:
# Preview the first five rows of the consumption data (head() defaults to n=5).
consumption.head()

Unnamed: 0,datetime,SOM,Klant 1,Klant 2,Klant 3,Klant 4,Klant 5,Klant 6,Klant 8,Klant 9,...,Klant 74,Klant 76,Klant 77,Klant 78,Klant 79,Klant 80,Klant 81,Klant 82,leverende klanten,niet leverenden
0,2013-01-01 00:00:00,9076.0,71.0,188.0,,31.0,61.0,138.0,187.0,194.0,...,29.0,60.0,126.0,57.0,223.0,203.0,,,#WAARDE!,#WAARDE!
1,2013-01-01 00:15:00,8962.0,58.0,176.0,,13.0,64.0,147.0,169.0,152.0,...,15.0,72.0,114.0,30.0,257.0,191.0,,,1541,7460
2,2013-01-01 00:30:00,9503.0,57.0,180.0,,16.0,43.0,129.0,173.0,133.0,...,25.0,77.0,112.0,32.0,179.0,162.0,,,1584,7455
3,2013-01-01 00:45:00,8789.0,78.0,224.0,,14.0,29.0,140.0,202.0,140.0,...,64.0,66.0,102.0,29.0,216.0,222.0,,,1339,8016
4,2013-01-01 01:00:00,8806.0,63.0,211.0,,17.0,30.0,147.0,200.0,137.0,...,79.0,71.0,53.0,31.0,209.0,211.0,,,1485,7379


In [45]:
# Preview the first five rows of the production data (head() defaults to n=5).
production.head()

Unnamed: 0_level_0,SOM,Klant 1,Klant 2,Klant 3,Klant 4,Klant 5,Klant 6,Klant 8,Klant 9,Klant 10,...,Klant 73,Klant 74,Klant 76,Klant 77,Klant 78,Klant 79,Klant 80,Klant 81,Klant 82,terleveraars
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-1-2013 0:00,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1-1-2013 0:15,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1-1-2013 0:30,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1-1-2013 0:45,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1-1-2013 1:00,0,0,0,0,0,0,0.0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data exploration <a class="anchor" id="explore"></a>

### Consumption dataset

1. Approximately 50% of the rows are NaN for every customer (column).
2. This is mainly because the Excel file contains only NaNs from row 35042 onwards.


### Production dataset

In [66]:
#consumption.info()

In [93]:
# Number of missing values in each column.
consumption.isna().sum(axis=0)

datetime             34992
SOM                  34992
Klant 1              35558
Klant 2              35089
Klant 3              36243
                     ...  
Klant 80             38377
Klant 81             35463
Klant 82             38097
leverende klanten    34992
niet leverenden      34992
Length: 81, dtype: int64

In [67]:
# pd.DataFrame(production.isna().sum())

# Clean Data <a class="anchor" id="clean"></a>
## Consumption
1. Remove empty rows (rows that are empty for all columns)
2. Divide into summer and winter

In [127]:
# Keep only the first 35040 rows; there is no data after that point.
# 35040 = 365 days x 96 quarter-hour measurements per day.
consumption = consumption.iloc[:365 * 96]

In [128]:
# Missing values per column, largest first.
# Open question: should customers above some NaN threshold be removed?
(
    consumption
    .isna()
    .sum()
    .sort_values(ascending=False)
)

Klant 59    12246
Klant 63    11782
Klant 69     7722
Klant 47     6977
Klant 55     5329
            ...  
Klant 39        0
Klant 38        0
Klant 37        0
Klant 36        0
SOM             0
Length: 80, dtype: int64

In [135]:
# Note: between_time selects on the time of day only; it cannot select by date.
time_slice = consumption.between_time(start_time='00:15:00', end_time='00:30:00')


In [140]:
# Summer: 20 June up to and including 22 September.
# With 96 rows per day, 20 June is day offset 170 from 1 January (row 16320)
# and 23 September is day offset 265 (row 25440, excluded by the slice).
# Original note "16322 = 16320 - 25441": presumably the matching spreadsheet
# row numbers -- TODO confirm.
consumption_summer = consumption.iloc[170 * 96:265 * 96]

In [155]:
# Winter: 21 December up to and including 20 March, assembled from the two
# ends of the calendar year (96 rows per day).
# 1 January - 20 March: rows 0 up to (not including) 7584 = 79 * 96.
winter_jfm = consumption.iloc[:79 * 96]
# 21 December - 31 December: rows 33984 (= 354 * 96) up to 35040 (= 365 * 96).
winter_dec = consumption.iloc[354 * 96:365 * 96]
# Stack the December part first, followed by January-March.
consumption_winter = pd.concat([winter_dec, winter_jfm])

# List of customers <a class="anchor" id="column"></a>

Four different kinds of customers:
1. Customer numbers that do not exist in the dataset (in other words, the column is not included)
2. Customers with a lot of NaNs
3. Customers without solar panels (no production)
4. Customers with solar panels (with production)

In [None]:
# Customer numbers that have no column in the dataset.
cust_not_exist = [f'Klant {number}' for number in (7, 12, 54, 56, 75)]
# Customers treated as having no usable data.
cust_no_data = [f'Klant {number}' for number in (59,)]

In [None]:
# Column names of the customers listed as prosumers.
prosumers = [
    f'Klant {number}'
    for number in (1, 21, 30, 47, 48, 55, 63, 69, 79, 80)
]