# Purpose of the Notebook:

In this notebook, samples of simulated live data are generated.
These samples can be used in later notebooks to check whether the forecasting algorithm copes with every possible situation.

# Create JSON LiveData

In [1]:
# import
import json
import pandas as pd
import numpy as np
import orga_functions as org

from X1_DataPreparation import change_column
from X1_DataPreparation import og_column_list
from X1_DataPreparation import og_int_column_list

In [2]:
# load the feature codebook (a semicolon-delimited CSV) into a DataFrame
path = org.path("02_AlleFeatureList.csv")
feature_df = pd.read_csv(path, delimiter=";")

## FeatureList DataPreparation

In [3]:
# show the codebook as loaded, before the names column is rewritten
feature_df

Unnamed: 0,names,mean,25%,50%,75%,min,max
0,co_gt,2.168604,1.1,1.8,2.9,0.1,5.6
1,pt08_s1_co,1098.303412,932.0,1061.0,1234.0,647.0,1687.0
2,nmhc_gt,218.607666,66.0,145.0,297.0,0.1,643.5
3,c6h6_gt,10.458205,4.7,8.6,14.4,0.1,28.95
4,pt08_s2_nmhc,953.579453,749.0,925.0,1130.0,749.0,1701.5
5,nox_gt,232.360072,89.0,164.0,303.75,0.1,625.875
6,pt08_s3_nox,848.762615,672.0,818.0,984.0,204.0,1452.0
7,no2_gt,106.251831,73.0,103.0,132.0,0.1,220.5
8,pt08_s4_no2,1507.352834,1305.0,1508.0,1707.0,702.0,2310.0
9,pt08_s5_o3,1024.200026,737.0,962.0,1272.0,253.0,2074.5


#### Original Names

In [4]:
# list of the original (raw) column names, as provided by X1_DataPreparation.og_column_list
original_names = og_column_list()

In [5]:
# translate the names in the codebook back to their original/raw spelling
def og_name(name):
    """Return the raw column name whose ``change_column`` result equals ``name``.

    The first matching entry of ``original_names`` wins; if none matches,
    ``name`` is handed back unchanged.
    """
    matches = (raw for raw in original_names if change_column(raw) == name)
    return next(matches, name)

feature_df.names = list(map(og_name, feature_df.names))

In [6]:
# show the codebook again to check the rewritten names column
feature_df

Unnamed: 0,names,mean,25%,50%,75%,min,max
0,CO(GT),2.168604,1.1,1.8,2.9,0.1,5.6
1,PT08.S1(CO),1098.303412,932.0,1061.0,1234.0,647.0,1687.0
2,NMHC(GT),218.607666,66.0,145.0,297.0,0.1,643.5
3,C6H6(GT),10.458205,4.7,8.6,14.4,0.1,28.95
4,PT08.S2(NMHC),953.579453,749.0,925.0,1130.0,749.0,1701.5
5,NOx(GT),232.360072,89.0,164.0,303.75,0.1,625.875
6,PT08.S3(NOx),848.762615,672.0,818.0,984.0,204.0,1452.0
7,NO2(GT),106.251831,73.0,103.0,132.0,0.1,220.5
8,PT08.S4(NO2),1507.352834,1305.0,1508.0,1707.0,702.0,2310.0
9,PT08.S5(O3),1024.200026,737.0,962.0,1272.0,253.0,2074.5


In [7]:
# use the names column as the row index so features can be looked up by name
feature_df = feature_df.set_index("names")

In [8]:
# show only the lower and upper bound of every feature
feature_df.loc[:, ["min", "max"]]

Unnamed: 0_level_0,min,max
names,Unnamed: 1_level_1,Unnamed: 2_level_1
CO(GT),0.1,5.6
PT08.S1(CO),647.0,1687.0
NMHC(GT),0.1,643.5
C6H6(GT),0.1,28.95
PT08.S2(NMHC),749.0,1701.5
NOx(GT),0.1,625.875
PT08.S3(NOx),204.0,1452.0
NO2(GT),0.1,220.5
PT08.S4(NO2),702.0,2310.0
PT08.S5(O3),253.0,2074.5


## Create new entries

#### Functions

inner functions

In [9]:
# looks up the lower bound of a feature in the codebook
def get_min(feature_name):
    """Return the ``min`` entry of ``feature_df`` for the row labelled ``feature_name``."""
    return feature_df.loc[feature_name, "min"]

# looks up the upper bound of a feature in the codebook
def get_max(feature_name):
    """Return the ``max`` entry of ``feature_df`` for the row labelled ``feature_name``."""
    return feature_df.loc[feature_name, "max"]
                         

main functions

In [10]:
# returns a correct (in-range) value for the given feature
def get_correct(feat_name):
    """Draw a random value that lies inside the feature's [min, max] range.

    Features listed by ``og_int_column_list()`` get a whole number; every
    other feature gets a float rounded to two decimals.

    The bounds come from ``get_min`` / ``get_max`` and may be fractional
    (e.g. 0.1 or 643.5). Fractional values are not usable as integer limits:
    cutting 0.1 down to 0 would allow a value below the minimum. The bounds
    are therefore snapped to the whole numbers inside the range before
    drawing, and the upper one is made inclusive.

    Raises:
        ValueError: from ``np.random.randint`` if an integer feature has no
            whole number between its min and max.
    """
    lower = get_min(feat_name)
    upper = get_max(feat_name)

    if feat_name in og_int_column_list():
        first_valid = int(np.ceil(lower))   # smallest whole number >= min
        last_valid = int(np.floor(upper))   # largest whole number <= max
        # randint excludes its upper limit, hence the +1 to include last_valid
        return np.random.randint(first_valid, last_valid + 1)

    return round(np.random.uniform(lower, upper), 2)

In [11]:
# returns a too small value for the given feature
def get_too_small(feat_name):
    """Draw a random value between -250 and the feature's minimum minus one.

    Features listed by ``og_int_column_list()`` are drawn with
    ``np.random.randint``; all others with ``np.random.uniform`` and then
    rounded to two decimals.
    """
    upper_limit = get_min(feat_name) - 1
    if feat_name not in og_int_column_list():
        return round(np.random.uniform(-250, upper_limit), 2)
    return np.random.randint(-250, upper_limit)

In [12]:
# returns a too big value for the given feature
def get_too_big(feat_name):
    """Draw a random value between the feature's maximum plus one and 5000.

    Features listed by ``og_int_column_list()`` are drawn with
    ``np.random.randint``; all others with ``np.random.uniform`` and then
    rounded to two decimals.
    """
    lower_limit = get_max(feat_name) + 1
    if feat_name not in og_int_column_list():
        return round(np.random.uniform(lower_limit, 5000), 2)
    return np.random.randint(lower_limit, 5000)

### Entry with only correct values

In [13]:
# one in-range value per feature in the codebook (Date and Time are added separately)
corr_dict = {feat: get_correct(feat) for feat in feature_df.index.values}

In [14]:
# timestamp of this entry
corr_dict.update({"Date": "07/02/2005", "Time": '00.00.00'})

### Entry with some too small values

In [15]:
# alternate per feature: odd positions get a too small value, even positions a correct one
# (Date and Time are added separately)
small_dict = {}

change = 1

for i in feature_df.index.values:
    make_value = get_correct if change % 2 == 0 else get_too_small
    small_dict[i] = make_value(i)
    change += 1

In [16]:
# timestamp of this entry
small_dict.update({"Date": "07/02/2005", "Time": '01.00.00'})

### Entry with some too big values

In [17]:
# alternate per feature: odd positions get a too big value, even positions a correct one
# (Date and Time are added separately)
big_dict = {}

# position counter, starting at 1
change = 1

for i in feature_df.index.values:
    make_value = get_correct if change % 2 == 0 else get_too_big
    big_dict[i] = make_value(i)
    change += 1

In [18]:
# timestamp of this entry
big_dict.update({"Date": "07/02/2005", "Time": '02.00.00'})

### Entry with mixed values

In [19]:
# mix all three kinds of values (Date and Time are added separately):
#   every 3rd position            -> too big
#   remaining even positions      -> too small
#   everything else               -> correct
mix_dict = {}

change = 1

for i in feature_df.index.values:
    make_value = (
        get_too_big if change % 3 == 0
        else get_too_small if change % 2 == 0
        else get_correct
    )
    mix_dict[i] = make_value(i)
    change += 1

In [20]:
# timestamp of this entry
mix_dict.update({"Date": "07/02/2005", "Time": '03.00.00'})

### Entry with missing values

In [21]:
# start with an empty entry; only a few features are filled in below
missing_dict = dict()

In [22]:
# timestamp of this entry
missing_dict.update({"Date": "07/02/2005", "Time": '04.00.00'})

In [23]:
# generate correct values for only some of the columns; the rest stay absent
for feat in ("CO(GT)", "PT08.S1(CO)", "NO2(GT)"):
    missing_dict[feat] = get_correct(feat)

In [24]:
# this feature is present in the entry but carries no value
# NOTE(review): json.dump writes np.nan as the bare token NaN, which is not
# valid strict JSON — confirm that the consumer of 05_LiveData.json accepts it
missing_dict["NOx(GT)"] = np.nan

### Entry with an unknown column

In [25]:
# a fully correct entry plus one extra key that is not a feature in the codebook
dict_unknown = {feat: get_correct(feat) for feat in feature_df.index.values}
dict_unknown.update({"unknown": "low", "Date": "07/02/2005", "Time": '05.00.00'})

### more entries

In [26]:
# second fully correct entry
corr_dict2 = {feat: get_correct(feat) for feat in feature_df.index.values}
corr_dict2.update({"Date": "07/02/2005", "Time": '06.00.00'})

In [27]:
# third fully correct entry
corr_dict3 = {feat: get_correct(feat) for feat in feature_df.index.values}
corr_dict3.update({"Date": "07/02/2005", "Time": '07.00.00'})

## Save LiveData in JSON File

In [28]:
# function that collects entries into a list (the top-level array of a JSON file)
def entry_list(*entries):
    """Return the given entries as a new list, in the order they were passed.

    Args:
        *entries: any number of entry objects (the notebook passes dicts).

    Returns:
        list: a fresh list holding the very same entry objects; empty when
        called without arguments.
    """
    # the builtin constructor does the copy and keeps the name `list` unshadowed
    return list(entries)

### Save

In [29]:
# wrap every single entry in its own list, i.e. one JSON-like array per entry
(first_entry, second_entry, third_entry,
 fourth_entry, fifth_entry, sixth_entry) = (
    entry_list(entry)
    for entry in (corr_dict, small_dict, big_dict,
                  mix_dict, missing_dict, dict_unknown)
)

In [30]:
# bundle the last two (correct) entries into one JSON-like list holding two records
other_entries = entry_list(corr_dict2, corr_dict3)

In [31]:
# write the entry with only correct values to its own JSON file
new_path = org.path("01_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(first_entry, indent = 2))

In [32]:
# write the entry with some too small values to its own JSON file
new_path = org.path("02_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(second_entry, indent = 2))

In [33]:
# write the entry with some too big values to its own JSON file
new_path = org.path("03_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(third_entry, indent = 2))

In [34]:
# write the entry with mixed values to its own JSON file
new_path = org.path("04_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(fourth_entry, indent = 2))

In [35]:
# write the entry with missing values to its own JSON file
new_path = org.path("05_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(fifth_entry, indent = 2))

In [36]:
# write the entry with the unknown column to its own JSON file
new_path = org.path("06_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(sixth_entry, indent = 2))

In [37]:
# write the two remaining correct entries together into one JSON file
new_path = org.path("07_LiveData.json")

with open(new_path, 'w') as jf:
    jf.write(json.dumps(other_entries, indent = 2))

### Exemplary Output

In [38]:
# Output for the first entry, rendered as the same 2-space-indented JSON text that is written to file
print(json.dumps(first_entry, indent = 2))

[
  {
    "CO(GT)": 3.3,
    "PT08.S1(CO)": 1042,
    "NMHC(GT)": 377,
    "C6H6(GT)": 16.59,
    "PT08.S2(NMHC)": 1436,
    "NOx(GT)": 187,
    "PT08.S3(NOx)": 1232,
    "NO2(GT)": 184,
    "PT08.S4(NO2)": 1290,
    "PT08.S5(O3)": 525,
    "T": 1.2,
    "RH": 87.77,
    "AH": 0.47,
    "Date": "07/02/2005",
    "Time": "00.00.00"
  }
]


In [39]:
# Output for the last two entries, which share one list (and therefore one file)
print(json.dumps(other_entries, indent = 2))

[
  {
    "CO(GT)": 4.66,
    "PT08.S1(CO)": 1317,
    "NMHC(GT)": 122,
    "C6H6(GT)": 19.8,
    "PT08.S2(NMHC)": 1006,
    "NOx(GT)": 82,
    "PT08.S3(NOx)": 1119,
    "NO2(GT)": 147,
    "PT08.S4(NO2)": 1244,
    "PT08.S5(O3)": 480,
    "T": 26.91,
    "RH": 59.16,
    "AH": 1.21,
    "Date": "07/02/2005",
    "Time": "06.00.00"
  },
  {
    "CO(GT)": 3.47,
    "PT08.S1(CO)": 1576,
    "NMHC(GT)": 281,
    "C6H6(GT)": 11.73,
    "PT08.S2(NMHC)": 880,
    "NOx(GT)": 419,
    "PT08.S3(NOx)": 327,
    "NO2(GT)": 160,
    "PT08.S4(NO2)": 1857,
    "PT08.S5(O3)": 2017,
    "T": 8.21,
    "RH": 40.05,
    "AH": 0.57,
    "Date": "07/02/2005",
    "Time": "07.00.00"
  }
]
