In [1]:
# imports
import pandas as pd
import functions as func
import orga_functions as org
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the processed air-quality data (semicolon-separated CSV).
# org.path() turns the bare file name into the path that is read.
path = org.path("02_AirQuality_processed.csv")
df = pd.read_csv(filepath_or_buffer=path, delimiter=';')

In [3]:
# Display the loaded frame (the rendered output shows only the first and last rows).
df

Unnamed: 0,date,co_gt,pt08_s1_co,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
0,2004-03-10 18:00:00,2.6,1360.0,11.9,1046.000000,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2004-03-10 19:00:00,2.0,1292.0,9.4,955.000000,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2004-03-10 20:00:00,2.2,1402.0,9.0,939.000000,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2004-03-10 21:00:00,2.2,1376.0,9.2,948.000000,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,2004-03-10 22:00:00,1.6,1272.0,6.5,836.000000,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-06 19:00:00,1.6,985.0,4.5,953.579453,227.0,891.0,165.0,875.0,774.0,6.0,38.0,0.3584
7994,2005-02-06 20:00:00,1.8,1002.0,5.3,780.000000,252.0,855.0,179.0,892.0,857.0,5.8,36.4,0.3385
7995,2005-02-06 21:00:00,1.4,938.0,3.7,953.579453,193.0,937.0,149.0,805.0,737.0,5.8,35.4,0.3286
7996,2005-02-06 22:00:00,1.1,896.0,2.6,953.579453,158.0,1033.0,126.0,782.0,610.0,5.4,36.6,0.3304


# Scaling features to [0, 1]

In [4]:
# Set the 'date' column aside so that only numeric features remain in df.
# DataFrame.pop returns the column and removes it from df in place, so this
# cell must run exactly once per freshly loaded frame.
date = df.pop('date')

In [5]:
# Scale every feature column to the range [0, 1] (min-max normalisation).
# A single MinMaxScaler is mapped over all columns; it still normalises each
# column by that column's own minimum and maximum.
mapper = DataFrameMapper([(df.columns, MinMaxScaler())])
# The second positional argument of fit_transform is the target `y`, which the
# scaler ignores; the meaningless `4` that used to be passed there is removed.
scaled_features = mapper.fit_transform(df.copy())
# Wrap the transformed values in a frame with the original index and column names.
scaled_features_df = pd.DataFrame(scaled_features, index=df.index, columns=df.columns)

scaled_features_df

Unnamed: 0,co_gt,pt08_s1_co,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
0,0.454545,0.688224,0.409722,0.311975,0.263242,0.649558,0.509174,0.616822,0.557692,0.308585,0.499371,0.275071
1,0.345455,0.622587,0.322917,0.216387,0.162119,0.753982,0.412844,0.533956,0.395055,0.301624,0.484277,0.259177
2,0.381818,0.728764,0.309028,0.199580,0.207063,0.723894,0.513761,0.531464,0.451099,0.269142,0.563522,0.271332
3,0.381818,0.703668,0.315972,0.209034,0.272873,0.681416,0.550459,0.549533,0.521978,0.248260,0.638994,0.289292
4,0.272727,0.603282,0.222222,0.091387,0.207063,0.781416,0.522936,0.490966,0.470879,0.252900,0.633962,0.290326
...,...,...,...,...,...,...,...,...,...,...,...,...
7993,0.272727,0.326255,0.152778,0.214894,0.361156,0.503540,0.747706,0.107788,0.286264,0.132251,0.362264,0.078536
7994,0.309091,0.342664,0.180556,0.032563,0.401284,0.471681,0.811927,0.118380,0.331868,0.127610,0.342138,0.068743
7995,0.236364,0.280888,0.125000,0.214894,0.306581,0.544248,0.674312,0.064174,0.265934,0.127610,0.329560,0.063872
7996,0.181818,0.240347,0.086806,0.214894,0.250401,0.629204,0.568807,0.049844,0.196154,0.118329,0.344654,0.064757


In [6]:
# Spot-check that scaling was applied correctly: recompute the first 'co_gt'
# value by hand with y = (x - min) / (max - min) and compare it with the
# first row of the scaled frame.
co = df['co_gt']
co_lo, co_hi = co.min(), co.max()
(co.iloc[0] - co_lo) / (co_hi - co_lo)

0.45454545454545453

In [7]:
# Put the 'date' column back in front of the scaled features
# (column-wise concatenation, aligned on the shared row index).
df_normalized = pd.concat(objs=[date, scaled_features_df], axis='columns')
df_normalized

Unnamed: 0,date,co_gt,pt08_s1_co,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
0,2004-03-10 18:00:00,0.454545,0.688224,0.409722,0.311975,0.263242,0.649558,0.509174,0.616822,0.557692,0.308585,0.499371,0.275071
1,2004-03-10 19:00:00,0.345455,0.622587,0.322917,0.216387,0.162119,0.753982,0.412844,0.533956,0.395055,0.301624,0.484277,0.259177
2,2004-03-10 20:00:00,0.381818,0.728764,0.309028,0.199580,0.207063,0.723894,0.513761,0.531464,0.451099,0.269142,0.563522,0.271332
3,2004-03-10 21:00:00,0.381818,0.703668,0.315972,0.209034,0.272873,0.681416,0.550459,0.549533,0.521978,0.248260,0.638994,0.289292
4,2004-03-10 22:00:00,0.272727,0.603282,0.222222,0.091387,0.207063,0.781416,0.522936,0.490966,0.470879,0.252900,0.633962,0.290326
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-06 19:00:00,0.272727,0.326255,0.152778,0.214894,0.361156,0.503540,0.747706,0.107788,0.286264,0.132251,0.362264,0.078536
7994,2005-02-06 20:00:00,0.309091,0.342664,0.180556,0.032563,0.401284,0.471681,0.811927,0.118380,0.331868,0.127610,0.342138,0.068743
7995,2005-02-06 21:00:00,0.236364,0.280888,0.125000,0.214894,0.306581,0.544248,0.674312,0.064174,0.265934,0.127610,0.329560,0.063872
7996,2005-02-06 22:00:00,0.181818,0.240347,0.086806,0.214894,0.250401,0.629204,0.568807,0.049844,0.196154,0.118329,0.344654,0.064757


In [8]:
# Write the normalized dataset to disk as a semicolon-separated CSV,
# without the row index.
# NOTE(review): the input path is resolved through org.path(), while this
# output path is hard-coded -- confirm both point at the same data directory.
new_path = "../../data/csv/AirQuality/03_AirQuality_normalized.csv"
df_normalized.to_csv(path_or_buf=new_path, sep=';', index=False)