# Purpose of the Notebook

This notebook tries to improve the model's performance through feature engineering, i.e. the process of using domain knowledge to derive new features (characteristics, properties, attributes) from the raw data.

# Feature Engineering

In [1]:
# imports
import pandas as pd
import X3_Forecasting as forecast
import orga_functions as org
import datetime

In [2]:
# Load the pre-processed air-quality measurements (semicolon-separated CSV).
# Alternative input (normalized data): org.path("03_AirQuality_normalized.csv")
processed_csv = org.path("02_AirQuality_processed.csv")
df = pd.read_csv(processed_csv, sep=';')
df

Unnamed: 0,date,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah
0,2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578
1,2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255
2,2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502
3,2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867
4,2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-06 19:00:00,1.6,985.0,218.607666,4.5,953.579453,227.0,891.0,165.0,875.0,774.0,6.0,38.0,0.3584
7994,2005-02-06 20:00:00,1.8,1002.0,218.607666,5.3,780.000000,252.0,855.0,179.0,892.0,857.0,5.8,36.4,0.3385
7995,2005-02-06 21:00:00,1.4,938.0,218.607666,3.7,953.579453,193.0,937.0,149.0,805.0,737.0,5.8,35.4,0.3286
7996,2005-02-06 22:00:00,1.1,896.0,218.607666,2.6,953.579453,158.0,1033.0,126.0,782.0,610.0,5.4,36.6,0.3304


In [3]:
# Parse the `date` strings into pandas timestamps using the explicit layout
# found in the CSV (e.g. "2004-03-10 18:00:00").
timestamp_format = "%Y-%m-%d %H:%M:%S"
df["date"] = pd.to_datetime(df["date"], format=timestamp_format)

# New Features

These are the features which have been thought of and how they were created:

#### Weekday

One-hot encoded weekday features 'monday' to 'sunday': each column is 1 if the timestamp falls on that weekday and 0 otherwise.

In [4]:
# Weekday number for every timestamp (0 = Monday ... 6 = Sunday), computed with
# the vectorized .dt accessor instead of a Python-level loop over all rows.
# NOTE(review): this runs before `date` is shifted by +6 hours further down, so
# the weekday describes the measurement time while the season / time-of-day
# features describe the shifted time — confirm this is intended.
df["weekday"] = df["date"].dt.weekday

In [5]:
# inspect the frame with the new 'weekday' column
df

Unnamed: 0,date,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,pt08_s5_o3,t,rh,ah,weekday
0,2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,1268.0,13.6,48.9,0.7578,2
1,2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,972.0,13.3,47.7,0.7255,2
2,2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,1074.0,11.9,54.0,0.7502,2
3,2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,1203.0,11.0,60.0,0.7867,2
4,2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,1110.0,11.2,59.6,0.7888,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-06 19:00:00,1.6,985.0,218.607666,4.5,953.579453,227.0,891.0,165.0,875.0,774.0,6.0,38.0,0.3584,6
7994,2005-02-06 20:00:00,1.8,1002.0,218.607666,5.3,780.000000,252.0,855.0,179.0,892.0,857.0,5.8,36.4,0.3385,6
7995,2005-02-06 21:00:00,1.4,938.0,218.607666,3.7,953.579453,193.0,937.0,149.0,805.0,737.0,5.8,35.4,0.3286,6
7996,2005-02-06 22:00:00,1.1,896.0,218.607666,2.6,953.579453,158.0,1033.0,126.0,782.0,610.0,5.4,36.6,0.3304,6


In [6]:
# Create one indicator column per weekday, initialised to 0 everywhere.
for day_name in ("monday", "tuesday", "wednesday", "thursday",
                 "friday", "saturday", "sunday"):
    df[day_name] = 0

In [7]:
# One-hot encode the weekday number into the columns 'monday' ... 'sunday'.
# A vectorized comparison per weekday replaces the row-by-row loop: it avoids
# building a full row Series on every `df.loc[i]` access and no longer depends
# on the frame having a default 0..n-1 index.
weekday_columns = ["monday", "tuesday", "wednesday", "thursday",
                   "friday", "saturday", "sunday"]
for day_number, day_name in enumerate(weekday_columns):
    # the list position equals the weekday number (0 = monday ... 6 = sunday)
    df[day_name] = (df["weekday"] == day_number).astype("int64")

In [8]:
# the numeric helper column is no longer needed once it is one-hot encoded
df = df.drop(columns=["weekday"])

In [9]:
# inspect the frame with the one-hot weekday columns
df

Unnamed: 0,date,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,...,t,rh,ah,monday,tuesday,wednesday,thursday,friday,saturday,sunday
0,2004-03-10 18:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,...,13.6,48.9,0.7578,0,0,1,0,0,0,0
1,2004-03-10 19:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,...,13.3,47.7,0.7255,0,0,1,0,0,0,0
2,2004-03-10 20:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,...,11.9,54.0,0.7502,0,0,1,0,0,0,0
3,2004-03-10 21:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,...,11.0,60.0,0.7867,0,0,1,0,0,0,0
4,2004-03-10 22:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,...,11.2,59.6,0.7888,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-06 19:00:00,1.6,985.0,218.607666,4.5,953.579453,227.0,891.0,165.0,875.0,...,6.0,38.0,0.3584,0,0,0,0,0,0,1
7994,2005-02-06 20:00:00,1.8,1002.0,218.607666,5.3,780.000000,252.0,855.0,179.0,892.0,...,5.8,36.4,0.3385,0,0,0,0,0,0,1
7995,2005-02-06 21:00:00,1.4,938.0,218.607666,3.7,953.579453,193.0,937.0,149.0,805.0,...,5.8,35.4,0.3286,0,0,0,0,0,0,1
7996,2005-02-06 22:00:00,1.1,896.0,218.607666,2.6,953.579453,158.0,1033.0,126.0,782.0,...,5.4,36.6,0.3304,0,0,0,0,0,0,1


#### Seasons

One-hot encoded season features 'winter' to 'autumn': each column is 1 if the timestamp falls within that season and 0 otherwise.

In [10]:
# source: https://stackoverflow.com/questions/16139306/determine-season-given-timestamp-in-python-using-datetime
from datetime import date, datetime

# Every date is projected onto this dummy leap year so that Feb 29 is valid.
Y = 2000

# (season name, (first day, last day)), both bounds inclusive.
# Winter appears twice because it wraps around the turn of the year.
seasons = [
    ('winter', (date(Y, 1, 1), date(Y, 3, 20))),
    ('spring', (date(Y, 3, 21), date(Y, 6, 20))),
    ('summer', (date(Y, 6, 21), date(Y, 9, 22))),
    ('autumn', (date(Y, 9, 23), date(Y, 12, 20))),
    ('winter', (date(Y, 12, 21), date(Y, 12, 31))),
]


def get_season(now):
    """Return the season name ('winter', 'spring', 'summer' or 'autumn') for `now`.

    `now` may be a `date` or a `datetime`; for a `datetime` only the calendar
    day is considered. The year is ignored: the day is moved into the dummy
    year `Y` and looked up in the inclusive ranges listed in `seasons`.
    """
    day = now.date() if isinstance(now, datetime) else now
    day_in_dummy_year = day.replace(year=Y)
    matching_names = (
        name
        for name, (first_day, last_day) in seasons
        if first_day <= day_in_dummy_year <= last_day
    )
    return next(matching_names)


In [11]:
# Move every timestamp 6 hours into the future. The season and time-of-day
# features below are therefore derived from the shifted time; the original
# author noted that this gave a better model score.
# NOTE(review): presumably the 6 hours mirror the 6-row target shift used in the
# forecasting section, so `date` describes the moment being predicted — confirm.
# NOTE: this cell is not idempotent; re-running it adds another 6 hours.
df["date"] = df["date"]+ pd.DateOffset(hours=6, minutes=0)

In [12]:
# Look up the season name of every (shifted) timestamp.
df["season"] = df["date"].map(get_season)

In [13]:
# Sanity check: all four seasons should occur in the data.
season_names = df["season"].unique()
print(season_names)

['winter' 'spring' 'summer' 'autumn']


In [14]:
# Create one indicator column per season, initialised to 0 everywhere.
for season_name in ("spring", "summer", "autumn", "winter"):
    df[season_name] = 0

In [15]:
# One-hot encode the season name into the columns 'spring' ... 'winter'.
# A vectorized comparison per season replaces the row-by-row loop: it avoids
# building a full row Series on every `df.loc[i]` access and no longer depends
# on the frame having a default 0..n-1 index.
for season_name in ("spring", "summer", "autumn", "winter"):
    df[season_name] = (df["season"] == season_name).astype("int64")

In [16]:
# the textual helper column is no longer needed once it is one-hot encoded
df = df.drop(columns=["season"])

#### Time of Day

Categorical features which represent the time of day as 'early_morning' to 'night'.

In [17]:
# Create one indicator column per time-of-day bucket, initialised to 0 everywhere.
for bucket_name in ("early_morning", "mid_morning", "midday",
                    "afternoon", "evening", "night"):
    df[bucket_name] = 0

In [18]:
# One-hot encode the hour of the (shifted) timestamp into time-of-day columns.
# Vectorized hour comparisons replace the row-by-row loop: they avoid building
# a full row Series on every `df.loc[i]` access and no longer depend on the
# frame having a default 0..n-1 index.
hour_of_day = df["date"].dt.hour

# half-open hour ranges [start, stop) per time-of-day bucket
daytime_hours = {
    "early_morning": (6, 10),
    "mid_morning": (10, 12),
    "midday": (12, 14),
    "afternoon": (14, 17),
    "evening": (17, 21),
}
for bucket_name, (start_hour, stop_hour) in daytime_hours.items():
    in_bucket = (hour_of_day >= start_hour) & (hour_of_day < stop_hour)
    df[bucket_name] = in_bucket.astype("int64")

# 'night' is every row that no other bucket claimed (hours 21-23 and 0-5),
# mirroring the `else` branch of the original loop
claimed_by_other_bucket = df[list(daytime_hours)].sum(axis=1) > 0
df["night"] = (~claimed_by_other_bucket).astype("int64")

In [19]:
# inspect the frame with all engineered feature columns
df

Unnamed: 0,date,co_gt,pt08_s1_co,nmhc_gt,c6h6_gt,pt08_s2_nmhc,nox_gt,pt08_s3_nox,no2_gt,pt08_s4_no2,...,spring,summer,autumn,winter,early_morning,mid_morning,midday,afternoon,evening,night
0,2004-03-11 00:00:00,2.6,1360.0,150.000000,11.9,1046.000000,166.0,1056.0,113.0,1692.0,...,0,0,0,1,0,0,0,0,0,1
1,2004-03-11 01:00:00,2.0,1292.0,112.000000,9.4,955.000000,103.0,1174.0,92.0,1559.0,...,0,0,0,1,0,0,0,0,0,1
2,2004-03-11 02:00:00,2.2,1402.0,88.000000,9.0,939.000000,131.0,1140.0,114.0,1555.0,...,0,0,0,1,0,0,0,0,0,1
3,2004-03-11 03:00:00,2.2,1376.0,80.000000,9.2,948.000000,172.0,1092.0,122.0,1584.0,...,0,0,0,1,0,0,0,0,0,1
4,2004-03-11 04:00:00,1.6,1272.0,51.000000,6.5,836.000000,131.0,1205.0,116.0,1490.0,...,0,0,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,2005-02-07 01:00:00,1.6,985.0,218.607666,4.5,953.579453,227.0,891.0,165.0,875.0,...,0,0,0,1,0,0,0,0,0,1
7994,2005-02-07 02:00:00,1.8,1002.0,218.607666,5.3,780.000000,252.0,855.0,179.0,892.0,...,0,0,0,1,0,0,0,0,0,1
7995,2005-02-07 03:00:00,1.4,938.0,218.607666,3.7,953.579453,193.0,937.0,149.0,805.0,...,0,0,0,1,0,0,0,0,0,1
7996,2005-02-07 04:00:00,1.1,896.0,218.607666,2.6,953.579453,158.0,1033.0,126.0,782.0,...,0,0,0,1,0,0,0,0,0,1


In [20]:
# save dataset locally
# Writes the frame with all engineered columns as a semicolon-separated CSV
# without the pandas index. At this point `date` already holds the timestamps
# shifted by +6 hours, and the "Unnamed: 0" column read from the input file is
# still part of the frame.
new_path = org.path("04_AirQuality_NewFeatures.csv")
df.to_csv(new_path, sep=';', index = False)

# Test

Below you will find the results of the performance tests that were conducted to test whether the new features improve performance or not.

In [21]:
# use the (shifted) timestamp as the row index
df = df.set_index("date")

In [22]:
# Candidate feature sets, compared by the mean absolute error of a
# LinearRegression forecasting 'ah' 6 hours ahead. The number noted above each
# list is the MAE recorded for it.

# building blocks shared by all candidates
_sensor_inputs = ['pt08_s1_co', "c6h6_gt", "pt08_s2_nmhc", "no2_gt", "pt08_s4_no2", "t"]
_weekday_flags = ["monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday"]
_season_flags = ["spring", "summer", "autumn", "winter"]
_daytime_flags = ["early_morning", "mid_morning", "midday", "afternoon", "evening", "night"]

# MAE: 0.0633526390377646
features_with_weekday = [*_sensor_inputs, *_weekday_flags, "ah"]

# MAE: 0.06070736267619392
# MAE: 0.06003062680060758  (timestamps shifted by +6 hrs)
features_with_season = [*_sensor_inputs, *_season_flags, "ah"]

# MAE: 0.062101402957687124
features_weekday_season = [*_sensor_inputs, *_weekday_flags, *_season_flags, "ah"]

# MAE: 0.06819995689500315
features_with_daytime = [*_sensor_inputs, *_daytime_flags, "ah"]

#### Final Result

In [23]:
# Winning candidate: sensor readings plus the season flags.
best_features = [
    'pt08_s1_co',
    "c6h6_gt",
    "pt08_s2_nmhc",
    "no2_gt",
    "pt08_s4_no2",
    "t",
    "spring",
    "summer",
    "autumn",
    "winter",
    "ah",
]

In [24]:
# feature set used as model input by the forecasting cells below
features = best_features

# Forecasting

Below is the area used for testing the performance.

#### Target

In [34]:
# name of the column that will hold the value to be predicted
target = "ah_target"

#### Shift

In [26]:
# The target of each row is the absolute humidity found 6 rows further down;
# the last 6 rows get NaN because no later value exists for them.
df[target] = df["ah"].shift(-6)

In [27]:
# keep only rows with a known target (removes the trailing rows whose
# shifted target is NaN)
df = df.dropna(subset=[target])

#### Preparation

In [36]:
# Train/ Test-split
# Chronological hold-out: with shuffle=False the last 1000 rows become the test
# set and everything before them the training set. `random_state` was dropped
# because it has no effect when shuffling is disabled and only suggested a
# randomness that is not there.
from sklearn.model_selection import train_test_split
training, test = train_test_split(df, test_size=1000, shuffle=False)

In [37]:
# Separate model inputs and target for the training part and the hold-out part.
X_train, y_train = training[features], training[target]
x_test, y_test = test[features], test[target]

#### Linear Regression

In [38]:
# Fit an ordinary least-squares regression on the training part.
from sklearn.linear_model import LinearRegression
clf_lin_reg = LinearRegression()
clf_lin_reg.fit(X=X_train, y=y_train)

#### Performance Check

In [31]:
# predict the target for the hold-out rows
pred_linr_y = clf_lin_reg.predict(X=x_test)

In [39]:
# Mean absolute error between the known targets and the model's predictions
# on the hold-out rows.
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_true=y_test, y_pred=pred_linr_y)

0.06003062680060768

In [40]:
# target vs prediction by model
# check_df comes from the project module X3_Forecasting. Judging by the output
# it returns a frame of actual ('future_ah') vs. predicted ('predicted_ah')
# values indexed by 'future_datetime' — TODO confirm against the helper's source.
forecast.check_df(x_test, y_test, clf_lin_reg)

Unnamed: 0_level_0,future_ah,predicted_ah
future_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
2004-12-27 14:00:00,0.9302,0.931464
2004-12-27 15:00:00,0.9113,0.907273
2004-12-27 16:00:00,0.9335,0.889934
2004-12-27 17:00:00,0.9261,0.862365
2004-12-27 18:00:00,0.9379,0.843156
...,...,...
2005-02-07 01:00:00,0.3584,0.403884
2005-02-07 02:00:00,0.3385,0.418554
2005-02-07 03:00:00,0.3286,0.429271
2005-02-07 04:00:00,0.3304,0.430793
