# Aufgabe
Bearbeitet die Aufgabenstellung aus Aufgabe 1a.1, aber setzt euren Vorverarbeitungsprozess mit Hilfe von
sklearn-Pipelines um. Achtet dabei (a) auf die Reihenfolge der Pipeline-Schritte und (b) darauf, dass ihr die
Pipelines an den richtigen Daten fittet.

In [53]:
import pandas as pd
from sklearn.pipeline import Pipeline 
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, RobustScaler

In [54]:
# helper class
# custom selector for selecting numeric and non-numeric features
class ColumnSelector:
    """Stateless transformer that keeps either the numeric or the non-numeric columns.

    The class offers ``fit``/``transform`` so it can be used as a step in a
    scikit-learn ``Pipeline``, and ``get_params``/``set_params`` so that the
    scikit-learn parameter machinery (cloning, nested ``set_params``) can
    handle it like any other estimator.

    Parameters
    ----------
    select_numeric : bool, default True
        If True, ``transform`` returns only the numeric columns; otherwise
        it returns only the non-numeric columns.
    """

    def __init__(self, select_numeric: bool = True):
        self.select_numeric = select_numeric

    def __repr__(self):
        return f"{type(self).__name__}(select_numeric={self.select_numeric!r})"

    def get_params(self, deep=True):
        """Return the constructor parameters as a dict.

        ``deep`` is accepted for interface compatibility only; this
        transformer holds no nested estimators.
        """
        return {"select_numeric": self.select_numeric}

    def set_params(self, **params):
        """Set constructor parameters and return ``self``.

        Raises
        ------
        ValueError
            If a parameter name other than ``select_numeric`` is passed.
        """
        for name, value in params.items():
            if name != "select_numeric":
                raise ValueError(
                    f"Invalid parameter {name!r} for {type(self).__name__}; "
                    "the only valid parameter is 'select_numeric'."
                )
            setattr(self, name, value)
        return self

    def fit(self, X, y=None):
        """Do nothing and return ``self``: the selection learns nothing from the data."""
        return self

    def transform(self, X, y=None):
        """Return the columns of ``X`` selected by dtype.

        ``X`` must provide ``select_dtypes`` (e.g. a pandas DataFrame).
        """
        if self.select_numeric:
            return X.select_dtypes(include=["number"])
        return X.select_dtypes(exclude=["number"])

In [55]:
# Load the bike-sharing dataset from CSV (the path is relative to the working directory).
data = pd.read_csv("../../data/bikesharing_simple.csv")

In [56]:
# Check, column by column, whether at least one value is missing.
data.isna().any()

Unnamed: 0    False
instant       False
dteday        False
season        False
yr            False
mnth          False
hr            False
holiday       False
weekday       False
workingday    False
weathersit    False
temp          False
atemp         False
hum           False
casual        False
registered    False
cnt           False
windspeed      True
dtype: bool

In [57]:
# List the column names of the loaded dataset.
list(data.columns)

['Unnamed: 0',
 'instant',
 'dteday',
 'season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'casual',
 'registered',
 'cnt',
 'windspeed']

In [58]:
# Drop columns that must not end up in the feature matrix:
# 'instant' is unused; 'casual' and 'registered' add up to the target 'cnt',
# so keeping them would give the target away.
for unused_column in ('instant', 'casual', 'registered'):
    del data[unused_column]

list(data.columns)

['Unnamed: 0',
 'dteday',
 'season',
 'yr',
 'mnth',
 'hr',
 'holiday',
 'weekday',
 'workingday',
 'weathersit',
 'temp',
 'atemp',
 'hum',
 'cnt',
 'windspeed']

In [59]:
# Columns used as model inputs.
input_features = [
    'dteday', 'season', 'yr', 'mnth', 'hr', 'holiday', 'weekday',
    'workingday', 'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]

# Column(s) the model is supposed to predict.
output_features = ['cnt']

In [60]:
# Split the table column-wise into the feature matrix X and the target matrix y.
X, y = data[input_features], data[output_features]

In [61]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,2011-01-01,spring,0,1,0,0,Saturday,No,Clear,0.24,0.2879,0.81,0.0
1,2011-01-01,spring,0,1,1,0,Saturday,No,Clear,0.22,0.2727,0.8,0.0
2,2011-01-01,spring,0,1,2,0,Saturday,No,Clear,0.22,0.2727,0.8,0.0
3,2011-01-01,spring,0,1,3,0,Saturday,No,Clear,0.24,0.2879,0.75,0.0
4,2011-01-01,spring,0,1,4,0,Saturday,No,Clear,0.24,0.2879,0.75,0.0


In [62]:
# Preview the first rows of the target matrix.
y.head()

Unnamed: 0,cnt
0,16
1,40
2,32
3,13
4,1


In [63]:
# Hold out 20 % of the samples (test_size is given as a fraction) for testing;
# the fixed random_state seeds the shuffling so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=123,
)

In [64]:
# Report the number of samples in each of the four splits.
print(*(len(part) for part in (X_train, X_test, y_train, y_test)))

13903 3476 13903 3476


In [65]:
# Preprocessing for the feature matrix. Numeric and non-numeric columns are
# handled by two separate sub-pipelines; FeatureUnion concatenates their
# outputs column-wise.
# NOTE(review): 'dteday' appears to be read as a string column, so it would be
# one-hot encoded per distinct date - confirm this is intended, or drop it /
# derive date features instead.
x_pipeline = FeatureUnion(transformer_list=[
    ("numeric pipeline", Pipeline(steps=[
        ("select numeric", ColumnSelector(select_numeric=True)),  # keep only the numeric features
        ("impute data", SimpleImputer(strategy="median")),  # fill missing values with the column median
        ("scale data", RobustScaler()),  # scale the imputed numeric values
    ])),
    ("non-numeric pipeline", Pipeline(steps=[
        ("select non-numeric", ColumnSelector(select_numeric=False)),  # keep only the non-numeric features
        # handle_unknown="ignore": a category that was not present during fit
        # (e.g. one that only occurs in the test split) is encoded as all
        # zeros instead of making transform() raise.
        ("encode", OneHotEncoder(handle_unknown="ignore")),
    ])),
])

In [66]:
# Fit the preprocessing on the training split only, then apply it to that
# same split to obtain the preprocessed training data.
X_train_p = x_pipeline.fit(X_train).transform(X_train)

In [68]:
# The test split is only transformed: the pipeline has already been fitted on
# the training split and must not be re-fitted on test data.
X_test_p = x_pipeline.transform(X_test)

In [71]:
# Show the shape (rows, columns) of the preprocessed test matrix.
print(X_test_p.shape)

(3476, 756)
