# Modelling

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FeatureUnion, Pipeline

In [None]:
# Load the housing data set; the path is relative to the notebook's
# working directory.
data = pd.read_csv("../../data/housing.csv")

# Columns fed to the model as inputs.
input_features = [
    'longitude',
    'latitude',
    'housing_median_age',
    'total_rooms',
    'total_bedrooms',
    'population',
    'households',
    'median_income',
    'ocean_proximity'
]

# Target column(s). Kept as a list so that `data[output_features]` is a
# 2-D DataFrame rather than a 1-D Series.
output_features = [
    'median_house_value'
]

# A fixed seed makes the train/test partition (and therefore every score
# computed later in the notebook) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[input_features],
    data[output_features],
    random_state=42,
)

In [None]:
class ColumnSelector:
    """Pipeline step that keeps either the numeric or the non-numeric columns.

    Parameters
    ----------
    select_numeric : bool, default True
        When truthy, `transform` returns only the columns with a numeric
        dtype; otherwise it returns every column that is *not* numeric.
    """

    def __init__(self, select_numeric=True):
        self.select_numeric = select_numeric

    def fit(self, X, y=None):
        """Do nothing and return the selector itself (there is no state to learn)."""
        return self

    def transform(self, X, y=None):
        """Return the subset of columns of `X` chosen by `select_numeric`.

        `X` must offer `select_dtypes` (e.g. a pandas DataFrame); `y` is
        accepted but not used.
        """
        # Choose the keyword for `select_dtypes` once instead of branching
        # around two nearly identical calls.
        dtype_filter = (
            {"include": ["number"]}
            if self.select_numeric
            else {"exclude": ["number"]}
        )
        return X.select_dtypes(**dtype_filter)

In [None]:
# Input preprocessing: two branches run on the same DataFrame and their
# outputs are concatenated column-wise by the FeatureUnion.
X_pipeline = FeatureUnion(transformer_list=[
    # Numeric branch: select numeric columns, fill gaps with the column
    # median, then scale the values.
    ("numeric pipeline", Pipeline(steps=[
        ("select numbers", ColumnSelector(select_numeric=True)),
        ("impute data", SimpleImputer(strategy="median")),
        ("scale data", MinMaxScaler())
    ])),
    # Non-numeric branch: select the remaining columns and one-hot encode
    # them.
    ("non_numeric pipeline", Pipeline(steps=[
        ("select non numeric", ColumnSelector(select_numeric=False)),
        # handle_unknown="ignore": a category that happens to occur only in
        # the test split is encoded as all zeros instead of raising an
        # error during transform.
        ("encode data", OneHotEncoder(handle_unknown="ignore"))
    ]))
])

# Target preprocessing: scale the target column(s).
y_pipeline = Pipeline(steps=[
    ("scale data", MinMaxScaler())
])

# Fit on the training split only; the test split is merely transformed so
# that no information from it leaks into the fitted preprocessing.
X_train_p = X_pipeline.fit_transform(X_train)
X_test_p = X_pipeline.transform(X_test)

y_train_p = y_pipeline.fit_transform(y_train)
y_test_p = y_pipeline.transform(y_test)

## Input zum Thema

### Funktionsprinzip eines Estimators

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create an unfitted linear regression estimator with default settings.
lr_model = LinearRegression()

In [None]:
# Fit the estimator on the preprocessed training inputs and the scaled
# training target.
lr_model.fit(X_train_p, y_train_p)

In [None]:
# Predict for the preprocessed test inputs. The model was fitted on
# `y_train_p`, so the predictions live in the scaled target space, not in
# the original units of the target column.
prediction = lr_model.predict(X_test_p)

In [None]:
# Evaluate on the held-out test split. For scikit-learn regressors `score`
# reports the coefficient of determination (R^2) -- see the estimator docs.
lr_model.score(X_test_p, y_test_p)

### `sklearn` Estimators

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor
from sklearn.linear_model import Lasso, LogisticRegression, Ridge, PassiveAggressiveRegressor, SGDRegressor
from sklearn.svm import SVR

## Weiterführende Aufgaben

* Vollzieht dieses Beispiel mit anderen Daten aus `../../data` nach
* Probiert alternative Estimatoren auf eure Daten aus (z.B. `sklearn.tree.DecisionTreeRegressor`, `sklearn.ensemble.GradientBoostingRegressor`, `sklearn.ensemble.RandomForestRegressor`, usw.)
* Schaut euch die verfügbaren Parameter eines gefitteten Estimators an (z.B. `coef_`, `intercept_`) und überlegt euch, was diese bedeuten. Schaut euch an, wie diese Parameter zu unseren Überlegungen hinsichtlich der Feature-Korrelation passen.

In [None]:
# Platz für euren Code