### Vorverarbeitung in der Praxis (Kapitel 3.4.4.)

#### 1) Daten laden

In [25]:
import pandas as pd
import numpy as np

# Read the insurance data set from the CSV file given by this relative path.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
# Show the distinct values of the 'region' column.
df.loc[:, 'region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

### Teil I: Anlernprozess durchführen

#### a) Trainings- und Testdaten separieren

In [26]:
from sklearn.model_selection import train_test_split
# Features are the six predictor columns; the target is the 'charges' column.
X = df.loc[:, ['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y = df.loc[:, 'charges']

# Hold out 20 % of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)

# Display the shapes of all four parts as a sanity check.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1070, 6), (268, 6), (1070,), (268,))

#### b) Kategoriale Variablen aus Trainingsdaten ziehen, mit dem OneHotEncoder anlernen und transformieren

In [27]:
from sklearn.preprocessing import OneHotEncoder
# Dense output (sparse_output=False) so the result is a plain array.
ohe = OneHotEncoder(sparse_output=False)
# Fit the encoder on the categorical training columns only and encode them
# in the same step.
X_train_ohe = ohe.fit_transform(X_train[['sex', 'smoker', 'region']])
X_train_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.]])

#### c) Numerische Variablen aus Trainingsdaten extrahieren, den Standardisierer anlernen, die Daten transformieren

In [28]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Fit the scaler on the numeric training columns only and transform them
# in the same step.
X_train_std = scaler.fit_transform(X_train[['age', 'bmi', 'children']])
X_train_std.shape

(1070, 3)

#### d) Die beiden separierten Arrays wieder zusammenführen, um sie der fit-Methode des Regressors zum Anlernen zu übergeben

In [29]:
import numpy as np
# Join the two arrays column-wise: encoded columns first, scaled columns second.
# Data passed to the model later must use the same column order.
X_train_compl = np.concatenate((X_train_ohe, X_train_std), axis=1)
X_train_compl.shape

(1070, 11)

#### e) Den angelernten OneHotEncoder und den angelernten Standardisierer abspeichern

In [30]:
import joblib
# Persist the fitted encoder and the fitted scaler so they can be reloaded
# later without refitting.
joblib.dump(value=ohe, filename='ohe.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

#### f) Modell mit vorbereiteten Daten anlernen und angelerntes Modell abspeichern

In [31]:
from sklearn.linear_model import LinearRegression
# Fit the linear regression on the preprocessed training matrix; fit() returns
# the estimator itself, so creation and fitting can be chained.
model = LinearRegression().fit(X_train_compl, y_train)
# Persist the fitted model next to the fitted transformers.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

### Teil II: Objektorientierter Ansatz

#### a) Klasse erzeugen, die die Vorverarbeitung und den Schätzprozess übernimmt

In [32]:
import joblib
import numpy as np
from sklearn.metrics import mean_absolute_error, r2_score

class PredictInsurance():
    """Predict insurance charges from raw feature rows using persisted artefacts.

    On construction the class loads a fitted one-hot encoder, a fitted scaler
    and a fitted model from disk via ``joblib``. New data is preprocessed with
    these objects and then passed to the model's ``predict`` method.
    """

    def __init__(self, ohe: str, scaler: str, model: str,
                 ohe_cols=('sex', 'smoker', 'region'),
                 scale_cols=('age', 'bmi', 'children')):
        """Load the persisted artefacts and remember the column groups.

        Args:
            ohe: Path of the persisted one-hot encoder.
            scaler: Path of the persisted scaler.
            model: Path of the persisted model.
            ohe_cols: Names of the columns passed to the encoder.
            scale_cols: Names of the columns passed to the scaler.
        """
        self.ohe = self.__load_file(ohe)
        self.scaler = self.__load_file(scaler)
        self.model = self.__load_file(model)
        # Store private copies: the defaults are immutable tuples and a list
        # passed by the caller is never aliased, so changing one instance's
        # columns cannot affect other instances or the caller's list.
        self.ohe_cols = list(ohe_cols)
        self.scale_cols = list(scale_cols)

    def predict(self, X_pred: "pd.DataFrame") -> np.ndarray:
        """Preprocess ``X_pred`` and return the model's predictions.

        Args:
            X_pred: Frame containing at least the columns named in
                ``ohe_cols`` and ``scale_cols``.

        Returns:
            Whatever ``self.model.predict`` returns for the preprocessed data.
        """
        X_compl = self.__preprocess(X_pred)
        return self.model.predict(X_compl)

    def evaluate(self, X_pred: "pd.DataFrame", y_true: "pd.Series") -> tuple:
        """Predict for ``X_pred`` and score the result against ``y_true``.

        Returns:
            A 2-tuple ``(mean_absolute_error, r2_score)``.
        """
        y_pred = self.predict(X_pred)
        return (mean_absolute_error(y_true, y_pred),
                r2_score(y_true, y_pred))

    def __load_file(self, file):
        """Load one persisted object from ``file`` with ``joblib.load``."""
        # NOTE: joblib.load is pickle-based and can execute arbitrary code;
        # only load files that come from a trusted source.
        return joblib.load(file)

    def __preprocess(self, X_pred):
        """Encode and scale the configured columns and join them column-wise."""
        X_ohe = self.__ohe(X_pred[self.ohe_cols])
        X_std = self.__scale(X_pred[self.scale_cols])
        # Encoded columns first, scaled columns second.
        return np.concatenate([X_ohe, X_std], axis=1)

    def __ohe(self, X_ohe):
        """Apply the loaded encoder's ``transform`` to ``X_ohe``."""
        return self.ohe.transform(X_ohe)

    def __scale(self, X_std):
        """Apply the loaded scaler's ``transform`` to ``X_std``."""
        return self.scaler.transform(X_std)

#### 4) Evaluation durchführen

In [33]:
# Build the predictor from the three persisted artefacts.
predictor = PredictInsurance(
    ohe='ohe.pkl',
    scaler='scaler.pkl',
    model='model.pkl',
)
# evaluate() returns the mean absolute error and the R² score, in that order.
mae, r_square = predictor.evaluate(X_test, y_test)
print('mae: {:.3f}, r2: {:.3f}'.format(mae, r_square))

mae: 3798.304, r2: 0.801


#### 5) Schätzung durchführen (Beispieldatensatz)

In [34]:
# A single example record, using the same column names as the training features.
X_pred = pd.DataFrame(
    data=[[35, 'female', 25.77, 1, 'no', 'southeast']],
    columns=['age', 'sex', 'bmi', 'children', 'smoker', 'region'],
)
y_pred = predictor.predict(X_pred)
# predict() returns one value per input row; show the estimate for the only row.
print('prediction (X_pred): {:.3f}'.format(y_pred[0]))

prediction (X_pred): 5408.000
