# Beispiel zur Datenvorverarbeitung aus der Praxis

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the insurance records into a DataFrame; the bare name below renders it as the cell output.
df = pd.read_csv(filepath_or_buffer='insurance.csv')
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


## a) X und y aus den Daten entnehmen und in Trainings- und Testdaten aufteilen

In [5]:
from sklearn.model_selection import train_test_split

# Features are the six descriptive columns; the target is the `charges` column.
X = df[['age', 'sex', 'bmi', 'children', 'smoker', 'region']]
y = df['charges']

# Hold out 20 % of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=11
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((1070, 6), (268, 6), (1070,), (268,))

## b) OneHot Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Build an encoder that returns a dense ndarray (needed for the later np.concatenate).
# The keyword for dense output was renamed from `sparse` to `sparse_output` in
# scikit-learn 1.2 and the old name was removed afterwards, so try the new name
# first and fall back to the old one on releases that do not know it yet.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

# Fit on the training split only and encode it in the same step; the fitted
# encoder is reused later for unseen data.
X_train_ohe = ohe.fit_transform(X_train[['sex','smoker','region']])
X_train_ohe

array([[0., 1., 1., ..., 1., 0., 0.],
       [1., 0., 1., ..., 0., 0., 1.],
       [0., 1., 1., ..., 0., 1., 0.],
       ...,
       [0., 1., 1., ..., 1., 0., 0.],
       [0., 1., 1., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.]])

## c) Standardisierung der numerischen Daten

In [11]:
from sklearn.preprocessing import StandardScaler

In [12]:
# Fit the scaler on the numeric training columns and standardise them in one step.
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train[['age','bmi','children']])
X_train_std

array([[ 0.74993609,  0.24757643, -0.89802533],
       [-1.08209956, -0.50005757, -0.89802533],
       [ 0.60901027, -0.15911699, -0.89802533],
       ...,
       [ 0.46808445, -0.80122176,  1.62398807],
       [ 1.10225063,  0.32307041, -0.0573542 ],
       [-0.16608174, -0.14288172, -0.89802533]])

## d) Kodierte und standardisierte Merkmale wieder zusammenführen

In [13]:
import numpy as np
# Join the two 2-D blocks column-wise: one-hot columns first, then the
# standardised numeric columns.
X_train_compl = np.hstack((X_train_ohe, X_train_std))
X_train_compl

array([[ 0.        ,  1.        ,  1.        , ...,  0.74993609,
         0.24757643, -0.89802533],
       [ 1.        ,  0.        ,  1.        , ..., -1.08209956,
        -0.50005757, -0.89802533],
       [ 0.        ,  1.        ,  1.        , ...,  0.60901027,
        -0.15911699, -0.89802533],
       ...,
       [ 0.        ,  1.        ,  1.        , ...,  0.46808445,
        -0.80122176,  1.62398807],
       [ 0.        ,  1.        ,  1.        , ...,  1.10225063,
         0.32307041, -0.0573542 ],
       [ 0.        ,  1.        ,  1.        , ..., -0.16608174,
        -0.14288172, -0.89802533]])

## e) One-Hot-Encoder und StandardScaler abspeichern

In [15]:
import joblib
# Write the fitted encoder and scaler to disk so they can be reloaded later.
joblib.dump(value=ohe, filename='ohe.pkl')
joblib.dump(value=scaler, filename='scaler.pkl')

['scaler.pkl']

## f) Lineare Regression als Modell definieren und trainieren

In [17]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so creation and training are chained.
model = LinearRegression().fit(X_train_compl, y_train)
# Persist the trained regressor; the returned file list is the cell output.
joblib.dump(value=model, filename='model.pkl')

['model.pkl']

# Objektorientierte Umsetzung als Klasse

Ziel: Eine Klasse erstellen, die sowohl die Datenvorverarbeitung als auch die Vorhersage (den Schätzprozess) übernimmt.

## a) Definition der Klasse

In [18]:
from sklearn.metrics import mean_absolute_error, r2_score

In [35]:
class PredictInsurance():
    """Apply the stored preprocessing steps and model to raw feature frames.

    The instance holds a one-hot encoder, a scaler and a regression model and
    runs them in sequence: the categorical columns are encoded, the numeric
    columns are scaled, both blocks are joined column-wise and handed to the
    model.

    Parameters
    ----------
    ohe, scaler, model :
        Either a file name / path-like / open file object that is loaded with
        ``joblib.load``, or an already-loaded object that is used as-is.
        ``ohe`` and ``scaler`` must offer ``transform``; ``model`` must offer
        ``predict``.
    ohe_cols :
        Column name(s) passed to the encoder. A single string, a list or any
        other iterable of names is accepted.
    scale_cols :
        Column name(s) passed to the scaler; same accepted forms as
        ``ohe_cols``.
    """

    def __init__(self, ohe, scaler, model,
                 ohe_cols=['sex','smoker','region'], scale_cols=['age','bmi','children']):
        self.ohe = self.__load_file(ohe)
        self.scaler = self.__load_file(scaler)
        self.model = self.__load_file(model)
        # Store private copies: the default lists in the signature are created
        # once and would otherwise be shared (and mutable) across all instances.
        self.ohe_cols = self.__as_column_list(ohe_cols)
        self.scale_cols = self.__as_column_list(scale_cols)

    @staticmethod
    def __as_column_list(columns):
        """Return ``columns`` as a new list; a lone string becomes a one-item list."""
        if isinstance(columns, str):
            return [columns]
        return list(columns)

    def __load_file(self, file_name):
        """Load ``file_name`` with joblib, or return it unchanged if it is not a file reference.

        Strings, bytes, path-like objects and objects with a ``read`` method
        are forwarded to ``joblib.load``; anything else is assumed to be an
        already-loaded object.
        """
        looks_like_path = isinstance(file_name, (str, bytes)) or hasattr(file_name, '__fspath__')
        looks_like_stream = hasattr(file_name, 'read')
        if looks_like_path or looks_like_stream:
            # SECURITY: joblib.load unpickles the file and can therefore execute
            # arbitrary code; only pass files from a trusted source.
            return joblib.load(file_name)
        return file_name

    def __preprocess(self,X_pred):
        """Encode and scale the configured columns and join them column-wise.

        The encoded block comes first, followed by the scaled block.
        """
        X_ohe = self.__ohe(X_pred[self.ohe_cols])
        X_std = self.__scale(X_pred[self.scale_cols])
        X_compl = np.concatenate([X_ohe, X_std], axis = 1)
        return X_compl

    def __ohe(self, X_ohe):
        """Return the encoder's ``transform`` output for ``X_ohe``."""
        return self.ohe.transform(X_ohe)

    def __scale(self, X_std):
        """Return the scaler's ``transform`` output for ``X_std``."""
        return self.scaler.transform(X_std)

    def predict(self, X_pred):
        """Preprocess ``X_pred`` and return the model's predictions for it."""
        X_compl = self.__preprocess(X_pred)
        return self.model.predict(X_compl)

    def evaluate(self, X_pred, y_true):
        """Predict for ``X_pred`` and score the result against ``y_true``.

        Returns
        -------
        tuple
            ``(mean_absolute_error, r2_score)`` in that order.
        """
        y_pred = self.predict(X_pred)
        return (mean_absolute_error(y_true,y_pred), r2_score(y_true,y_pred))

## b) Verwendung der Klasse

In [36]:
# Build the predictor from the three artefact files (encoder, scaler, model).
predictor = PredictInsurance('ohe.pkl', 'scaler.pkl', 'model.pkl')

# evaluate() returns the mean absolute error and the R² score, in that order.
mae, r2 = predictor.evaluate(X_test, y_test)
mae, r2

(3798.3039139141792, 0.800513805935222)