# Natural Spline

# Data Import

In [1]:
import pandas as pd

df = pd.read_csv('ready_data.csv')
df

Unnamed: 0,housing_median_age,median_income,median_house_value,priceCat,person_per_household,share_bedrooms,rooms_per_household,sm_PpH,sm_RpH,proximity
0,41.0,8.3252,452.6,above,2.555556,0.146591,6.984127,0.938270,1.943640,COAST
1,21.0,8.3014,358.5,above,2.109842,0.155797,6.238137,0.746613,1.830682,COAST
2,52.0,7.2574,352.1,above,2.802260,0.129516,8.288136,1.030426,2.114825,COAST
3,52.0,5.6431,341.3,above,2.547945,0.184458,5.817352,0.935287,1.760845,COAST
4,52.0,3.8462,342.2,above,2.181467,0.172096,6.281853,0.779998,1.837665,COAST
...,...,...,...,...,...,...,...,...,...,...
19369,25.0,1.5603,78.1,below,2.560606,0.224625,5.045455,0.940244,1.618488,INLAND
19370,18.0,2.5568,77.1,below,3.122807,0.215208,6.114035,1.138732,1.810587,INLAND
19371,17.0,1.7000,92.3,below,2.325635,0.215173,5.205543,0.843993,1.649724,INLAND
19372,18.0,1.8672,84.7,below,2.123209,0.219892,5.329513,0.752929,1.673260,INLAND


In [2]:
df = df.drop(["priceCat"], axis=1)

# Vorgehensweise

In den nachfolgenden Punkten wird die Regression eines Natural Splines jeweils einmal mit **SK Learn** und **Statsmodels** durchgeführt.</br>

Vorarbeit wurde bereits in Data.ipynb erbracht.

SK Learn Pipeline verwendet eine Lasso-Regression mit dem ermittelten Hyperparameter aus *lasso_regression.ipynb*. Eventuell handelt es sich dadurch nicht um einen **Natural** Spline.

# Modellierung in SK Learn Pipeline

Für den Spline wird der Hyperparameter aus der Aufgabe zur Lasso-Regression verwendet. Es handelt sich nicht um einen Natural Spline</br>
Lambda: 0.063103048006874

In [3]:
from sklearn.metrics import mean_squared_error

# create function to obtain model mse
def model_results(model_name):

    # Training data
    pred_train = reg.predict(X_train)
    mse_train = round(mean_squared_error(y_train, pred_train, squared=True),4)
    rmse_train = round(mean_squared_error(y_train, pred_train, squared=False),4)

    # Test data
    pred_test = reg.predict(X_test)
    mse_test =round(mean_squared_error(y_test, pred_test, squared=True),4)
    rmse_test =round(mean_squared_error(y_test, pred_test, squared=False),4)

    # Print model results
    result = pd.DataFrame(
        {"model": model_name, 
        "mse_train": [mse_train],
        "rmse_train": [rmse_train],
        "mse_test": [mse_test], 
        "rmse_test": [rmse_test],
        }
        )
    
    return result;

In [5]:
X = df[["housing_median_age", "median_income", "sm_RpH", "sm_PpH", "proximity",]] #sm_ verwendet, um nicht nochmal den FunctionTransformer zuschreiben.
y = df["median_house_value"]

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [8]:
from sklearn.preprocessing import SplineTransformer, OneHotEncoder, StandardScaler
from sklearn.linear_model import Lasso
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.compose import make_column_transformer, ColumnTransformer

lasso = Lasso(alpha=0.063103048006874) # Bestes Alpha aus Aufgabe zur Lasso Regression
column_trans = ColumnTransformer(remainder='passthrough', transformers=[('onehotencoder', OneHotEncoder(),['proximity']), ('standscal', StandardScaler(),['median_income'])])

reg = make_pipeline(column_trans, SplineTransformer(n_knots=4, degree=3), lasso)
reg.fit(X_train, y_train)

y_pred = reg.predict(X_train)

In [9]:
model_results(model_name = "spline")

Unnamed: 0,model,mse_train,rmse_train,mse_test,rmse_test
0,spline,3361.4907,57.9784,3506.5874,59.2164


# Modellierung in Statsmodels & Patsy

In [3]:
from patsy import dmatrix
import statsmodels.api as sm
from sklearn.model_selection import train_test_split

In [17]:
X = df[["median_income"]] #sm_ verwendet, um nicht nochmal den FunctionTransformer zuschreiben.
y = df["median_house_value"]

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state = 1)

In [19]:
transformed_x3 = dmatrix("cr(train, df=3)", {"train": X_train},return_type='dataframe')

In [20]:
reg = sm.GLM(y_train, transformed_x3).fit()

In [21]:
# Training data
pred_train = reg.predict(dmatrix("cr(train, df=3)", {"train": X_train}, return_type='dataframe'))
mse_train = mean_squared_error(y_train, pred_train, squared=True)
rmse_train = mean_squared_error(y_train, pred_train, squared=False)

# Test data
pred_test = reg.predict(dmatrix("cr(test, df=3)", {"test": X_test}, return_type='dataframe'))
mse_test = mean_squared_error(y_test, pred_test, squared=True)
rmse_test = mean_squared_error(y_test, pred_test, squared=False)

# Save model results
model_results_ns = pd.DataFrame(
    {
    "model": "Natural spline (ns)", 
    "mse_train": [mse_train],  
    "rmse_train": [rmse_train],
    "mse_test": [mse_test], 
    "rmse_test": [rmse_test],
    })

model_results_ns

Unnamed: 0,model,mse_train,rmse_train,mse_test,rmse_test
0,Natural spline (ns),5400.39427,73.487375,5594.923795,74.799223
