# Regression With Regularization

## 1. Pipeline

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
pd.options.display.max_columns = 50

from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [4]:
df = pd.read_csv("../Data/immo_data.csv")
desc = pd.read_csv("../Data/immo_data_column_description.csv")

df_reduced = df.drop(["serviceCharge", "totalRent", "scoutId", "houseNumber", "geo_bln", "geo_krs", "geo_plz", "date", "telekomHybridUploadSpeed"], axis=1, errors="ignore")
categorical_columns = df_reduced.select_dtypes(exclude=np.number).columns
for c in categorical_columns:
    if len(df_reduced[c].unique()) > 50:
        df_reduced.drop(c, axis=1, inplace=True)

interesting_columns = ["yearConstructed", "noParkSpaces", "baseRent", "livingSpace", "noRooms", "numberOfFloors", "heatingCosts", "lastRefurbish"]
upper_limits = df[interesting_columns].quantile(0.995)
lower_limits = df[interesting_columns].quantile(0.005)
for col in interesting_columns:
    df_reduced[col] = df[((df[col] < upper_limits[col]) & (df[col] > lower_limits[col])) | df[col].isna()][col]
df_reduced = df_reduced[df_reduced["baseRent"].isna() == False]

red_num = df_reduced.select_dtypes("number").columns
red_qua = df_reduced.select_dtypes("object").columns
imp_mean = SimpleImputer(missing_values=np.nan, strategy= 'mean').fit(df_reduced[red_num])
imp_most = SimpleImputer(missing_values=np.nan, strategy= 'most_frequent').fit(df_reduced[red_qua])
df_reduced[red_num] = imp_mean.transform(df_reduced[red_num])
df_reduced[red_qua] = imp_most.transform(df_reduced[red_qua])
df_reduced = pd.concat([pd.get_dummies(df_reduced[red_qua]),df_reduced], axis=1)
df_reduced = df_reduced.drop(red_qua, axis=1, errors="ignore")

X = df_reduced.drop("baseRent", axis=1)
X = X[df_reduced.regio1_Baden_Württemberg == 1]
y = df_reduced["baseRent"][df_reduced.regio1_Baden_Württemberg == 1]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, shuffle=True)
feature_names = (names for names in X)

scaler = MinMaxScaler()
scaler.fit(X_train, y_train)
X_train = pd.DataFrame(scaler.transform(X_train))
X_test = pd.DataFrame(scaler.transform(X_test))

## 2. Ridge Regression mit Kreuzvalidierung

### 2.1 K-Fold Cross Validation

In [5]:
from sklearn.model_selection import KFold
from sklearn import linear_model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
model = linear_model.Ridge()
mean_scores = []
stddev_scores = []
alphas = [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
for alpha in alphas:
    scores = []
    for train_index, test_index in kf.split(X_train):

        X_train_2, X_test_2 = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_2, y_test_2 = y_train.iloc[train_index], y_train.iloc[test_index]

        model.set_params(alpha=alpha)
        model.fit(X_train_2, y_train_2)
        scores.append(model.score(X_test_2, y_test_2))

    mean_scores.append(np.mean(np.array(scores)))
    stddev_scores.append(np.std(np.array(scores)))

print(mean_scores)
print(stddev_scores)

[0.8355096397120206, 0.8355096397125094, 0.8355096397173977, 0.8355096397662816, 0.8355096402551194, 0.8355096451433379, 0.8355096940097921, 0.8355101811025417, 0.8355148962019449, 0.835547655065587, 0.834960946946681]
[0.006312856201045064, 0.006312856201336555, 0.006312856204251616, 0.0063128562334019145, 0.006312856524904879, 0.006312859439931194, 0.006312888589883459, 0.00631318005821466, 0.006316091591572124, 0.006344865406743081, 0.006591254961093174]


### 2.2 Training für bestes Alpha

In [6]:
best_alpha_index = mean_scores.index(max(mean_scores))
best_alpha = alphas[best_alpha_index]

ridge = linear_model.Ridge(alpha=best_alpha)
ridge.fit(X_train, y_train)

y_test_pred = ridge.predict(X_test)

### 2.3 Evaluation

In [7]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

r2_test = r2_score(y_test, y_test_pred)
mse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = np.sqrt(mean_absolute_error(y_test, y_test_pred))

print(f"R² Testdaten: {r2_test}")
print(f"Mean squared error Testdaten: {mse_test}")
print(f"Mean absolute error Testdaten: {mae_test}")

R² Testdaten: 0.8396719715056218
Mean squared error Testdaten: 182.23734311132836
Mean absolute error Testdaten: 11.59163865138574


## 3. Lasso mit Kreuzvalidierung

### 3.1 K-Fold Cross Validation

Untersuchung von Hyperparmeter alpha  für das Model Lasso (L1-Regularisierung).

In [8]:
from sklearn.linear_model import Lasso

lasso = Lasso(max_iter=100)
mean_scores = []
stddev_scores = []
alphas = [0.000000001, 0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0, 10.0]
for alpha in alphas:
    scores = []
    for train_index, test_index in kf.split(X_train):

        X_train_2, X_test_2 = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_2, y_test_2 = y_train.iloc[train_index], y_train.iloc[test_index]

        lasso.set_params(alpha=alpha)
        lasso.fit(X_train_2, y_train_2)
        scores.append(lasso.score(X_test_2, y_test_2))

    # Speichern von Mittelwert und Standardabweichung über die 5 Läufe für jedes alpha
    mean_scores.append(np.mean(np.array(scores)))
    stddev_scores.append(np.std(np.array(scores)))

print(mean_scores)
print(stddev_scores)

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


[0.8355096369502739, 0.8355096369897842, 0.8355096373848869, 0.8355096413359275, 0.8355096806328366, 0.835510073370546, 0.8355139324074644, 0.8355491126717425, 0.8356932943576721, 0.8321944413298462, 0.8062370576022871]
[0.006312906965348381, 0.006312906958767069, 0.0063129068929535445, 0.006312906234743103, 0.0063129000169776985, 0.006312837707676724, 0.006312359306065945, 0.006306883741280788, 0.00628153395470915, 0.006907229899403617, 0.009182396139608208]


### 3.2 Training für bestes Alpha

In [9]:
best_alpha_index = mean_scores.index(max(mean_scores))
best_alpha = alphas[best_alpha_index]

lasso = linear_model.Lasso(max_iter=100)
lasso.alpha = best_alpha
lasso.fit(X_train, y_train)

Lasso(alpha=0.1, max_iter=100)

### 3.3 Evaluation

In [10]:
y_test_pred = lasso.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

r2_test = r2_score(y_test, y_test_pred)
mse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
mae_test = np.sqrt(mean_absolute_error(y_test, y_test_pred))

print(f"R² Testdaten: {r2_test}")
print(f"Mean squared error Testdaten: {mse_test}")
print(f"Mean absolute error Testdaten: {mae_test}")

R² Testdaten: 0.839824011623335
Mean squared error Testdaten: 182.1509141841807
Mean absolute error Testdaten: 11.595821722114323
