# Musternotebook: Regression
Die dazugehörige Dokumentation der einzelnen Methoden finden Sie in `examplary_classification.ipynb`.

## Imports, Variables & Settings

In [111]:
# Clear the interactive namespace before anything else runs; -f skips the
# confirmation prompt so the cell also works under "Run All".
%reset -f

In [112]:
# Imports
import numpy as np          # Numpy as the math lib
import pandas as pd         # Pandas for manipulating dataframes
import bdsm                 # bsdm as general helpers
import seaborn as sns       # seaborn for datavisualisation
import sklearn as sk        # scikit-learn for modeling and utility

from scipy import stats as stats

from sklearn import svm
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Reproducibility: a single seed value drives NumPy's global RNG here and is
# passed on as `random_state` to the splits and seeded models further down.
seed = 1234
np.random.seed(seed)

# Colour map for seaborn plots, built from the base colour "#69d".
palette = sns.dark_palette(
    "#69d",
    as_cmap=True,
    reverse=False,
)

## Data Loading

In [113]:
# Penguin dataset from the `bdsm` helper library: load it, run the library's
# cleaning step, then convert it to numeric columns.
# NOTE(review): the SettingWithCopyWarning messages shown in this cell's output
# appear to come from inside the helper library, not from this cell - confirm.
df_data = (
    bdsm.datasets.penguins()
    .clean()
    .to_numeric()
)
df_data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype('float')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].fillna(df.groupby('Species')[col].transform('mean'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Sex'] = np.where(df['Sex'].isna(), 'Unknown', df['Sex'].str.title())
A value is trying to be set

Unnamed: 0,CulmenLength,CulmenDepth,FlipperLength,BodyMass,Species_cat,Island_cat,Sex_cat
0,39.100000,18.700000,181.000000,3750.000000,0,2,1
1,39.500000,17.400000,186.000000,3800.000000,0,2,0
2,40.300000,18.000000,195.000000,3250.000000,0,2,0
3,38.791391,18.346358,189.953642,3700.662252,0,2,2
4,36.700000,19.300000,193.000000,3450.000000,0,2,0
...,...,...,...,...,...,...,...
341,55.800000,19.800000,207.000000,4000.000000,1,1,1
342,43.500000,18.100000,202.000000,3400.000000,1,1,0
343,49.600000,18.200000,193.000000,3775.000000,1,1,1
344,50.800000,19.000000,210.000000,4100.000000,1,1,1


## Standardisierung des gesamten Datensatzes

In [114]:
# Show the per-column statistics of the frame before it is scaled.
# NOTE: pandas' .std() defaults to the sample estimate (ddof=1), whereas
# stats.zscore defaults to the population estimate (ddof=0), so the "Std"
# printed here differs slightly from the divisor actually used below.
print("Mean:")
print(df_data.mean(axis=0))
print("Std:")
print(df_data.std(axis=0))

# Z-score every column of the frame - this includes the encoded *_cat columns
# and BodyMass, which is used as the regression target later on.
# scipy documents the return value only as array-like, while the following
# cells select columns by name, so the labelled DataFrame is rebuilt explicitly
# instead of relying on zscore to hand one back.
# NOTE(review): the scaling statistics are computed on all rows, i.e. before
# the train/validation/test split, so held-out rows influence the scaling.
df_data = pd.DataFrame(
    stats.zscore(df_data, axis=0),
    index=df_data.index,
    columns=df_data.columns,
)

Mean:
CulmenLength       43.917431
CulmenDepth        17.148339
FlipperLength     200.930641
BodyMass         4202.839182
Species_cat         0.918605
Island_cat          0.662791
Sex_cat             0.552326
dtype: float64
Std:
CulmenLength       5.454118
CulmenDepth        1.973561
FlipperLength     14.060613
BodyMass         801.461347
Species_cat        0.893320
Island_cat         0.726194
Sex_cat            0.558680
dtype: float64


### Data Split
Bestimmen von data und labels

In [115]:
# Feature matrix: every column except the regression target.
data = df_data[
    ["CulmenLength", "CulmenDepth", "FlipperLength", "Species_cat", "Island_cat", "Sex_cat"]
]
# Regression target.
labels = df_data["BodyMass"]

**TODO:** Korrelation überprüfen (für diesen Schritt ist noch kein Code vorhanden).

### Train / Validation / Test Datasets

In [116]:
# Two-stage hold-out split, both stages seeded for repeatability:
#   1) 70 % training data vs. a 30 % remainder
#   2) the remainder again 70/30, i.e. roughly 21 % validation and 9 % test
#      of the full dataset
# train_test_split is imported explicitly rather than reached through
# `sk.model_selection`, which only resolves if that submodule happens to have
# been loaded by another import.
x_train, x_tmp, y_train, y_tmp = train_test_split(data, labels, test_size=0.30, random_state=seed)
x_valid, x_test, y_valid, y_test = train_test_split(x_tmp, y_tmp, test_size=0.30, random_state=seed)

# Every row must end up in exactly one of the three partitions.
assert len(x_train) + len(x_valid) + len(x_test) == len(data)

print(f"train/validation: {len(x_train)}/{len(x_valid)}")

train/validation: 240/72


### Models
#### Linear Regression

In [117]:
# Baseline: linear regression fitted on the training split and evaluated on
# the validation split.
model = LinearRegression()
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)

# For regressors, .score() returns the coefficient of determination (R^2).
print("R-squared: ", model.score(x_valid, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Mean Squared Error: ", mean_squared_error(y_valid, y_pred))

R-squared:  0.7522885002020309
Mean Squared Error:  0.2514817973699433


#### Decision Tree

In [118]:
# Single regression tree; seeded so that repeated runs build the same tree.
model = DecisionTreeRegressor(random_state=seed)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)

# For regressors, .score() returns the coefficient of determination (R^2).
print("R-squared: ", model.score(x_valid, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Mean Squared Error: ", mean_squared_error(y_valid, y_pred))

R-squared:  0.6677160590149094
Mean Squared Error:  0.33734147499915085


#### KNN

In [119]:
# k-nearest-neighbours regression using the 3 closest training samples.
model = KNeighborsRegressor(n_neighbors=3)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)

# For regressors, .score() returns the coefficient of determination (R^2).
print("R-squared: ", model.score(x_valid, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Mean Squared Error: ", mean_squared_error(y_valid, y_pred))

R-squared:  0.8520966046848659
Mean Squared Error:  0.15015456174341135


#### Support Vector Regression (SVR)

In [120]:
# Support vector regression with regularisation parameter C=0.9; all other
# hyper-parameters are left at the library defaults.
model = SVR(C=0.9)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)

# For regressors, .score() returns the coefficient of determination (R^2).
print("R-squared: ", model.score(x_valid, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Mean Squared Error: ", mean_squared_error(y_valid, y_pred))

R-squared:  0.8350158339405682
Mean Squared Error:  0.16749531068216994


#### Random Forest

In [121]:
# Random forest: 100 trees, each limited to at most 16 leaf nodes, trained in
# parallel (n_jobs=-1) and seeded for repeatable results.
model = RandomForestRegressor(
    n_estimators=100,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=seed
)
model.fit(x_train, y_train)
y_pred = model.predict(x_valid)

# For regressors, .score() returns the coefficient of determination (R^2).
print("R-squared: ", model.score(x_valid, y_valid))
# scikit-learn metrics expect (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the documented order keeps the call correct if
# the metric is ever swapped for an asymmetric one.
print("Mean Squared Error: ", mean_squared_error(y_valid, y_pred))

R-squared:  0.8078547669631535
Mean Squared Error:  0.19506978319368662
