In [117]:
import pandas as pd

# Location of the employee CSV (absolute path on the author's machine).
data_path = "/home/rouven/python/UebungDatascience/KaggleDatasets/data/employee_data.csv"

# The CSV carries its row index in an "Unnamed: 0" column; use it as the
# DataFrame index and discard the separate "id" column.
df = (
    pd.read_csv(data_path, index_col="Unnamed: 0")
    .drop(columns=["id"])
)
df


Unnamed: 0,groups,age,healthy_eating,active_lifestyle,salary
0,A,36,5,5,2297
1,A,55,3,5,1134
2,A,61,8,1,4969
3,O,29,3,6,902
4,O,34,6,2,3574
...,...,...,...,...,...
995,O,33,7,7,2996
996,O,21,1,2,667
997,O,49,9,7,4158
998,AB,56,6,7,2414


In [118]:
# Summarise column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   groups            1000 non-null   object
 1   age               1000 non-null   int64 
 2   healthy_eating    1000 non-null   int64 
 3   active_lifestyle  1000 non-null   int64 
 4   salary            1000 non-null   int64 
dtypes: int64(4), object(1)
memory usage: 46.9+ KB


In [119]:
# Frequency of each category in the "groups" column.
df["groups"].value_counts()

groups
A     375
O     375
AB    125
B     125
Name: count, dtype: int64

In [120]:
# Frequency of each distinct value in the "healthy_eating" column.
df["healthy_eating"].value_counts()

healthy_eating
5     179
6     176
4     173
3     138
7     116
8      73
2      71
9      26
1      25
0      14
10      9
Name: count, dtype: int64

In [121]:
# Frequency of each distinct value in the "active_lifestyle" column.
df["active_lifestyle"].value_counts()

active_lifestyle
6     213
5     168
7     163
8     114
4     104
3      92
9      64
2      34
1      26
10     15
0       7
Name: count, dtype: int64

In [122]:
# Target: the salary column. Features: every remaining column of df.
y = df["salary"]
X = df.drop(columns=["salary"])

In [123]:
# Plain-text preview of the feature frame (salary column removed).
print(X)

    groups  age  healthy_eating  active_lifestyle
0        A   36               5                 5
1        A   55               3                 5
2        A   61               8                 1
3        O   29               3                 6
4        O   34               6                 2
..     ...  ...             ...               ...
995      O   33               7                 7
996      O   21               1                 2
997      O   49               9                 7
998     AB   56               6                 7
999      B   64               4                 9

[1000 rows x 4 columns]


In [124]:
# Positional column indices into the feature matrix X, whose column order is
# groups (0), age (1), healthy_eating (2), active_lifestyle (3).
categorical_cols = [0]  # groups
ordinal_cols = [2, 3]  # healthy_eating, active_lifestyle
numeric_cols = [1]  # age

### Helper

In [125]:
def print_grid_cv_results(grid_result) -> None:
    """Print a summary of a finished grid search.

    The first line reports the best score and the parameters that achieved
    it. One further line is printed per evaluated candidate, showing its
    mean test score, twice its test-score standard deviation, and its
    parameters; mean and standard deviation are rounded to four decimals
    before being printed.

    Args:
        grid_result: Object exposing ``best_score_``, ``best_params_`` and a
            ``cv_results_`` mapping with the keys ``"mean_test_score"``,
            ``"std_test_score"`` and ``"params"``.
    """
    print(f"Best model score: {grid_result.best_score_} Best model params: {grid_result.best_params_} ")

    cv = grid_result.cv_results_
    candidates = zip(cv["mean_test_score"], cv["std_test_score"], cv["params"])

    for avg_score, score_std, candidate in candidates:
        avg_rounded = round(avg_score, 4)
        std_rounded = round(score_std, 4)
        print(f"{avg_rounded} (+/- {2 * std_rounded}) with: {candidate}")


In [126]:
# Convert features and target to NumPy arrays so the positional column
# indices (categorical_cols / ordinal_cols / numeric_cols) can be used.
# Both arrays are rebuilt from `df` instead of from the existing `X` / `y`:
# re-running `X = X.to_numpy()` fails because an ndarray has no `.to_numpy`,
# whereas this form yields the same arrays on every execution.
X = df.drop(columns="salary").to_numpy()
y = df["salary"].to_numpy()


In [127]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The seed is fixed so the split, and
# therefore every score computed further down, is identical on each
# Restart & Run All; without it the shuffle differs from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [128]:
# Inspect the training features produced by the split.
X_train

array([['A', 54, 8, 5],
       ['O', 64, 6, 5],
       ['O', 28, 3, 3],
       ...,
       ['A', 30, 3, 2],
       ['O', 34, 2, 8],
       ['A', 32, 6, 6]], shape=(800, 4), dtype=object)

## Pipeline with passthrough of ordinal columns

In [129]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder


# Standardise the numeric column.
numeric_pipeline = Pipeline(steps=[("standardscaler", StandardScaler())])

# Expand the categorical column into dense one-hot indicator columns.
categorical_pipeline = Pipeline(
    steps=[("oneHotEncoder", OneHotEncoder(sparse_output=False))]
)

# Output columns follow the order of the transformers listed here:
# scaled numeric, one-hot categories, then the ordinal columns unchanged.
trf = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipeline, numeric_cols),
        ("cat_pipe", categorical_pipeline, categorical_cols),
        ("pass", "passthrough", ordinal_cols),
    ]
)

# Fit on the training split only; the test split reuses the fitted transformer.
X_train_pass = trf.fit_transform(X_train)
X_test_pass = trf.transform(X_test)
X_train_pass

array([[0.9333617060884627, 1.0, 0.0, ..., 0.0, 8, 5],
       [1.6784863092925044, 0.0, 0.0, ..., 1.0, 6, 5],
       [-1.003962262242046, 0.0, 0.0, ..., 1.0, 3, 3],
       ...,
       [-0.8549373416012377, 1.0, 0.0, ..., 0.0, 3, 2],
       [-0.556887500319621, 0.0, 0.0, ..., 1.0, 2, 8],
       [-0.7059124209604293, 1.0, 0.0, ..., 0.0, 6, 6]],
      shape=(800, 7), dtype=object)

## Pipeline with MinMax scaling of ordinal columns

In [130]:
from sklearn.preprocessing import MinMaxScaler


# Standardise the numeric column.
numeric_pipeline = Pipeline(steps=[("standardscaler", StandardScaler())])

# Expand the categorical column into dense one-hot indicator columns.
categorical_pipeline = Pipeline(
    steps=[("oneHotEncoder", OneHotEncoder(sparse_output=False))]
)

# Rescale the ordinal columns with MinMaxScaler instead of passing them through.
ordinal_pipeline = Pipeline(steps=[("MinMax", MinMaxScaler())])

# Output columns follow the order of the transformers listed here:
# scaled numeric, one-hot categories, then the min-max scaled ordinals.
trf = ColumnTransformer(
    transformers=[
        ("num_pipe", numeric_pipeline, numeric_cols),
        ("cat_pipe", categorical_pipeline, categorical_cols),
        ("ord_pipe", ordinal_pipeline, ordinal_cols),
    ]
)

# Fit on the training split only; the test split reuses the fitted transformer.
X_train_minmax = trf.fit_transform(X_train)
X_test_minmax = trf.transform(X_test)
X_train_minmax


array([[ 0.93336171,  1.        ,  0.        , ...,  0.        ,
         0.8       ,  0.5       ],
       [ 1.67848631,  0.        ,  0.        , ...,  1.        ,
         0.6       ,  0.5       ],
       [-1.00396226,  0.        ,  0.        , ...,  1.        ,
         0.3       ,  0.3       ],
       ...,
       [-0.85493734,  1.        ,  0.        , ...,  0.        ,
         0.3       ,  0.2       ],
       [-0.5568875 ,  0.        ,  0.        , ...,  1.        ,
         0.2       ,  0.8       ],
       [-0.70591242,  1.        ,  0.        , ...,  0.        ,
         0.6       ,  0.6       ]], shape=(800, 7))

## LinearRegression Baseline

### Passthrough ordinals

In [131]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Baseline: linear regression on the features with untouched ordinal columns.
linreg = LinearRegression().fit(X_train_pass, y_train)
y_pred = linreg.predict(X_test_pass)

# Evaluate on the held-out split.
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(f"r2 score: {r2}", f"mse: {mse}", sep="\n")

r2 score: 0.906593244496742
mse: 101570.74509289958


### MinMax-scaled ordinals

In [132]:
# Same baseline, fitted on the features with min-max scaled ordinal columns.
linreg = LinearRegression().fit(X_train_minmax, y_train)
y_pred = linreg.predict(X_test_minmax)

# Evaluate on the held-out split.
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

print(f"r2 score: {r2}", f"mse: {mse}", sep="\n")


r2 score: 0.9065932444967424
mse: 101570.74509289912


## KNeighborsRegressor

In [None]:
from sklearn.neighbors import KNeighborsRegressor

In [None]:
from sklearn.model_selection import GridSearchCV
