## Aufgabe 4
Trainieren Sie drei verschiedene Modelle, die in der Vorlesung behandelt wurden: ein `lineares Modell` (einfache lineare Regression, Ridge, Lasso), einen `Entscheidungsbaum` und ein `Ensemble-Modell` (Gradient Boosting oder Random Forest).
1. Optimieren Sie die Hyperparameter der Modelle mittels Suche und Kreuzvalidierung. Überlegen Sie dazu zunächst (mit Hilfe der Vorlesungsunterlagen und der Dokumentation der Methoden in scikit-learn), was für die jeweiligen Modelle Hyperparameter sind und für welche sich eine Optimierung ggf. lohnen könnte.
2. Welches sind die wichtigsten Features für die jeweiligen Modelle?

# lineare Regression

In [49]:
# Importieren der erforderlichen Bibliotheken
import numpy as np
import pandas as pd
from pandas.api.types import CategoricalDtype
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [50]:
# Step 1: read the two raw CSV inputs (store table and training table) into
# DataFrames. The paths are relative to the notebook's working directory.
store_data_df = pd.read_csv(filepath_or_buffer='../data/raw/dmml1_stores.csv')
train_data_df = pd.read_csv(filepath_or_buffer='../data/raw/dmml1_train.csv')

In [51]:
# Impute missing 'CompetitionDistance' values with the column median.
# The filled Series is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning on pandas 2.x and no longer updates the parent
# DataFrame once copy-on-write is active.
median_distance = store_data_df['CompetitionDistance'].median()
store_data_df['CompetitionDistance'] = store_data_df['CompetitionDistance'].fillna(median_distance)

In [52]:
# Step 2: impute missing 'CompetitionDistance' values and merge the data sets.
# The fill is idempotent, so running it here again after an earlier
# imputation is harmless. The result is assigned back to the column because
# fillna(inplace=True) on a column selection is chained assignment, which
# warns on pandas 2.x and does not modify the DataFrame under copy-on-write.
median_distance = store_data_df['CompetitionDistance'].median()
store_data_df['CompetitionDistance'] = store_data_df['CompetitionDistance'].fillna(median_distance)

# Left join on 'Store ID': every training row is kept and receives the
# attributes of its store.
merged_data = train_data_df.merge(store_data_df, on='Store ID', how='left')

# These store columns are not used by the model and are removed up front.
merged_data = merged_data.drop(
    columns=[
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Promo2SinceWeek',
        'Promo2SinceYear',
        'PromoInterval',
    ]
)

In [53]:
# Step 3: parse the date column and derive calendar features from it.
merged_data['Date'] = pd.to_datetime(merged_data['Date'])
merged_data['Year'] = merged_data['Date'].dt.year
merged_data['Month'] = merged_data['Date'].dt.month
merged_data['Day'] = merged_data['Date'].dt.day
merged_data['WeekOfYear'] = merged_data['Date'].dt.isocalendar().week
# 'DayOfWeek' comes from the input data; 6 = Saturday, 7 = Sunday.
merged_data['Weekend'] = np.where(merged_data['DayOfWeek'].isin([6, 7]), 1, 0)

# German weekday names as an ordered categorical (Monday first).
# The names are looked up from the numeric weekday (Monday = 0) instead of
# day_name(locale='de_DE'), which raises when the 'de_DE' locale is not
# installed on the machine running the notebook.
cat_type = CategoricalDtype(categories=['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag'], ordered=True)
merged_data['Weekday'] = (
    merged_data['Date'].dt.dayofweek
    .map(dict(enumerate(cat_type.categories)))
    .astype(cat_type)
)

merged_data['Quarter'] = merged_data['Date'].dt.quarter
merged_data['DayOfYear'] = merged_data['Date'].dt.dayofyear
# NOTE(review): 'DayOfMonth' is computed exactly like 'Day' above, so the two
# columns are identical; kept to leave the feature set unchanged.
merged_data['DayOfMonth'] = merged_data['Date'].dt.day

# Meteorological seasons: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3,
# Sep-Nov -> 4. The code is computed vectorized and mapped to its label in
# one step; the previous Series.replace(inplace=True) on a column selection
# was chained assignment, which does not update the DataFrame under
# copy-on-write.
merged_data['Season'] = (merged_data['Month'] % 12 // 3 + 1).map(
    {1: 'Winter', 2: 'Frühling', 3: 'Sommer', 4: 'Herbst'}
)

# Remove the original Date column; its information now lives in the
# derived features.
merged_data.drop('Date', axis=1, inplace=True)

In [54]:
# Step 4: one-hot encode the categorical variables.
categorical_columns = ['StateHoliday', 'StoreType', 'Assortment', 'Season', 'Weekday']

# drop='first' removes one level per variable to avoid the dummy-variable
# trap. The encoder is left at its default sparse output and densified
# explicitly: the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and removed in 1.4, so neither spelling works on every
# version, whereas `.toarray()` does.
encoder = OneHotEncoder(drop='first')
encoded_columns = encoder.fit_transform(merged_data[categorical_columns]).toarray()

# Reuse merged_data's index so that concat aligns the rows one-to-one even if
# the frame does not carry a default RangeIndex.
encoded_columns_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=merged_data.index,
)
merged_data_encoded = pd.concat([merged_data, encoded_columns_df], axis=1)
# The raw categorical columns are replaced by their indicator columns.
merged_data_encoded.drop(categorical_columns, axis=1, inplace=True)



In [55]:
# Step 5: standardize the numeric feature columns.
# Excluded from scaling: the store identifier, the binary flags, and the
# columns 'Sales' and 'Customers'. 'Sales' is the regression target and
# 'Customers' is dropped from the features later; standardizing the target
# would make the reported RMSE a value in standard deviations instead of
# sales units.
# Columns with at most two distinct values (e.g. the one-hot indicators) are
# skipped as well.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics leak into the features -- consider fitting it on the
# training portion only (e.g. inside a Pipeline).
numerical_columns = [col for col in merged_data_encoded.columns
                     if col not in ['Store ID', 'Open', 'Promo', 'SchoolHoliday', 'Promo2', 'Weekend',
                                    'Sales', 'Customers']
                     and merged_data_encoded[col].nunique() > 2]

scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(merged_data_encoded[numerical_columns])
# Carry over the original index so the column-wise assignment below aligns
# row by row.
scaled_numerical_df = pd.DataFrame(
    scaled_numerical,
    columns=numerical_columns,
    index=merged_data_encoded.index,
)
for col in numerical_columns:
    merged_data_encoded[col] = scaled_numerical_df[col]

In [None]:
# Step 6: build the target vector and the feature matrix for the linear
# regression model, then hold out 20 % of the rows as a test set.
# 'Customers' is removed from the features together with the target 'Sales'.
y = merged_data_encoded['Sales']
X = merged_data_encoded.drop(columns=['Sales', 'Customers'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,  # fixed seed for a reproducible split
)

In [None]:
# Step 7: set up the linear regression estimator and wrap it in recursive
# feature elimination (RFE); remember how many candidate features exist.
_, n_features = X_train.shape
estimator = LinearRegression()
selector = RFE(estimator=estimator)

In [None]:
# Step 8: use GridSearchCV to find the best number of features for RFE.
# Every count from 1 up to all available features is tried, scored by
# negative mean squared error with 5-fold cross-validation on all cores.
param_grid = {'n_features_to_select': list(range(1, n_features + 1))}
grid_search = GridSearchCV(
    estimator=selector,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
# Fit the grid search on the training data.
grid_search.fit(X_train, y_train)

In [None]:
# Step 9: take the best feature count found by the grid search and fit a
# fresh RFE selector with exactly that many features on the training data.
best_n_features = grid_search.best_params_['n_features_to_select']
selector = RFE(
    estimator=estimator,
    n_features_to_select=best_n_features,
).fit(X_train, y_train)

In [None]:
# Step 10: split the column names into kept and discarded features using the
# selector's boolean support mask, and reduce both splits to the kept columns.
feature_mask = selector.support_
selected_features = X_train.columns[feature_mask]
dropped_features = X_train.columns[~feature_mask]
X_train_selected = X_train.loc[:, feature_mask]
X_test_selected = X_test.loc[:, feature_mask]

In [None]:
# Step 11: train the final linear regression model on the selected features
# (fit() returns the fitted estimator itself).
final_model = LinearRegression().fit(X_train_selected, y_train)

In [None]:
# Step 12: predict on the held-out test set and compute RMSE and R-squared.
y_pred = final_model.predict(X_test_selected)
# Take the square root of the MSE explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# whereas this form works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_squared = r2_score(y_test, y_pred)

# Report the feature-selection outcome and the test-set metrics.
print(f"Optimale Anzahl von Features: {best_n_features} von {n_features}")
print(f"Ausgewählte Features: {selected_features.tolist()}")
print(f"Weggelassene Features: {dropped_features.tolist()}")
print(f"Root Mean Squared Error (RMSE) des finalen Modells: {rmse}")
print(f"R-Quadrat (R²) des finalen Modells: {r_squared}")
