In [1]:
# Importieren der erforderlichen Bibliotheken
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.feature_selection import RFE
from pandas.api.types import CategoricalDtype

In [2]:
# Step 1: Load the raw datasets (training records first, then store metadata).
train_data_df, store_data_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/dmml1_train.csv', '../data/raw/dmml1_stores.csv')
)

In [10]:
# Preview the store metadata table (columns, sample rows, missing values).
store_data_df

Unnamed: 0,Store ID,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
1,2,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
2,3,a,c,2030.0,8.0,2000.0,0,,,
3,4,a,c,1070.0,,,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
4,5,a,c,4590.0,3.0,2000.0,1,40.0,2011.0,"Jan,Apr,Jul,Oct"
...,...,...,...,...,...,...,...,...,...,...
295,296,a,a,690.0,6.0,2007.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
296,297,b,b,720.0,3.0,2002.0,0,,,
297,298,d,c,1340.0,10.0,2006.0,1,5.0,2013.0,"Feb,May,Aug,Nov"
298,299,d,a,260.0,2.0,2012.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"


In [3]:
# Step 2: Impute missing 'CompetitionDistance' values and merge the datasets.
#
# Missing distances are replaced by the median distance over all stores. The result
# is assigned back to the column: calling fillna(..., inplace=True) on a column
# selection is chained assignment, which pandas warns about and which leaves the
# frame unchanged once Copy-on-Write is active.
median_distance = store_data_df['CompetitionDistance'].median()
store_data_df['CompetitionDistance'] = store_data_df['CompetitionDistance'].fillna(median_distance)

# Left join: every training row is kept and enriched with its store's attributes.
# NOTE(review): the store table carries an 'Unnamed: 0' column (a saved row index);
# it is merged in here and ends up among the model features - confirm whether it
# should be dropped.
merged_data = train_data_df.merge(store_data_df, on='Store ID', how='left')

# Competition/Promo2 start dates and the promo interval are not used downstream.
merged_data = merged_data.drop(
    columns=[
        'CompetitionOpenSinceMonth',
        'CompetitionOpenSinceYear',
        'Promo2SinceWeek',
        'Promo2SinceYear',
        'PromoInterval',
    ]
)

In [4]:
# Step 3: Parse the date column and derive calendar features from it.
merged_data['Date'] = pd.to_datetime(merged_data['Date'])
date_parts = merged_data['Date'].dt

merged_data['Year'] = date_parts.year
merged_data['Month'] = date_parts.month
merged_data['Day'] = date_parts.day
merged_data['WeekOfYear'] = date_parts.isocalendar().week
# 'DayOfWeek' comes from the training data; 6 = Saturday, 7 = Sunday.
merged_data['Weekend'] = np.where(merged_data['DayOfWeek'].isin([6, 7]), 1, 0)

merged_data['Quarter'] = date_parts.quarter
merged_data['DayOfYear'] = date_parts.dayofyear
# NOTE(review): 'DayOfMonth' duplicates 'Day' exactly; kept so the feature set is
# unchanged, but one of the two could be removed.
merged_data['DayOfMonth'] = date_parts.day

# Meteorological seasons: Dec-Feb -> 1, Mar-May -> 2, Jun-Aug -> 3, Sep-Nov -> 4.
# The index is computed on the whole column at once and mapped to its label; the
# result is assigned back instead of using replace(..., inplace=True) on a column
# selection, which is chained assignment and unreliable under Copy-on-Write.
season_names = {1: 'Winter', 2: 'Frühling', 3: 'Sommer', 4: 'Herbst'}
merged_data['Season'] = (merged_data['Month'] % 12 // 3 + 1).map(season_names)

# Remove the original date column now that all features are derived from it.
merged_data = merged_data.drop(columns='Date')

In [None]:
# Step 4: One-hot encode the categorical variables.
categorical_columns = ['StateHoliday', 'StoreType', 'Assortment', 'Season', 'DayOfWeek']

# drop='first' removes one level per variable to avoid perfectly collinear dummies.
# The encoder is left at its default (sparse) output and densified explicitly: the
# `sparse=` keyword was deprecated and later removed from scikit-learn, whereas
# `.toarray()` works on every version.
encoder = OneHotEncoder(drop='first')
encoded_columns = encoder.fit_transform(merged_data[categorical_columns]).toarray()

# Reuse merged_data's index so the column-wise concat pairs each encoded row with
# its source row even if the index is not a default RangeIndex.
encoded_columns_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(categorical_columns),
    index=merged_data.index,
)

# Append the dummy columns and remove the original categorical columns.
merged_data_encoded = pd.concat([merged_data, encoded_columns_df], axis=1).drop(
    columns=categorical_columns
)



In [None]:
# Step 5: Standardise the numerical columns.
#
# A column is treated as numerical when it is not one of the explicitly excluded
# identifier/flag columns and has more than two distinct values (which also skips
# the 0/1 dummy columns).
# NOTE(review): 'Sales' and 'Customers' are not in the exclusion list, so they are
# standardised here as well if they have more than two distinct values; metrics
# computed on 'Sales' afterwards would then be in standard-deviation units, not
# sales units - confirm this is intended.
# NOTE(review): the scaler is fitted on the full dataset, i.e. before the
# train/test split performed later, so test rows influence the scaling statistics.
excluded_columns = ['Store ID', 'Open', 'Promo', 'SchoolHoliday', 'Promo2', 'Weekend']
numerical_columns = [
    col
    for col in merged_data_encoded.columns
    if col not in excluded_columns and merged_data_encoded[col].nunique() > 2
]

scaler = StandardScaler()
scaled_numerical = scaler.fit_transform(merged_data_encoded[numerical_columns])

# Carry over the source index so the write-back below is aligned row by row even
# when merged_data_encoded does not have a default RangeIndex.
scaled_numerical_df = pd.DataFrame(
    scaled_numerical,
    columns=numerical_columns,
    index=merged_data_encoded.index,
)
merged_data_encoded[numerical_columns] = scaled_numerical_df

In [None]:
# Separate the target ('Sales') from the predictors; 'Customers' is removed from
# the predictors together with the target.
y = merged_data_encoded['Sales']
X = merged_data_encoded.drop(columns=['Sales', 'Customers'])

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Standardise the features: statistics are learned on the training split only and
# then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Step 6: Data preparation for the linear model (feature/target split, train/test
# split and feature standardisation) is done in the preceding cell. This cell
# reuses X_train_scaled, X_test_scaled, y_train and y_test from there instead of
# repeating the identical code.

# Step 7: Fit a Lasso regression, tuning alpha with a 5-fold grid search.
# NOTE(review): the recorded run selected alpha=1e-05, the smallest value in the
# grid, and printed coordinate-descent warning lines - the grid may not bracket
# the optimum; consider revisiting the alpha range / tolerance.
parameters = {'alpha': [0.00001, 0.00005, 0.0001, 0.0005, 0.001], 'max_iter': [10000]}
lasso_regressor = GridSearchCV(
    Lasso(),
    parameters,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
lasso_regressor.fit(X_train_scaled, y_train)

# Evaluate the best model on the held-out test split.
best_lasso_model = lasso_regressor.best_estimator_
y_pred = best_lasso_model.predict(X_test_scaled)
# RMSE is taken as the square root of the MSE: the `squared=False` shortcut of
# mean_squared_error is deprecated and absent from newer scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r_squared = r2_score(y_test, y_pred)

print(f"Bestes Alpha für Lasso: {lasso_regressor.best_params_}")
print(f"Root Mean Squared Error (RMSE) des Lasso Modells: {rmse}")
print(f"R-Quadrat (R²) des Lasso Modells: {r_squared}")



  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Bestes Alpha für Lasso: {'alpha': 1e-05, 'max_iter': 10000}
Root Mean Squared Error (RMSE) des Lasso Modells: 0.668985685601276
R-Quadrat (R²) des Lasso Modells: 0.5527510581590486


  model = cd_fast.enet_coordinate_descent(
