## Arbeitsschritte
* Daten laden +
* EDA +
* Cleaning +
* Feature Engineering
* Modelle bauen (ML und NN)
* Hyperparameter-Tuning für die besten Modelle
* Ensemble für Modelle
* Feature Importance / Permutation Importance
* Submission auf Kaggle (+ Screenshot)
* 2 geile Grafiken
* je Kapitel eine Zusammenfassung

Siehe itslearning-Aufgabe

# Mies viele Kommentare

# Daten Import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from best_params import make
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LinearRegression
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
# from cuml.ensemble import RandomForestRegressor
# from cuml.neighbors import KNeighborsRegressor
# from cuml.linear_model import LinearRegression
from sklearn.svm import SVR
import torch
import torch.nn as nn
import json
import torchvision.datasets as datasets

# Read the training records and the store metadata from the working directory.
df, store_df = (pd.read_csv(csv_name) for csv_name in ("train.csv", "store.csv"))

# Attach the store metadata to every training row; the left join keeps all
# rows of `df`, matched on the shared "Store" column.
big_df = pd.merge(df, store_df, how="left", on="Store")
store_df

## EDA

In [None]:
# Exploratory overview: five panels on a 3x2 grid.
fig, axs = plt.subplots(3, 2, figsize=(10, 5))

# Plot 1: total sales per day of week
s_b_d = big_df["Sales"].groupby(big_df["DayOfWeek"]).sum()
axs[0, 0].plot(s_b_d)
axs[0, 0].set_title('Sales by Day of Week')

# Plot 2: scatter plot of customers vs sales
axs[0, 1].scatter(big_df["Customers"], big_df["Sales"])
axs[0, 1].set_title('Customers vs Sales')

# Plot 3: total sales per store type
sales_by_storeType = big_df["Sales"].groupby(big_df["StoreType"]).sum()
axs[1, 0].plot(sales_by_storeType)
axs[1, 0].set_title('Sales by Store Type')

# Plot 4: number of stores per store type.
# Counted on store_df rather than on the merged frame, which would count
# sales records instead of stores (presumably store_df holds one row per
# store -- verify). value_counts() orders by frequency, so the labels are
# taken from the sorted result instead of being hard-coded.
amt_stores_by_type = store_df["StoreType"].value_counts().sort_index()
axs[1, 1].bar(amt_stores_by_type.index, amt_stores_by_type.values)
axs[1, 1].set_title('Amount of Stores by Type')

# Sales per customer for every record. Rows with zero customers produce
# NaN/inf here; the column is kept on big_df because later cells rely on it.
sales_per_customer = big_df["Sales"] / big_df["Customers"]
big_df["sales_per_customer"] = sales_per_customer

# Plot 5: mean sales per customer by store type. Labels and values both come
# from the same grouped result so they cannot get out of order.
mean_spc_by_type = big_df["sales_per_customer"].groupby(big_df["StoreType"]).mean()
m_sales_customer_by_st = mean_spc_by_type.values.tolist()
axs[2, 0].plot(mean_spc_by_type.index, m_sales_customer_by_st)
axs[2, 0].set_title("mean_sales_p_cust_by_stoTyp")

# The grid has six slots but only five plots; hide the empty one.
axs[2, 1].axis("off")

# Display the plots
plt.tight_layout()
plt.show()

## Datacleaning

In [None]:
# Missing competition distances are encoded as 0.
big_df["CompetitionDistance"] = big_df["CompetitionDistance"].fillna(0)

# Rows whose competition distance is 0 (including the ones just filled).
comp_zeros = big_df["CompetitionDistance"] == 0

# For those rows the "open since" columns are set to 0 as well.
# A single .loc assignment is used instead of chained indexing
# (df[col][mask] = ...), which triggers SettingWithCopyWarning and does not
# modify the frame when pandas copy-on-write is active.
big_df.loc[comp_zeros, ["CompetitionOpenSinceMonth", "CompetitionOpenSinceYear"]] = 0

# NOTE(review): this drops every row that still has a missing value in any
# column, which may remove a large share of the data -- confirm intended.
big_df = big_df.dropna(axis=0)
big_df.head(5)

## Feature Engineering

In [None]:
# The date is not split into year/month/day features at the moment; "Date"
# and the "Store" identifier are removed from the frame entirely.
to_drop = ["Date", "Store"]
big_df = big_df.drop(columns=to_drop)

# Cast the competition and Promo2 columns to integers in one pass.
int_columns = [
    "CompetitionDistance",
    "CompetitionOpenSinceMonth",
    "CompetitionOpenSinceYear",
    "Promo2SinceWeek",
    "Promo2SinceYear",
]
big_df = big_df.astype({column: int for column in int_columns})

# Replace the letter codes of the categorical columns by ordinal integers.
store_type_dict = {letter: code for code, letter in enumerate("abcd")}
assortment_dict = {letter: code for code, letter in enumerate("abc")}

big_df["StoreType"] = big_df["StoreType"].map(store_type_dict)
big_df["Assortment"] = big_df["Assortment"].map(assortment_dict)

big_df

In [None]:
# List the distinct promo interval strings that occur in the data.
print(big_df["PromoInterval"].unique()) # promos recur in a 3-month interval

In [None]:
# Build one 0/1 indicator column per calendar month ("jan" .. "dec") from
# PromoInterval. Only the first month of the interval string is inspected; a
# row starting in "Jan" gets jan/apr/jul/oct, "Feb" gets feb/may/aug/nov and
# "Mar" gets mar/jun/sep/dec. Any other start leaves all twelve flags at 0.
promo_start = big_df["PromoInterval"].str.split(",").str[0]

# Calendar month column -> interval start month that activates it.
# Listed in calendar order so the new columns are appended as jan..dec.
promo_cycle_start = {
    "jan": "Jan", "feb": "Feb", "mar": "Mar",
    "apr": "Jan", "may": "Feb", "jun": "Mar",
    "jul": "Jan", "aug": "Feb", "sep": "Mar",
    "oct": "Jan", "nov": "Feb", "dec": "Mar",
}

# Vectorised comparison instead of a Python-level loop over every row.
for month_col, cycle_start in promo_cycle_start.items():
    big_df[month_col] = (promo_start == cycle_start).astype("int64")

big_df = big_df.drop("PromoInterval", axis=1)
big_df


In [None]:
# Convert every column name from CamelCase to snake_case: an underscore is
# inserted before each capital letter (except at the start of the name) and
# the result is lower-cased.
camel_boundary = re.compile(r'(?<!^)(?=[A-Z])')
big_df.rename(
    columns=lambda name: camel_boundary.sub('_', name).lower(),
    inplace=True,
)
big_df

In [None]:
# Inspection only: the distinct raw values of the column.
big_df["state_holiday"].unique()

# Ordinal encoding of the holiday codes. The map carries both the string "0"
# and the integer 0 as keys (presumably the CSV column was parsed with mixed
# types -- verify); both now map to the same code 0 so that one category is
# not split into two different numeric values.
holiday_map = {"0": 0, "a": 1, "b": 2, "c": 3, 0: 0}

big_df["state_holiday"] = big_df["state_holiday"].map(holiday_map)

In [None]:
# Show the frame after all encoding steps for a visual check.
big_df

In [None]:
# Separate the features from the target column.
# NOTE(review): X still contains "customers" and "sales_per_customer", whose
# product is the target "sales" -- confirm that this leakage is acceptable.
X = big_df.drop("sales", axis=1)
y = big_df["sales"]

# The first `border` rows are used for training, the remainder for testing
# (positional split, no shuffling).
border = 100000

# Fit the scaler on the training rows only so that no statistics of the test
# rows leak into the features, then transform all rows with it.
scaler = StandardScaler()
scaler.fit(X.iloc[:border])
x_scaled = scaler.transform(X)

# Keep the original index so the scaled frame stays label-aligned with `y`.
x_scaled_df = pd.DataFrame(x_scaled, columns=X.columns, index=X.index)

X_train = x_scaled_df.iloc[:border]
y_train = y.iloc[:border]
X_test = x_scaled_df.iloc[border:]
y_test = y.iloc[border:]

print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

# The hyper-parameter search itself (`make(X_train, y_train)` from
# best_params) is disabled here; its stored result is read from disk instead.
print("Load the best parameters from json file")
with open("best_params.json", "r") as params_file:
    params_f = json.load(params_file)


def make_voting(X_train, y_train, X_test, y_test):
    """Train a voting ensemble of five regressors and report its test scores.

    The hyper-parameters of every base model are read from the module-level
    ``params_f`` dictionary (keys "knn", "tree", "forest", "svm", "lin_reg").

    Args:
        X_train: training features.
        y_train: training targets.
        X_test: test features used for the printed metrics.
        y_test: test targets used for the printed metrics.

    Returns:
        The fitted ``VotingRegressor``.
    """
    best_knn_params = params_f["knn"]
    best_tree_params = params_f["tree"]
    best_forest_params = params_f["forest"]
    best_svm_params = params_f["svm"]
    best_lin_reg_params = params_f["lin_reg"]

    knn = KNeighborsRegressor(n_neighbors=best_knn_params["kneighborsregressor__n_neighbors"],
                             weights=best_knn_params["kneighborsregressor__weights"],
                             algorithm=best_knn_params["kneighborsregressor__algorithm"])
    tree = DecisionTreeRegressor(max_depth=best_tree_params["decisiontreeregressor__max_depth"],
                                 min_samples_split=best_tree_params["decisiontreeregressor__min_samples_split"],
                                 min_samples_leaf=best_tree_params["decisiontreeregressor__min_samples_leaf"])
    forest = RandomForestRegressor(n_estimators=best_forest_params["randomforestregressor__n_estimators"],
                                   max_depth=best_forest_params["randomforestregressor__max_depth"],
                                   min_samples_split=best_forest_params["randomforestregressor__min_samples_split"],
                                   min_samples_leaf=best_forest_params["randomforestregressor__min_samples_leaf"])
    svm = SVR(kernel=best_svm_params["svr__kernel"],
             degree=best_svm_params["svr__degree"],
             C=best_svm_params["svr__C"])
    lin_reg = LinearRegression(fit_intercept=best_lin_reg_params["linearregression__fit_intercept"])

    # The base models are not fitted individually: VotingRegressor.fit trains
    # its own clones of the estimators, so a separate fit would only repeat
    # the (expensive) training without affecting the ensemble.
    voting = VotingRegressor(estimators=[("knn", knn), ("tree", tree), ("forest", forest), ("svm", svm), ("lin_reg", lin_reg)], n_jobs=-1)

    voting.fit(X_train, y_train)
    print("voting done")

    # Predict once and reuse the result for both metrics.
    predictions = voting.predict(X_test)

    error = mean_squared_error(y_test, predictions)
    print("ERROR:", error)

    # A regressor's score is the coefficient of determination (R^2), not an
    # accuracy. It is computed from the predictions already at hand; the
    # previous call passed (y_test, predictions) to score(), which expects
    # (X, y) and therefore failed.
    from sklearn.metrics import r2_score

    accuracy = r2_score(y_test, predictions)
    print("R^2:", accuracy)

    return voting

# make_voting(X_train, y_train, X_test, y_test) # error: 276596.59625331557

In [None]:
class Model(nn.Module):
    """Fully connected regression network with four hidden layers.

    Every linear layer, including the single-unit output layer, is followed
    by a ReLU, so predictions are never negative.

    Args:
        in_features: number of input features. When omitted, the width of the
            module-level feature frame ``X`` is used, which is what the
            argument-less ``Model()`` calls in this notebook rely on.
        hidden_sizes: widths of the four hidden layers, in order.

    Raises:
        ValueError: if ``hidden_sizes`` does not contain exactly four entries.
    """

    def __init__(self, in_features=None, hidden_sizes=(1000, 3000, 4000, 1000)):
        super().__init__()
        if in_features is None:
            in_features = X.shape[1]
        if len(hidden_sizes) != 4:
            raise ValueError("hidden_sizes must contain exactly four layer widths")

        self.l1_h, self.l2_h, self.l3_h, self.l4_h = hidden_sizes
        self.l5_h = 1

        # The layers keep their attribute names l1..l5 so that previously
        # saved state dicts still load.
        self.l1 = nn.Linear(in_features, self.l1_h)
        self.l2 = nn.Linear(self.l1_h, self.l2_h)
        self.l3 = nn.Linear(self.l2_h, self.l3_h)
        self.l4 = nn.Linear(self.l3_h, self.l4_h)
        self.l5 = nn.Linear(self.l4_h, self.l5_h)

        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the five linear layers in order, each followed by a ReLU."""
        # NOTE(review): the ReLU after the output layer yields zero gradient
        # whenever the raw output is negative -- confirm this is intended.
        for layer in (self.l1, self.l2, self.l3, self.l4, self.l5):
            x = self.relu(layer(x))

        return x
        
    
class Dataset(torch.utils.data.Dataset):
    """Hold features and targets as float32 tensors for use with a DataLoader.

    Both arguments must expose a ``.values`` attribute (as pandas objects
    do); the underlying arrays are copied into tensors once on construction.
    """

    def __init__(self, x, y):
        features, targets = (
            torch.tensor(part.values, dtype=torch.float32) for part in (x, y)
        )
        self.x = features
        self.y = targets

    def __len__(self):
        # Number of samples = size of the first dimension of the features.
        return self.x.shape[0]

    def __getitem__(self, idx):
        sample = self.x[idx]
        target = self.y[idx]
        return sample, target
    
# Train the network on the training split and store its weights on disk.
torch.manual_seed(1234)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("device:", device)
model = Model()
print("model initialized")
model.to(device)
print("model on device")
optmizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.1)
loss_fn = nn.MSELoss()
dataset = Dataset(X_train, y_train)
data_loader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=True)
schedular = torch.optim.lr_scheduler.CosineAnnealingLR(optmizer, T_max=20)

epochs = 15
losses = []

for epoch in range(epochs):
    running_loss = 0.0
    for xb, yb in data_loader:

        xb, yb = xb.to(device), yb.to(device)

        optmizer.zero_grad()
        # Drop only the trailing feature dimension; a bare squeeze() would
        # also collapse a batch of size 1 and break the shape match with yb.
        y_hat = model(xb).squeeze(-1)
        loss = loss_fn(y_hat, yb)
        loss.backward()
        optmizer.step()

        # Weight by batch size so the epoch value is a true per-sample mean.
        running_loss += loss.item() * xb.size(0)

    # The cosine schedule (T_max=20) is advanced once per epoch. Stepping it
    # after every batch made the learning rate swing through a full cosine
    # cycle every 40 batches.
    schedular.step()

    # Report the mean loss of the whole epoch rather than the last batch only.
    epoch_loss = running_loss / len(dataset)
    print(f"Epoch [{epoch + 1}/{epochs}], Loss: {epoch_loss}")
    losses.append(epoch_loss)

torch.save(model.state_dict(), "modelGPU.pt")

# Loss curve over the epochs.
plt.plot(range(epochs),losses, color="blue")
plt.legend(["loss"], loc="upper right")
plt.xlabel("epochs")
plt.ylabel("loss")
plt.show()

In [25]:
# Check if GPU is available and if not, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create model instance and load state dict.
# map_location lets weights that were saved from a GPU run load on a
# CPU-only machine as well.
tm = Model()
tm.load_state_dict(torch.load("modelGPU.pt", map_location=device))
tm = tm.to(device)  # Move model to the selected device
tm.eval()

count = 0  # not used in this cell
total = len(y_test)
total_diff = 0.0

# Evaluate in chunks instead of one row at a time; the chunk size matches
# the progress interval so a percentage is printed once per 1000 rows.
chunk_size = 1000
test_features = torch.tensor(X_test.values, dtype=torch.float32)
test_targets = torch.tensor(y_test.values, dtype=torch.float32)

# No gradients are needed for evaluation.
with torch.no_grad():
    for start in range(0, total, chunk_size):
        print((start / total) * 100)
        stop = start + chunk_size

        # Move the chunk to the device, predict, and bring the result back
        # to the CPU for comparison with the targets.
        xb = test_features[start:stop].to(device)
        y_hat = tm(xb).squeeze(-1).to("cpu")

        # Accumulate the absolute differences between prediction and target
        total_diff += torch.abs(y_hat - test_targets[start:stop]).sum().item()

# Calculate the Mean Absolute Error (NaN when the test set is empty)
mae = total_diff / total if total else float("nan")
print(f'Mean Absolute Error: {mae}')

0.0
0.5937889674009856
1.1875779348019713
1.781366902202957
2.3751558696039425
2.9689448370049285
3.562733804405914
4.1565227718069
4.750311739207885
5.344100706608871
5.937889674009857
6.531678641410843
7.125467608811828
7.719256576212814
8.3130455436138
8.906834511014786
9.50062347841577
10.094412445816756
10.688201413217742
11.281990380618728
11.875779348019714
12.4695683154207
13.063357282821686
13.65714625022267
14.250935217623656
14.84472418502464
15.438513152425628
16.032302119826614
16.6260910872276
17.219880054628586
17.813669022029572
18.407457989430558
19.00124695683154
19.59503592423253
20.188824891633512
20.782613859034498
21.376402826435484
21.97019179383647
22.563980761237456
23.157769728638442
23.751558696039428
24.345347663440414
24.9391366308414
25.532925598242386
26.126714565643372
26.720503533044354
27.31429250044534
27.90808146784633
28.501870435247312
29.095659402648298
29.68944837004928
30.28323733745027
30.877026304851256
31.47081527225224
32.06460423965323
32.6