In [1]:
import pandas as pd
df = pd.read_csv("sales.csv")
print(df.head())

print(df.describe())  # Zeigt grundlegende Statistiken wie Mittelwert, Std., Min/Max-Werte
print(df.info())  # Zeigt Datentypen und fehlende Werte an

print(df.isnull().sum())  # Gibt die Anzahl der fehlenden Werte pro Spalte aus



          Produkt   Kategorie  Umsatz
0          Laptop  Elektronik    1200
1      Smartphone  Elektronik     850
2          Tablet  Elektronik     600
3  Kaffeemaschine    Haushalt     230
4     Staubsauger    Haushalt     310
            Umsatz
count    15.000000
mean    298.666667
std     341.271083
min      25.000000
25%      65.000000
50%     150.000000
75%     355.000000
max    1200.000000
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15 entries, 0 to 14
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Produkt    15 non-null     object
 1   Kategorie  15 non-null     object
 2   Umsatz     15 non-null     int64 
dtypes: int64(1), object(2)
memory usage: 492.0+ bytes
None
Produkt      0
Kategorie    0
Umsatz       0
dtype: int64


In [2]:
from sklearn.model_selection import train_test_split

X = df.drop("Umsatz", axis=1)  # Alle Features außer der Zielvariable
y = df["Umsatz"]  # Die Zielvariable (die du vorhersagen möchtest)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [3]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

#print(df.head())  # Zeigt dir die ersten paar Zeilen des DataFrames an


# One-Hot-Encoding für den Trainingsdatensatz
X_train_encoded = pd.get_dummies(X_train, drop_first=True)

# One-Hot-Encoding für den Testdatensatz und gleiche Spalten wie im Trainingsdatensatz
X_test_encoded = pd.get_dummies(X_test, drop_first=True)

# Sicherstellen, dass beide Datensätze die gleiche Spaltenstruktur haben
X_test_encoded = X_test_encoded.reindex(columns=X_train_encoded.columns, fill_value=0)
# Skaliere die Daten
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_encoded)
X_test_scaled = scaler.transform(X_test_encoded)

print(X_train_scaled.shape)  # Form der skalierten Trainingsdaten
print(X_test_scaled.shape)   # Form der skalierten Testdaten

#print(df.head())  # Zeigt dir die ersten paar Zeilen des DataFrames an

#print(X_train_encoded.columns)
#print(X_test_encoded.columns)



(12, 13)
(3, 13)


In [4]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

# Pipeline bauen (Ridge + optionaler Scaler)
model = Pipeline([
    ('scaler', StandardScaler()),  # Skaliert automatisch die Features
    ('regressor', Ridge())         # Ridge Regression
])

# Hyperparameter-Tuning für alpha (Regularisierung)
param_grid = {'regressor__alpha': [0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2')
grid_search.fit(X_train_encoded, y_train)  # Nutze die One-Hot-encoded Trainingsdaten

# Bestes Modell auswählen
best_model = grid_search.best_estimator_
print("Beste alpha:", grid_search.best_params_)

# Cross-Validation auf dem besten Modell
scores = cross_val_score(best_model, X_train_encoded, y_train, cv=5, scoring='r2')
print("R² Scores (CV):", scores)
print("Durchschnittliches R²:", scores.mean())


Beste alpha: {'regressor__alpha': 0.01}
R² Scores (CV): [-0.70199981 -0.79207573  0.16025723 -1.46527169  0.57158702]
Durchschnittliches R²: -0.44550059609808806


In [5]:
from sklearn.linear_model import LinearRegression

model = LinearRegression()
model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_test_scaled) 

'''from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
y_pred = model.predict(X_test_scaled)'''


'from sklearn.ensemble import RandomForestRegressor\nmodel = RandomForestRegressor(n_estimators=100, random_state=42)\nmodel.fit(X_train_scaled, y_train)\ny_pred = model.predict(X_test_scaled)'

In [6]:
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")


Mean Squared Error: 257153.64727608513
R² Score: 0.05373177468118162


In [7]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(model, X_train_encoded, y_train, cv=5)  # 5-fache Cross-Validation
print("Cross-Validation Scores:", scores)


Cross-Validation Scores: [-0.27863296 -0.64021567  0.3306213  -3.63537778  0.702558  ]


In [8]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

# Ridge Regression statt LinearRegression
model = Ridge()

param_grid = {"alpha": [0.001, 0.01, 0.1, 1, 10, 100]}

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'alpha': 0.001}


In [9]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)

from sklearn.metrics import r2_score, mean_squared_error

print("Test R²:", r2_score(y_test, y_pred))
print("Test MSE:", mean_squared_error(y_test, y_pred))


Test R²: 0.053766821920987695
Test MSE: 257144.12299396095


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import pandas as pd

df = pd.read_csv("titanic.csv")

num_features = ["Age","Fare"]                    # numerischen features
cat_features = ["Sex","Embarked"]                # kategorischen features

numeric = Pipeline([                                #pipeline für numerische features
    ("imputer", SimpleImputer(strategy="median")),  #füllt fehlende Zahlen mit dem Median(robust gegen ausreißer)
    ("scaler", StandardScaler())                    #skaliert die Spalten
])

categorical = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")), #ersetzt fehlende Kategorien mit dem häufigsten
    ("ohe", OneHotEncoder(handle_unknown="ignore")) #kathegorische werte werden in 0/1 spalten transformiert. neue kategorien werden igoniert
])

preprocessor = ColumnTransformer([  #ein einheitliches, transformiertes Feature-Array, das numerische und encodierte Spalten kombiniert
    ("num", numeric, num_features), #wende die numeric pipeline auf ["Age", "Fare"] an
    ("cat", categorical, cat_features) #wenn die kategorische pipeline auf ["Sex", "Embarked"] an
])


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

X = df[ num_features + cat_features ]
y = df["Survived"]  # Beispiel: Klassifikation 0/1 (Titanic)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

clf = Pipeline([
    ("preprocessor", preprocessor),            # <- alles von oben
    ("model", LogisticRegression(max_iter=500))
])

clf.fit(X_train, y_train)         # fit: Imputer/Scaler/Encoder lernen NUR vom Train-Set
y_pred = clf.predict(X_test)      # predict: gleiche Schritte auf Test-Set anwenden

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.776536312849162
              precision    recall  f1-score   support

           0       0.80      0.85      0.82       110
           1       0.73      0.67      0.70        69

    accuracy                           0.78       179
   macro avg       0.77      0.76      0.76       179
weighted avg       0.77      0.78      0.77       179

[[93 17]
 [23 46]]


In [12]:
from sklearn.preprocessing import FunctionTransformer
import numpy as np

def add_log_fare(X):
    X = X.copy()
    if "Fare" in X:
        X["Fare_log"] = np.log1p(X["Fare"])
    return X

feat = ColumnTransformer([
    ("num", numeric, num_features),
    ("cat", categorical, cat_features)
], remainder="passthrough")

pipe = Pipeline([
    ("add_features", FunctionTransformer(add_log_fare, feature_names_out="one-to-one")),
    ("prep", feat),
    ("model", LogisticRegression(max_iter=500))
])


In [13]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

candidates = {
    "logreg": LogisticRegression(max_iter=500),
    "tree": DecisionTreeClassifier(random_state=42),
    "rf": RandomForestClassifier(n_estimators=200, random_state=42)
}

for name, base_model in candidates.items():
    model = Pipeline([("prep", preprocessor), ("model", base_model)])
    scores = cross_val_score(model, X, y, cv=5, scoring="accuracy")
    print(name, scores.mean(), "±", scores.std())


logreg 0.7800075324838366 ± 0.020371452484517782
tree 0.7553825874081979 ± 0.026635028352338164
rf 0.7755633670202748 ± 0.02154501248678269


In [14]:
from sklearn.inspection import permutation_importance
result = permutation_importance(clf, X_test, y_test, n_repeats=10, random_state=42)
# result.importances_mean ... (Interpretation: größer = wichtiger)
result.importances_mean




array([ 0.00782123,  0.01731844,  0.22402235, -0.00055866])

In [15]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint

search_space = {
    "model__n_estimators": randint(100, 600),
    "model__max_depth": randint(3, 20),
    "model__min_samples_split": randint(2, 20)
}

search = RandomizedSearchCV(
    Pipeline([("prep", preprocessor), ("model", RandomForestClassifier(random_state=42))]),
    param_distributions=search_space,
    n_iter=30, cv=5, scoring="f1", random_state=42, n_jobs=-1
)
search.fit(X_train, y_train)

print("Best params:", search.best_params_)
print("CV best score:", search.best_score_)
best_model = search.best_estimator_


KeyboardInterrupt: 

In [None]:
import joblib
joblib.dump(best_model, "sales_model.pkl")
joblib.dump(clf, "best_logreg.pkl")

['best_logreg.pkl']

In [None]:
# app.py
from flask import Flask, request, jsonify
import pandas as pd
import joblib

app = Flask(__name__)
model = joblib.load("best_logreg.pkl")

@app.route("/predict", methods=["POST"])
def predict():
    payload = request.get_json()
    df = pd.DataFrame([payload])
    pred = model.predict(df)[0]
    return jsonify({"prediction": float(pred)})

if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8000)


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:8000
 * Running on http://192.168.178.106:8000
Press CTRL+C to quit
127.0.0.1 - - [21/Oct/2025 18:56:31] "GET / HTTP/1.1" 404 -
127.0.0.1 - - [21/Oct/2025 18:56:32] "GET /favicon.ico HTTP/1.1" 404 -
192.168.178.106 - - [21/Oct/2025 18:56:37] "GET / HTTP/1.1" 404 -
192.168.178.106 - - [21/Oct/2025 18:56:37] "GET /favicon.ico HTTP/1.1" 404 -
127.0.0.1 - - [21/Oct/2025 19:03:04] "GET /predict HTTP/1.1" 405 -
[2025-10-21 19:04:58,147] ERROR in app: Exception on /predict [POST]
Traceback (most recent call last):
  File "c:\Users\niros\OneDrive\Dokumente\ML-Learning\venv\Lib\site-packages\flask\app.py", line 1511, in wsgi_app
    response = self.full_dispatch_request()
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\niros\OneDrive\Dokumente\ML-Learning\venv\Lib\site-packages\flask\app.py", line 919, in full_dispatch_request
    rv = self.handle_user_exception(e)
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:

In [None]:
#lineares Modell
# Zugriff auf das trainierte Modell und die Feature-Namen
log_reg = clf.named_steps["model"]
feature_names = clf.named_steps["preprocessor"].get_feature_names_out()

# Koeffizienten der Features holen
coefficients = log_reg.coef_[0]

# DataFrame für bessere Übersicht
importance_df = pd.DataFrame({
    "Feature": feature_names,
    "Koeffizient": coefficients
}).sort_values(by="Koeffizient", ascending=False)

print(importance_df)


           Feature  Koeffizient
2  cat__Sex_female     1.161503
1        num__Fare     0.563674
4  cat__Embarked_C     0.326051
5  cat__Embarked_Q    -0.058283
0         num__Age    -0.170790
6  cat__Embarked_S    -0.304204
3    cat__Sex_male    -1.197939


In [None]:
#treebasiertes Modell
importances = best_model.named_steps["model"].feature_importances_
feature_names = best_model.named_steps["prep"].get_feature_names_out()

feat_imp = pd.Series(importances, index=feature_names).sort_values(ascending=False)
feat_imp.head(15).plot(kind="barh", figsize=(8,5))
plt.title("Feature Importance (Random Forest)")
plt.show()



NameError: name 'rf_pipeline' is not defined

In [None]:
import shap

# Schritt 1: Daten preprocessen (also numerisch machen)
X_test_preprocessed = clf.named_steps["preprocessor"].transform(X_test)

# Schritt 2: Explainer auf das trainierte Modell + preprocessete Daten anwenden
explainer = shap.LinearExplainer(
    clf.named_steps["model"], 
    masker=X_test_preprocessed
)

# Schritt 3: SHAP-Werte berechnen
shap_values = explainer(X_test_preprocessed)
print(shap_values)





.values =
array([[ 0.0739062 , -0.16936799, -0.37168091, ..., -0.04890761,
         0.00641113, -0.07909305],
       [-0.18852148, -0.26385093, -0.37168091, ..., -0.04890761,
         0.00641113, -0.07909305],
       [ 0.10014896, -0.3680169 , -0.37168091, ...,  0.27714313,
         0.00641113,  0.22511098],
       ...,
       [ 0.02142066, -0.32957819, -0.37168091, ..., -0.04890761,
         0.00641113, -0.07909305],
       [ 0.08702758, -0.29094347,  0.78982193, ...,  0.27714313,
         0.00641113,  0.22511098],
       [-0.01794349, -0.14472027, -0.37168091, ..., -0.04890761,
         0.00641113, -0.07909305]], shape=(179, 7))

.base_values =
array([-0.59466687, -0.59466687, -0.59466687, -0.59466687, -0.59466687,
       -0.59466687, -0.59466687, -0.59466687, -0.59466687, -0.59466687,
       -0.59466687, -0.59466687, -0.59466687, -0.59466687, -0.59466687,
       -0.59466687, -0.59466687, -0.59466687, -0.59466687, -0.59466687,
       -0.59466687, -0.59466687, -0.59466687, -0.59466687