In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import json
import os
import joblib

In [2]:
# 1. Load and prepare the data
# Location of the CSV file; adjust the path if the file lives elsewhere.
DATA_PATH = "dataset.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [3]:
# Take a first look at the data: print the first five rows as plain text.
print(df.head(n=5))

   Category Species  Weight   Height   Width  Length1  Length2  Length3
0         1   Bream   242.0  11.5200  4.0200     23.2     25.4     30.0
1         1   Bream   290.0  12.4800  4.3056     24.0     26.3     31.2
2         1   Bream   340.0  12.3778  4.6961     23.9     26.5     31.1
3         1   Bream   363.0  12.7300  4.4555     26.3     29.0     33.5
4         1   Bream   430.0  12.4440  5.1340     26.5     29.0     34.0


In [4]:
# Column overview (non-null counts and dtypes).
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159 entries, 0 to 158
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Category  159 non-null    int64  
 1   Species   159 non-null    object 
 2   Weight    159 non-null    float64
 3   Height    159 non-null    float64
 4   Width     159 non-null    float64
 5   Length1   159 non-null    float64
 6   Length2   159 non-null    float64
 7   Length3   159 non-null    float64
dtypes: float64(6), int64(1), object(1)
memory usage: 10.1+ KB
None


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
summary_stats = df.describe()
print(summary_stats)

         Category       Weight      Height       Width     Length1  \
count  159.000000   159.000000  159.000000  159.000000  159.000000   
mean     3.264151   398.326415    8.970994    4.417486   26.247170   
std      1.704249   357.978317    4.286208    1.685804    9.996441   
min      1.000000     0.000000    1.728400    1.047600    7.500000   
25%      2.000000   120.000000    5.944800    3.385650   19.050000   
50%      3.000000   273.000000    7.786000    4.248500   25.200000   
75%      4.500000   650.000000   12.365900    5.584500   32.700000   
max      7.000000  1650.000000   18.957000    8.142000   59.000000   

          Length2     Length3  
count  159.000000  159.000000  
mean    28.415723   31.227044  
std     10.716328   11.610246  
min      8.400000    8.800000  
25%     21.000000   23.150000  
50%     27.300000   29.400000  
75%     35.500000   39.650000  
max     63.400000   68.000000  


In [6]:
# Check for missing values: count the NaN entries in every column.
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)


Missing values:
 Category    0
Species     0
Weight      0
Height      0
Width       0
Length1     0
Length2     0
Length3     0
dtype: int64


In [7]:
# 2. Define the features and the target variable
# 'Weight' is the regression target; every other remaining column is a feature.
TARGET = "Weight"

# 'Species' is a text (object) column, so it is removed before fitting.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so on
# a second execution the column is already gone and a plain drop() would
# raise a KeyError.
# NOTE(review): 'Category' looks like an integer code for 'Species' and is fed
# to the model as an ordinary numeric feature -- confirm this is intended.
df = df.drop(columns=["Species"], errors="ignore")

# Feature list in DataFrame column order; this is also the order of the
# fitted model's coefficients.
FEATURES = [col for col in df.columns if col != TARGET]

X = df[FEATURES]
y = df[TARGET]

In [8]:
# 3. Split the data into training and test sets
# 20 % of the rows are held out for testing; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:
# 4. Model training
# Ordinary least-squares linear regression fitted on the training split.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [10]:
# 5. Model evaluation
# Predict on the held-out test split and report RMSE and R².
y_pred = model.predict(X_test)

# The `squared=False` argument of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6. Taking the square root of the MSE
# gives the same RMSE and works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
r2 = r2_score(y_test, y_pred)

print(f"\nModel performance:\nRMSE: {rmse:.2f}\nR²: {r2:.2f}")


Model performance:
RMSE: 117.17
R²: 0.90


In [20]:
# Save the model parameters
# The coefficients are only meaningful together with the order of the features
# they belong to, so the training column names are stored alongside them.
# The intercept is converted to a plain Python float so the JSON encoder never
# has to deal with a NumPy scalar type.
model_params = {
    "coefficients": model.coef_.tolist(),
    "intercept": float(model.intercept_),
    "features": list(X_train.columns),
}

# Path inside the current working directory (where the notebook is run from)
MODEL_PATH = os.path.join(os.getcwd(), "model.json")

# Write the JSON file; the explicit encoding makes the result independent of
# the platform's default locale.
with open(MODEL_PATH, "w", encoding="utf-8") as f:
    json.dump(model_params, f, indent=4)

print(f"\nModel saved to: {MODEL_PATH}")


Model saved to: C:\Users\alexa\Desktop\fish_pred\model.json
