In [2]:
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Enable MLflow autologging up front so the grid search fitted below is
# captured without explicit logging calls.
mlflow.autolog()

# Select the experiment BEFORE anything is logged. Logging calls such as
# mlflow.log_input() implicitly open a run in whichever experiment is active
# at that moment, so setting the experiment afterwards would be too late.
mlflow.set_experiment("Analiza danych Titanic")

# Load the data.
titanic = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

# Preprocessing: drop free-text columns, impute missing values with the
# median (Age) / most frequent value (Embarked), and one-hot encode the
# remaining categorical columns.
titanic = titanic.drop(['Name', 'Ticket', 'Cabin'], axis=1)
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
titanic = pd.get_dummies(titanic, drop_first=True)

X = titanic.drop('Survived', axis=1)
y = titanic['Survived']

# Split into training and test sets first ...
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ... and only then scale: the scaler is fitted on the training rows alone so
# that no statistics of the held-out test rows leak into the training features.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Hyperparameter grid to search over.
param_grid = {
    'C': [0.1, 1, 10],
    'solver': ['liblinear', 'lbfgs']
}

# Run everything that logs to MLflow inside an explicit run. The context
# manager ends the run even if training raises, so a failed cell does not
# leave a dangling active run behind.
with mlflow.start_run():
    # Record which dataset was used for training.
    mlflow_dataset = mlflow.data.from_pandas(titanic, targets="Survived",
                                             name="Titanic Dataset")
    mlflow.log_input(mlflow_dataset, context="training")

    # Base model.
    logreg = LogisticRegression(max_iter=200, random_state=42)

    # 3-fold cross-validated grid search using all available cores.
    grid_search = GridSearchCV(logreg, param_grid, cv=3, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Best model found by the search.
    best_model = grid_search.best_estimator_

    # Predict on the held-out set and evaluate.
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Store the held-out score explicitly with the run, then show it.
    mlflow.log_metric("test_accuracy", accuracy)
    print(accuracy)

2025/02/22 20:15:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/22 20:15:08 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2025/02/22 20:15:15 INFO mlflow.sklearn.utils: Logging the 5 best runs, one run will be omitted.


0.7988826815642458
