In [1]:
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

mlflow.set_tracking_uri("http://localhost:8001")
mlflow.autolog()

# Ładowanie danych
titanic = pd.read_csv('https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv')

# Przetwarzanie danych
titanic = titanic.drop(['PassengerId','Name', 'Ticket', 'Cabin'], axis=1)
titanic['Age'] = titanic['Age'].fillna(titanic['Age'].median())
titanic['Embarked'] = titanic['Embarked'].fillna(titanic['Embarked'].mode()[0])
titanic = pd.get_dummies(titanic, drop_first=True)
int_columns = titanic.select_dtypes(include='int').columns
titanic[int_columns] = titanic[int_columns].astype(float)

X = titanic.drop('Survived', axis=1)
y = titanic['Survived'].to_numpy()


# Podział na zestawy treningowy i testowy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Rozpoczęcie nowego eksperymentu
mlflow.set_experiment("Analiza danych Titanic")

# Definicja modelu Random Forest
rf_model = RandomForestClassifier(random_state=42)

# Definicja siatki hiperparametrów do przeszukiwania
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Konfiguracja GridSearchCV
grid_search = GridSearchCV(estimator=rf_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)

# Najlepszy mode
best_model = grid_search.best_estimator_

# Przewidywanie i ewaluacja
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(accuracy)

mlflow.end_run()

2025/02/22 20:23:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2025/02/22 20:23:10 INFO mlflow.tracking.fluent: Autologging successfully enabled for pyspark.
2025/02/22 20:23:12 INFO mlflow.utils.autologging_utils: Created MLflow autologging run with ID '66aa594604fc4866a802ddaab5287027', which will track hyperparameters, performance metrics, model artifacts, and lineage information for the current sklearn workflow


Fitting 5 folds for each of 108 candidates, totalling 540 fits


2025/02/22 20:23:26 INFO mlflow.sklearn.utils: Logging the 5 best runs, 103 runs will be omitted.


🏃 View run efficient-conch-478 at: http://localhost:8001/#/experiments/425328548753294341/runs/4e18bd2a05484a3c9fdf5cba7cbea920
🧪 View experiment at: http://localhost:8001/#/experiments/425328548753294341
🏃 View run polite-shrike-815 at: http://localhost:8001/#/experiments/425328548753294341/runs/ea39d0c047894847bb4768b010ae7f5c
🧪 View experiment at: http://localhost:8001/#/experiments/425328548753294341
🏃 View run capable-fawn-661 at: http://localhost:8001/#/experiments/425328548753294341/runs/506b34a8b3db47caa41be8298f47c261
🧪 View experiment at: http://localhost:8001/#/experiments/425328548753294341
🏃 View run casual-elk-440 at: http://localhost:8001/#/experiments/425328548753294341/runs/5bde6859bbaa4a12bae8756bdf397305
🧪 View experiment at: http://localhost:8001/#/experiments/425328548753294341
🏃 View run thoughtful-turtle-98 at: http://localhost:8001/#/experiments/425328548753294341/runs/f8a69949e6494917a24fc30a01dcbf96
🧪 View experiment at: http://localhost:8001/#/experiments/425