In [4]:
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from pyspark.sql import SparkSession
import mlflow

# Build (or reuse) the notebook's Spark session. It runs with the `local[*]`
# master, requests 4g for both driver and executor memory, binds the driver to
# the loopback address and pins the Spark UI to port 4040.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Location of the processed parquet splits. Defined once so that moving the
# project only requires changing this single line.
DATA_DIR = "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/data/processed"

# Column the model predicts, plus the sampling settings shared by every split
# that is pulled into pandas.
TARGET_COL = "total_amount"
SAMPLE_FRACTION = 0.001
SAMPLE_SEED = 42

# Read the three splits as Spark DataFrames.
train = spark.read.parquet(f"{DATA_DIR}/train")
# NOTE(review): `eval` shadows the Python builtin of the same name. The name is
# kept here because other cells may reference it; consider renaming it
# notebook-wide (e.g. to `eval_df`).
eval = spark.read.parquet(f"{DATA_DIR}/eval")
holdout = spark.read.parquet(f"{DATA_DIR}/holdout")

# Each count() is a Spark action over the corresponding split.
print(f"train rows : {train.count()}")
print(f"eval rows : {eval.count()}")
print(f"holdout rows : {holdout.count()}")

train.printSchema()

# Pull a small, seeded sample of the training split into pandas and separate
# the features from the target.
trainpd = train.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED).toPandas()
X = trainpd.drop(TARGET_COL, axis=1)
y = trainpd[TARGET_COL]

# Same sampling settings for the evaluation split.
evalpd = eval.sample(fraction=SAMPLE_FRACTION, seed=SAMPLE_SEED).toPandas()
X_test = evalpd.drop(TARGET_COL, axis=1)
y_test = evalpd[TARGET_COL]

# Baseline strategy, defined once so the estimator and the MLflow run name
# cannot drift apart (the run was previously labelled "median" while the
# regressor actually used "mean").
baseline_strategy = "mean"
regressor = DummyRegressor(strategy=baseline_strategy)

# Store tracking data in a SQLite database file named mlflow.db.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Activate the experiment; set_experiment creates it first when it does not
# exist yet, so no manual get-or-create branch is needed. The experiment is
# then looked up so `experiment` and `experiment_id` are always populated.
experiment_name = "baseline_model"
mlflow.set_experiment(experiment_name)
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = experiment.experiment_id

# Fit the baseline on the training sample, score it on the evaluation sample,
# and record the metrics and the fitted model in a single MLflow run.
with mlflow.start_run(run_name=f"baseline model with {baseline_strategy} strategy") as run:
    regressor.fit(X, y)
    y_pred = regressor.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)

    # Optional: also store the fitted model as a run artifact.
    mlflow.sklearn.log_model(regressor, "model")

    # Confirm the run while it is still active and report its ID.
    if run.info.status == "RUNNING":
        print(f"MLflow run is logged! Run ID: {run.info.run_id}")


print(f"MSE: {mse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2: {r2:.4f}")



train rows : 57892523
eval rows : 10036979
holdout rows : 10378129
root
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: void (nullable = true)
 |-- airport_fee: void (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Week: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Hour: double (nullable = true)
 |-- Duration_minutes: double (nullable = true)



  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


MLflow run is logged! Run ID: 9c549528291148bebda8de37db395921
MSE: 207.33
MAE: 8.93
R2: -0.0029


In [5]:
from pathlib import Path

from joblib import dump

# Destination of the serialized baseline model, relative to the notebook's
# working directory.
MODEL_PATH = "../models/mean_regressor.pkl"

# dump() does not create missing directories, so make sure the target folder
# exists before writing; this is a no-op when it is already there.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted regressor; the returned list of written file names is the
# cell's displayed output.
dump(regressor, MODEL_PATH)

['../models/mean_regressor.pkl']