In [39]:
import os

import mlflow
from pyspark.sql import SparkSession
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Build (or reuse) a local Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Directory holding the processed parquet splits. Defined once so the three
# reads below cannot drift apart; set NYC_TAXI_DATA_DIR to run the notebook on
# another machine. The default is the original location.
DATA_DIR = os.environ.get(
    "NYC_TAXI_DATA_DIR",
    "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/data/processed",
)

train = spark.read.parquet(f"{DATA_DIR}/train")
# NOTE: `eval` shadows the Python built-in of the same name. The name is kept
# because later cells read it; prefer something like `eval_df` in new code.
eval = spark.read.parquet(f"{DATA_DIR}/eval")
holdout = spark.read.parquet(f"{DATA_DIR}/holdout")

# Row counts per split (each count() is a Spark action over that split).
print(f"train rows : {train.count()}")
print(f"eval rows : {eval.count()}")
print(f"holdout rows : {holdout.count()}")

train.printSchema()

train rows : 57892523
eval rows : 10036979
holdout rows : 10378129
root
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: long (nullable = true)
 |-- PULocationID: long (nullable = true)
 |-- DOLocationID: long (nullable = true)
 |-- payment_type: long (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: void (nullable = true)
 |-- airport_fee: void (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Week: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- DayOfWeek: integer (nullable = true)
 |-- Hour: double (nullable = true)
 |-- Duration_minutes: double (nullable = true)



In [40]:

# Pull a small, seeded sample of the train and eval Spark splits into pandas
# and separate the regression target from the remaining columns.
target_col = "total_amount"

trainpd = train.sample(fraction=0.001, seed=42).toPandas()
X, y = trainpd.drop(columns=[target_col]), trainpd[target_col]

evalpd = eval.sample(fraction=0.001, seed=42).toPandas()
X_test, y_test = evalpd.drop(columns=[target_col]), evalpd[target_col]

# Baseline estimator configured with the "mean" strategy.
regressor = DummyRegressor(strategy="mean")


                                                                                

In [41]:
# Point MLflow at the local tracking directory.
mlflow.set_tracking_uri("/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/notebooks/mlruns")

# Activate the experiment. set_experiment() creates it when it does not exist
# yet and returns the Experiment, so no separate get/create step is needed.
experiment_name = "baseline_model"
experiment = mlflow.set_experiment(experiment_name)
experiment_id = experiment.experiment_id

# Fit the baseline on the training sample, score it on the eval sample and
# log the metrics to a single MLflow run.
with mlflow.start_run(run_name="baseline model with mean strategy"):
    regressor.fit(X, y)
    y_pred = regressor.predict(X_test)

    # MSE must come from mean_squared_error; it was previously computed with
    # mean_absolute_error, which made the logged MSE identical to the MAE.
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("MSE", mse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)


Traceback (most recent call last):
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 379, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 477, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1662, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-t

In [42]:
# Show which tracking store MLflow is currently configured to use.
print(f"Current tracking URI: {mlflow.get_tracking_uri()}")


Current tracking URI: /Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/notebooks/mlruns
