In [1]:
import os
import warnings

import mlflow
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from sklearn.dummy import DummyRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score , root_mean_squared_error
warnings.filterwarnings("ignore")


# Root directory of the processed parquet datasets (train / eval / holdout).
# The default is the original local path; set the NYC_TAXI_DATA_DIR environment
# variable to run this notebook on another machine without editing the cell.
DATA_DIR = os.environ.get(
    "NYC_TAXI_DATA_DIR",
    "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/data/processed",
)

# Local Spark session that uses all cores of this machine (`local[*]`).
# getOrCreate() returns the already-running session when the cell is re-run.
spark = (
    SparkSession.builder
    .appName("NYC Taxi Analysis")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.bindAddress", "127.0.0.1")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

train = spark.read.parquet(os.path.join(DATA_DIR, "train"))
# NOTE: `eval` shadows the Python builtin of the same name. The name is kept
# because the model-training cell below reads the evaluation split through it.
eval = spark.read.parquet(os.path.join(DATA_DIR, "eval"))
holdout = spark.read.parquet(os.path.join(DATA_DIR, "holdout"))

# Each count() triggers a Spark job over the corresponding dataset.
print(f"train rows : {train.count()}")
print(f"eval rows : {eval.count()}")
print(f"holdout rows : {holdout.count()}")


Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
26/02/13 23:20:34 WARN Utils: Your hostname, Anatolijs-MacBook-Air-2.local, resolves to a loopback address: 127.0.0.1; using 192.168.1.14 instead (on interface en0)
26/02/13 23:20:34 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
26/02/13 23:20:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


train rows : 57892523
eval rows : 10036979
holdout rows : 10378129


In [2]:
# Count the NULL values in every column of the training set.
# Each column is turned into a 0/1 "is null" flag and summed, and the result
# is aliased back to the original column name so the one-row output table
# keeps the same header as `train`.
null_count_exprs = []
for column_name in train.columns:
    is_null_flag = F.col(column_name).isNull().cast('int')
    null_count_exprs.append(F.sum(is_null_flag).alias(column_name))

train.select(null_count_exprs).show()



+-------------+----------+------------+------------+------------+------------+----+-----+----+---+---------+----+----------------+
|trip_distance|RatecodeID|PULocationID|DOLocationID|payment_type|total_amount|Year|Month|Week|Day|DayOfWeek|Hour|Duration_minutes|
+-------------+----------+------------+------------+------------+------------+----+-----+----+---+---------+----+----------------+
|            0|         0|           0|           0|           0|           0|   0|    0|   0|  0|        0|   0|               0|
+-------------+----------+------------+------------+------------+------------+----+-----+----+---+---------+----+----------------+



                                                                                

In [3]:
import mlflow.sklearn

from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso , Ridge
from sklearn.linear_model import LinearRegression

# All runs started in this cell are grouped under one MLflow experiment.
mlflow.set_experiment("Sklearn models")

# Tunable settings for this experiment.
SEED = 42                 # shared by Spark sampling and the stochastic models
SAMPLE_FRACTION = 0.001   # fraction of each Spark dataset pulled into pandas
TARGET = "total_amount"   # column the models predict

# Down-sample in Spark first so that only a small slice is collected to the driver.
train_pd = train.sample(seed=SEED, fraction=SAMPLE_FRACTION).toPandas()
eval_pd = eval.sample(seed=SEED, fraction=SAMPLE_FRACTION).toPandas()

# The sample size does not change between models, so report it once here
# instead of once per loop iteration.
print(train_pd.shape)

# Every column except the target is used as a feature.
X_train = train_pd.drop(columns=TARGET)
y_train = train_pd[TARGET]
X_test = eval_pd.drop(columns=TARGET)
# Series (not a one-column DataFrame) so that y_test has the same shape as y_train.
y_test = eval_pd[TARGET]

# Candidate regressors with default hyper-parameters. The stochastic ones get a
# fixed seed so that re-running the cell reproduces the same metrics.
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(random_state=SEED),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "HistGradientBoostingRegressor": HistGradientBoostingRegressor(random_state=SEED),
}

# Fit each model on the training sample, score it on the evaluation sample,
# and record the metrics in a dedicated MLflow run.
for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name} with all features"):
        print("--" * 50)
        print(f"{model_name}")
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = root_mean_squared_error(y_test, y_pred)

        # One call logs all four metrics to the active run.
        mlflow.log_metrics({"mse": mse, "mae": mae, "r2": r2, "rmse": rmse})

        print(f"mse : {mse}")
        print(f"mae : {mae}")
        print(f"r2 : {r2}")
        print(f"rmse : {rmse}")



Traceback (most recent call last):
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 379, in search_experiments
    exp = self._get_experiment(exp_id, view_type)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 477, in _get_experiment
    meta = FileStore._read_yaml(experiment_dir, FileStore.META_DATA_FILE_NAME)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-taxi-ml-pipeline/.venv/lib/python3.12/site-packages/mlflow/store/tracking/file_store.py", line 1662, in _read_yaml
    return _read_helper(root, file_name, attempts_remaining=retries)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/anatolijperederij/PycharmProjects/nyc-t

----------------------------------------------------------------------------------------------------
LinearRegression
mse : 25.337011496297965
mae : 2.9124477860519664
r2 : 0.876767023968177
rmse : 5.033588332024975
(58273, 13)
----------------------------------------------------------------------------------------------------
RandomForestRegressor
mse : 10.30204301639637
mae : 1.034040149576855
r2 : 0.9498934031622522
rmse : 3.209679581577633
(58273, 13)
----------------------------------------------------------------------------------------------------
Lasso
mse : 31.37291339445354
mae : 2.897479474498695
r2 : 0.8474098855363409
rmse : 5.601152862978616
(58273, 13)
----------------------------------------------------------------------------------------------------
Ridge
mse : 25.337052479502766
mae : 2.9123799441777547
r2 : 0.876766824635975
rmse : 5.033592402996369
(58273, 13)
----------------------------------------------------------------------------------------------------
HistGr