In [0]:
# Import libraries
import pandas as pd
import mlflow
import mlflow.sklearn

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [0]:
f1_data = pd.read_csv("/dbfs/FileStore/tables/results-1.csv")


In [0]:
# Select features and target for prediction
features = ['grid', 'constructorId']  # Features we use for prediction
target = 'positionOrder'              # What we want to predict

X = f1_data[features]
y = f1_data[target]

# Display the first few rows of features and target
X.head(), y.head()


(   grid  constructorId
 0     1              1
 1     5              2
 2     7              3
 3    11              4
 4     3              1,
 0    1
 1    2
 2    3
 3    4
 4    5
 Name: positionOrder, dtype: int64)

In [0]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Display the shape of the resulting sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape


((20069, 2), (6690, 2), (20069,), (6690,))

In [0]:
# Train a Logistic Regression model
with mlflow.start_run(run_name="Logistic Regression"):
    model1 = LogisticRegression(max_iter=1000)
    model1.fit(X_train, y_train)
    
    # Make predictions
    y_pred1 = model1.predict(X_test)
    
    # Calculate metrics
    acc1 = accuracy_score(y_test, y_pred1)
    prec1 = precision_score(y_test, y_pred1, average='weighted', zero_division=0)
    rec1 = recall_score(y_test, y_pred1, average='weighted', zero_division=0)
    f1_1 = f1_score(y_test, y_pred1, average='weighted', zero_division=0)
    
    # Log metrics
    mlflow.log_metric("accuracy", acc1)
    mlflow.log_metric("precision", prec1)
    mlflow.log_metric("recall", rec1)
    mlflow.log_metric("f1_score", f1_1)
    
    # Log model
    mlflow.sklearn.log_model(model1, "logistic_regression_model")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2025/04/29 01:49:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run Logistic Regression at: columbiau-gr5069.cloud.databricks.com/ml/experiments/743188476171287/runs/a5dc4cdbc16b4492ad6f0ea766b88b3d.
2025/04/29 01:49:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: columbiau-gr5069.cloud.databricks.com/ml/experiments/743188476171287.


In [0]:
# Train a Decision Tree Classifier
with mlflow.start_run(run_name="Decision Tree"):
    model2 = DecisionTreeClassifier(max_depth=5, random_state=42)
    model2.fit(X_train, y_train)
    
    # Make predictions
    y_pred2 = model2.predict(X_test)
    
    # Calculate metrics
    acc2 = accuracy_score(y_test, y_pred2)
    prec2 = precision_score(y_test, y_pred2, average='weighted', zero_division=0)
    rec2 = recall_score(y_test, y_pred2, average='weighted', zero_division=0)
    f1_2 = f1_score(y_test, y_pred2, average='weighted', zero_division=0)
    
    # Log metrics
    mlflow.log_metric("accuracy", acc2)
    mlflow.log_metric("precision", prec2)
    mlflow.log_metric("recall", rec2)
    mlflow.log_metric("f1_score", f1_2)
    
    # Log model
    mlflow.sklearn.log_model(model2, "decision_tree_model")




Uploading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

2025/04/29 01:50:36 INFO mlflow.tracking._tracking_service.client: 🏃 View run Decision Tree at: columbiau-gr5069.cloud.databricks.com/ml/experiments/743188476171287/runs/66d622ab35f84e3b805bf245d98f5081.
2025/04/29 01:50:36 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: columbiau-gr5069.cloud.databricks.com/ml/experiments/743188476171287.


In [0]:
# Create a database if not already exist
spark.sql("CREATE DATABASE IF NOT EXISTS student_db")


DataFrame[]

In [0]:
# Use the database
spark.sql("USE student_db")


DataFrame[]

In [0]:
spark.sql("""
CREATE TABLE IF NOT EXISTS model1_predictions (
    id INT,
    prediction DOUBLE
)
""")


DataFrame[]

In [0]:
spark.sql("""
CREATE TABLE IF NOT EXISTS model2_predictions (
    id INT,
    prediction DOUBLE
)
""")


DataFrame[]

In [0]:
# Prepare predictions from Logistic Regression model
import numpy as np

predictions_df1 = pd.DataFrame({
    'id': np.arange(len(y_pred)),  # <-- fix id to be 0,1,2,...
    'prediction': y_pred
})

# Convert to Spark DataFrame
spark_df1 = spark.createDataFrame(predictions_df1)

# Save into the first table
spark_df1.write.mode("overwrite").saveAsTable("model1_predictions")


In [0]:
# Prepare predictions from Decision Tree model
predictions_df2 = pd.DataFrame({
    'id': np.arange(len(y_pred2)),  # <-- generate id from 0,1,2,...
    'prediction': y_pred2
})

# Convert to Spark DataFrame
spark_df2 = spark.createDataFrame(predictions_df2)

# Save into the second table
spark_df2.write.mode("overwrite").saveAsTable("model2_predictions")


In [0]:
# Check first table
spark.sql("SELECT * FROM model1_predictions LIMIT 5").show()

# Check second table
spark.sql("SELECT * FROM model2_predictions LIMIT 5").show()


+---+----------+
| id|prediction|
+---+----------+
|  0|         0|
|  1|         0|
+---+----------+

+---+----------+
| id|prediction|
+---+----------+
|  0|         3|
|  1|         1|
|  2|        15|
|  3|         5|
|  4|        15|
+---+----------+

