### Generate Synthetic Data
First, we'll create a synthetic dataset. This data will be saved to a CSV file.

In [2]:
import pandas as pd
import numpy as np

# Generate synthetic data
num_samples = 1000
num_features = 10

X = np.random.rand(num_samples, num_features)
y = X @ np.random.rand(num_features, 1) + np.random.normal(size=(num_samples, 1))

# Create a DataFrame and save to CSV
df = pd.DataFrame(np.hstack((X, y)), columns=[f"feature_{i}" for i in range(num_features)] + ["label"])
df.to_csv("synthetic_data.csv", index=False)

### Define a PyTorch Model
We’ll define a simple feedforward neural network in PyTorch for our regression task.

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleModel(nn.Module):
    def __init__(self, input_size):
        super(SimpleModel, self).__init__()
        self.fc = nn.Linear(input_size, 1)
    
    def forward(self, x):
        return self.fc(x)

### Set Up Spark and Horovod
Next, we’ll configure Spark and Horovod for distributed training. Make sure you have Spark and Horovod installed.

In [7]:
from pyspark.sql import SparkSession
import horovod.spark

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Horovod_PyTorch_Example") \
    .getOrCreate()

# Initialize Horovod
horovod.spark.run(spark)

24/06/10 12:35:22 WARN Utils: Your hostname, daniel-Yoga-Creator-7-15IMH05 resolves to a loopback address: 127.0.1.1; using 192.168.1.74 instead (on interface wlp0s20f3)
24/06/10 12:35:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/10 12:35:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Exception in thread Thread-7 (run_spark):                         (0 + 12) / 12]
Traceback (most recent call last):
  File "/home/daniel/miniconda3/envs/caa/lib/python3.12/threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "/home/daniel/miniconda3/envs/caa/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 766, in run_closure
    _threading_Thread_run(self)
  File "/home/daniel/miniconda3/envs/caa/lib/python3.12/threading.py", line 1010, in 

ValueError: Neither MPI nor Gloo support has been built. Try reinstalling Horovod ensuring that either MPI is installed (MPI) or CMake is installed (Gloo).

24/06/10 12:35:28 WARN TaskSetManager: Lost task 6.0 in stage 0.0 (TID 6) (daniel-Yoga-Creator-7-15IMH05.lan executor driver): TaskKilled (Stage cancelled: Job 0 cancelled part of cancelled job group horovod.spark.run.0)
24/06/10 12:35:30 WARN PythonRunner: Incomplete task 11.0 in stage 0 (TID 11) interrupted: Attempting to kill Python Worker
24/06/10 12:35:30 WARN PythonRunner: Incomplete task 2.0 in stage 0 (TID 2) interrupted: Attempting to kill Python Worker
24/06/10 12:35:30 WARN PythonRunner: Incomplete task 5.0 in stage 0 (TID 5) interrupted: Attempting to kill Python Worker
24/06/10 12:35:30 WARN PythonRunner: Incomplete task 10.0 in stage 0 (TID 10) interrupted: Attempting to kill Python Worker
24/06/10 12:35:30 WARN TaskSetManager: Lost task 2.0 in stage 0.0 (TID 2) (daniel-Yoga-Creator-7-15IMH05.lan executor driver): TaskKilled (Stage cancelled: Job 0 cancelled part of cancelled job group horovod.spark.run.0)
24/06/10 12:35:30 WARN TaskSetManager: Lost task 11.0 in stage 0.0

### Prepare Data for Spark
Load the data into Spark DataFrame and prepare it for training.

In [None]:
# Read CSV into a Spark DataFrame
df = spark.read.csv("synthetic_data.csv", header=True, inferSchema=True)

# Show the first few rows
df.show()

### Train the Model with Horovod
Set up the Horovod Spark Estimator and train the model.

In [None]:
from horovod.spark.pytorch import SparkEstimator

# Define the PyTorch model
input_size = num_features
model = SimpleModel(input_size)

# Define optimizer and loss function
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

# Configure the SparkEstimator
estimator = SparkEstimator(
    num_proc=4,  # Number of processes (adjust based on your cluster)
    model=model,
    optimizer=optimizer,
    loss=loss_fn,
    input_shapes=[(input_size,)],  # Input shape for the model
    label_shapes=[(1,)],  # Output shape (1D for regression)
    feature_columns=[f"feature_{i}" for i in range(num_features)],  # Features from the dataset
    label_columns=["label"],  # Label column
    batch_size=32,
    epochs=10
)

# Fit the model
estimator.fit(df)

### Evaluate and Save the Model
Finally, evaluate the model and save it for future use.

In [None]:
# Evaluate the model
evaluation_results = estimator.evaluate(df)
print("Evaluation Results:", evaluation_results)

# Save the model
estimator.save("trained_model.pth")