# In [None]:
# ML Model Training and Evaluation

from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# Train a linear regression model on the confirmed-cases dataset, report its
# RMSE on a held-out split, and persist the fitted model to HDFS.

# Column roles. The label must never appear among its own features: the
# previous feature list included "cases", which leaks the target into the
# model and makes the reported RMSE meaningless.
# NOTE(review): "log_cases" looks derived from "cases" as well -- confirm it
# is a legitimate predictor, or list independent feature columns here.
LABEL_COL = "cases"
FEATURE_COLS = ["log_cases"]

DATA_PATH = "hdfs:///user/hadoop/datasets/transformed_global_confirmed_cases.csv"
MODEL_PATH = "hdfs:///user/hadoop/models/lr_model"

# Fixed seed so the train/test split (and therefore the RMSE) is repeatable.
SPLIT_SEED = 42

# Initialize Spark session
spark = SparkSession.builder.appName("ML Model Training and Evaluation").getOrCreate()

# Everything after session creation runs under try/finally so the session is
# released even when loading, training, or saving fails.
try:
    # Load the dataset
    df = spark.read.option("header", "true").csv(DATA_PATH)

    # Without a schema every CSV column is read as a string, which
    # VectorAssembler (features) and LinearRegression (label) both reject.
    # Cast the columns the model uses to double; values that cannot be parsed
    # become null.
    model_cols = [LABEL_COL, *FEATURE_COLS]
    for column_name in model_cols:
        df = df.withColumn(column_name, df[column_name].cast("double"))

    # VectorAssembler raises on null inputs by default, so drop rows that are
    # incomplete in any column the model uses.
    df = df.na.drop(subset=model_cols)

    # Feature engineering: Assembling features into a feature vector
    assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
    feature_df = assembler.transform(df)

    # Split the data into training and test sets
    (trainingData, testData) = feature_df.randomSplit([0.7, 0.3], seed=SPLIT_SEED)

    # Train a linear regression model
    lr = LinearRegression(labelCol=LABEL_COL, featuresCol="features")
    lr_model = lr.fit(trainingData)

    # Make predictions on the test set
    predictions = lr_model.transform(testData)

    # Evaluate the model
    evaluator = RegressionEvaluator(labelCol=LABEL_COL, predictionCol="prediction", metricName="rmse")
    rmse = evaluator.evaluate(predictions)
    print(f"Root Mean Squared Error (RMSE) on test data = {rmse}")

    # Save the model. A plain save() raises when the target path already
    # exists, which breaks every re-run; overwrite the previous model instead.
    lr_model.write().overwrite().save(MODEL_PATH)
finally:
    spark.stop()
