In [None]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if one exists; otherwise start one
# registered under the application name "TrainModel".
spark = (
    SparkSession.builder
    .appName("TrainModel")
    .getOrCreate()
)

In [None]:
# Cell 2 - Load batches
# Each batch file is read into its own DataFrame; schemas are inferred by Spark.
batch_paths = [
    "batches/batch_0.json",
    "batches/batch_1.json",
    "batches/batch_2.json",
]
df1, df2, df3 = [spark.read.json(batch_path) for batch_path in batch_paths]

# Quick sanity peek at the first batch only.
df1.show(5)


In [None]:
# Cell 3 - Assemble features
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.types import NumericType

# Use every numeric column of the first batch as a model feature.
# The previous filter compared dataType.simpleString() against ['double', 'int'],
# but spark.read.json infers whole numbers as LongType ("bigint"), so integer
# columns were silently excluded. Checking against NumericType covers all
# numeric Spark types and keeps the original column order.
feature_columns = [
    field.name
    for field in df1.schema.fields
    if isinstance(field.dataType, NumericType)
]
if not feature_columns:
    # Fail here with a clear message instead of assembling empty vectors.
    raise ValueError("No numeric columns found in batch 0; nothing to assemble into features.")
print("Features used:", feature_columns)

# One assembler (derived from batch 0's schema) is applied to all batches.
# NOTE(review): assumes batches 1 and 2 expose the same numeric columns as batch 0 — confirm.
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
df1_assembled = assembler.transform(df1)
df2_assembled = assembler.transform(df2)
df3_assembled = assembler.transform(df3)


In [None]:
# Cell 4 - Train 3 KMeans models
from pyspark.ml.clustering import KMeans


def _fit_kmeans(assembled_df):
    """Fit a fresh KMeans estimator (k=3, seed=1) on the given DataFrame and return the model."""
    estimator = KMeans(k=3, seed=1)
    return estimator.fit(assembled_df)


# model1 and model2 are each fitted on a single batch (batch 0 and batch 1).
model1 = _fit_kmeans(df1_assembled)
model2 = _fit_kmeans(df2_assembled)

# model3 is fitted on all three assembled batches stacked together.
# DataFrame.union matches columns by position, not by name.
combined = (
    df1_assembled
    .union(df2_assembled)
    .union(df3_assembled)
)
model3 = _fit_kmeans(combined)


In [None]:
# Cell 5 - Save models
import os

# exist_ok=True makes directory creation idempotent without a separate
# existence check.
os.makedirs("model_output", exist_ok=True)

# model.save(path) raises if the target path already exists, which made this
# cell fail on every re-run of the notebook. write().overwrite() replaces any
# previously saved model at the same location instead.
model1.write().overwrite().save("model_output/model1")
model2.write().overwrite().save("model_output/model2")
model3.write().overwrite().save("model_output/model3")

print("✅ Models saved to model_output/")
