# Batch Scoring with PySpark Example

In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import RandomForestClassificationModel
import pandas as pd

## Start Spark Session

In [2]:
# Obtain the Spark entry point for this notebook. getOrCreate() hands back an
# already-running session when one exists instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('BridgeBatchScoring')
    .getOrCreate()
)

## Load Data

In [3]:
# Read the processed feature table into pandas.
df = pd.read_csv('../data/processed/features.csv')

# Every column except 'structure_id' and 'failure_within_1yr' is treated as a
# model input; column order from the CSV is preserved.
features = [
    name
    for name in df.columns
    if name not in {'structure_id', 'failure_within_1yr'}
]

# Hand Spark only the model inputs plus 'structure_id', which is carried along
# so that scored rows can be selected by it later.
spark_df = spark.createDataFrame(df[[*features, 'structure_id']])

## Assemble Features

In [4]:
# Configure an assembler that packs the individual feature columns into one
# vector column named 'features'.
vec_assembler = (
    VectorAssembler()
    .setInputCols(features)
    .setOutputCol('features')
)

# Append the assembled vector column to the frame.
spark_df = vec_assembler.transform(spark_df)

## Load Model (Spark MLlib Format)

In [5]:
# Prerequisite: a random forest classifier must already have been saved in
# Spark MLlib format at ../models/trained/spark_rf_model.
# read().load(path) is the long form of the load(path) shortcut.
model = RandomForestClassificationModel.read().load(
    '../models/trained/spark_rf_model'
)

## Predict

In [6]:
# Apply the loaded model to the assembled frame; the result is kept in
# `scored` for the following cells.
scored = model.transform(spark_df)

# Preview the first five rows, limited to the identifier and the model's
# probability / prediction outputs.
(
    scored
    .select(['structure_id', 'probability', 'prediction'])
    .show(n=5)
)

## Save Results

In [7]:
# Persist the scored rows as a CSV file.
from pathlib import Path

output_path = Path('../models/evaluation/spark_batch_predictions.csv')

# DataFrame.to_csv raises OSError when the target directory does not exist,
# which would discard a finished scoring run; create the directory up front.
output_path.parent.mkdir(parents=True, exist_ok=True)

# toPandas() collects the selected columns onto the driver, so this step
# assumes the scored output fits in driver memory.
predictions = scored.select('structure_id', 'probability', 'prediction').toPandas()
predictions.to_csv(output_path, index=False)
print('Batch scoring with Spark complete.')