# Infer RFHLTH for unseen data
This notebook demonstrates inference on unseen data. Here we use synthetic data, but the same steps apply to new data for real individuals.

In [0]:
from pyspark.sql import SparkSession
# Configure the application name first, then obtain the session: getOrCreate() returns the
# already-running SparkSession when there is one and only starts a new one otherwise.
session_builder = SparkSession.builder.appName("PopHealthRisk")
spark = session_builder.getOrCreate()

In [0]:
from pyspark.ml.classification import RandomForestClassificationModel

# Location of the persisted random-forest classifier to restore.
MODEL_PATH = "/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/models/rf-1"

# Restore the previously saved model from that location; no training happens in this notebook.
loaded_model = RandomForestClassificationModel.load(MODEL_PATH)

In [0]:
# Load synthetic data and assemble for inference
from pyspark.ml.feature import VectorAssembler

# Read the synthetic population. Parquet files store their own schema, so the CSV-only
# `header` / `inferSchema` reader options are not passed here.
df = spark.read.parquet('/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/synthetic_data.parquet')

# Columns packed, in this order, into the single vector column the model reads.
# NOTE(review): presumably this must match the columns/order used when rf-1 was trained — confirm.
feature_cols = [
    "_AGEG5YR_clean",
    "EDUCA_clean",
    '_BMI5',
    '_SMOKER3_clean',
    'DRNKANY6_clean',
    'INCOME3_clean',
    'num_conditions',
    'income_adj_pov',
]

assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features"
)

# Adds the "features" vector column; the original columns of `df` are kept alongside it.
df_assembled = assembler.transform(df)

In [0]:
# Infer: apply the loaded model to the assembled rows. The result keeps the input columns
# and adds the model's output columns, including `prediction`.
predictions = loaded_model.transform(df_assembled)

# Count rows for each (RFHLTH_adj, prediction) pair so that the table recorded under this
# cell is actually produced by the cell when the notebook is re-run from a fresh kernel.
(
    predictions
    .groupBy("RFHLTH_adj", "prediction")
    .count()
    .orderBy("RFHLTH_adj", "prediction")
    .show()
)

RFHLTH_adj,prediction,count
0.0,0.0,54
0.0,1.0,22
1.0,0.0,8
1.0,1.0,16


In [0]:
# Write prediction labels.
# mode('overwrite') makes this cell re-runnable: with the default save mode the write raises
# an error as soon as the output path already exists, so a second run of the notebook fails.
# NOTE(review): only the `prediction` column is saved, so the output has no key for joining
# labels back to the input rows — confirm this is intended.
predictions.select('prediction').write.mode('overwrite').parquet('/Volumes/pophealthrisk/pophealthrisk/pophealthrisk/synthetic_data_inferred.parquet')