# 1. Set Up Your PySpark Notebook

In [None]:
import os
print(os.path.exists("/home/jovyan/work/jars/postgresql-42.7.7.jar"))

from pyspark.sql import SparkSession
!spark-submit --jars /home/jovyan/work/jars/postgresql-42.7.7.jar pyspark_modeling.py

!pip install findspark

import findspark
findspark.init()

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("AirbnbMLPipeline") \
    .config("spark.jars", "/home/jovyan/work/jars/postgresql-42.7.7.jar") \
    .config("spark.driver.extraClassPath", "/home/jovyan/work/jars/postgresql-42.7.7.jar") \
    .getOrCreate()

import os

# Load the cleaned listings table from PostgreSQL over JDBC.
#
# The credentials can be overridden through the environment so they do not
# have to be edited into the notebook. The fallbacks are the values that were
# previously hard-coded, so behavior is unchanged when the variables are unset.
# TODO(review): drop the fallback password once the environment is configured.
db_user = os.environ.get("AIRBNB_DB_USER", "airbnb")
db_password = os.environ.get("AIRBNB_DB_PASSWORD", "airbnb123")

df = (
    spark.read
    .format("jdbc")
    .option("url", "jdbc:postgresql://postgres:5432/airbnb_amsterdam")
    .option("dbtable", "airbnb_cleaned_listings")
    .option("user", db_user)
    .option("password", db_password)
    .option("driver", "org.postgresql.Driver")
    .load()
)

# Quick look at what was loaded: the full schema plus the first five rows of
# three of the columns used for modeling.
df.printSchema()
df.select("price", "availability_365", "number_of_reviews").show(5)

# 2. Data Preparation

In [None]:
from pyspark.sql.functions import col

# Columns used for modeling: the regression label (price) followed by the
# three numeric features.
model_columns = ["price", "availability_365", "number_of_reviews", "minimum_nights"]

# Keep only the modeling columns, cast each one to double, then drop every row
# that has a null in any of them (this includes values the cast turned null).
df_ml = df.select(*[col(name).cast("double") for name in model_columns]).na.drop()

# 3. Feature Engineering

In [None]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

# Pack the three numeric inputs into one vector column.
feature_columns = ["availability_365", "number_of_reviews", "minimum_nights"]
assembler = VectorAssembler(inputCols=feature_columns, outputCol="unscaled_features")
df_assembled = assembler.transform(df_ml)

# Scale each feature to unit standard deviation without centering
# (withMean=False); the scaled vector is written to "features".
scaler = StandardScaler(
    inputCol="unscaled_features",
    outputCol="features",
    withMean=False,
    withStd=True,
)
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

# 4. Clustering Model: KMeans

In [None]:
from pyspark.ml.clustering import KMeans

# Fit a 4-cluster KMeans on the scaled feature vectors (fixed seed) and write
# each row's assignment to the "cluster" column.
kmeans = KMeans(featuresCol="features", predictionCol="cluster", k=4, seed=1)
model = kmeans.fit(df_scaled)
df_clustered = model.transform(df_scaled)

# Show the average price per cluster.
price_by_cluster = df_clustered.select("price", "cluster").groupBy("cluster")
price_by_cluster.agg({"price": "avg"}).show()

# 5. Predictive Model: Linear Regression for Price

In [None]:
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

# 80/20 train/test split with a fixed seed.
train_data, test_data = df_scaled.randomSplit(weights=[0.8, 0.2], seed=42)

# Fit a linear regression that predicts price from the scaled feature vector.
lr = LinearRegression(labelCol="price", featuresCol="features")
lr_model = lr.fit(train_data)

# Score the held-out rows and report the root-mean-squared error.
predictions = lr_model.transform(test_data)
evaluator = RegressionEvaluator(
    labelCol="price",
    predictionCol="prediction",
    metricName="rmse",
)
rmse = evaluator.evaluate(predictions)
print(f"RMSE: {rmse:.2f}")

# 6. Save Models

In [None]:
# Persist the fitted KMeans and linear-regression models.
# write().overwrite() replaces an existing model directory, so re-running this
# cell no longer fails because the target path already exists.
# NOTE(review): the fitted StandardScaler is not persisted here; anything that
# reloads these models will need the same scaling to build "features" - confirm
# whether it should be saved alongside them.
model.write().overwrite().save("/home/jovyan/work/ml/kmeans_model")
lr_model.write().overwrite().save("/home/jovyan/work/ml/linear_regression_model")

# 7. Export Final Datasets

## 1. Clustered Listings (from KMeans)

In [None]:
# Collect the clustered listings to the driver as a pandas DataFrame and write
# them to CSV without the index column.
# NOTE(review): df_clustered still carries the "unscaled_features" and
# "features" vector columns, so they are written to the CSV as well - confirm
# that this is intended.
clustered_pdf = df_clustered.toPandas()
clustered_pdf.to_csv("/home/jovyan/work/data/clustered_listings.csv", index=False)

## 2. Regression Predictions (Price Prediction)

In [None]:
# Export the label, the model's prediction and the three raw input features
# for the test rows to CSV (no index column).
export_columns = ["price", "prediction", "availability_365", "number_of_reviews", "minimum_nights"]
predictions_pdf = predictions.select(*export_columns).toPandas()
predictions_pdf.to_csv("/home/jovyan/work/data/ml_predictions.csv", index=False)