In [10]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression, RandomForestRegressor, GBTRegressor, DecisionTreeRegressor, GeneralizedLinearRegression, IsotonicRegression, AFTSurvivalRegression, FMRegressor
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import OneHotEncoder, StringIndexer


In [14]:
# Configure the session builder: local master, named application.
# The session itself is started in the following cell via getOrCreate().
spark = (
    SparkSession.builder
    .master('local')
    .appName("HousePricePrediction")
)

In [15]:
# Start (or attach to) the Spark session described by the builder.
# `spark` is rebound from the builder to the live session here, so guard the
# call: re-running this cell would otherwise raise AttributeError because a
# SparkSession instance has no getOrCreate() method.
if not isinstance(spark, SparkSession):
    spark = spark.getOrCreate()

KeyboardInterrupt: 

In [None]:
# Read housing.csv into a DataFrame: the first row is treated as the header
# and column types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("housing.csv")
)

In [None]:
# Step 3: Analyze the data
data.printSchema()
data.show(5)

# Count the rows once and reuse the result: every count() launches a Spark
# job, and the NA-rate loop below needs the same total for each column.
total_rows = data.count()
print("Total number of rows:", total_rows)

# Unique values in categorical columns
for column_name in ["ocean_proximity"]:
    print(f"Unique values in {column_name}:")
    data.select(column_name).distinct().show()

# Print na rates (fraction of rows where the column is null)
for column_name in data.columns:
    null_rows = data.filter(data[column_name].isNull()).count()
    # An empty dataset has no meaningful rate; report NaN instead of raising
    # ZeroDivisionError.
    na_rate = null_rows / total_rows if total_rows else float("nan")
    print(f"NA rate in {column_name}: {na_rate}")

In [None]:
# Feature preparation.
#
# Every stage writes to its own variable and `data` is left untouched, so this
# cell can be re-run without failing on already-existing output columns.
# The imputation median and the scaler are fitted on the training split only,
# so no statistic computed from test rows leaks into training.

# Do one-hot encoding for categorical columns.
# The indexer/encoder are fitted on the full dataset: they only learn the
# category vocabulary, and fitting them on a subset could make transform()
# fail on a category that appears only in the other split.
indexer = StringIndexer(inputCol="ocean_proximity", outputCol="ocean_proximity_index")
data_indexed = indexer.fit(data).transform(data)
encoder = OneHotEncoder(inputCol="ocean_proximity_index", outputCol="ocean_proximity_encoded")
encModel = encoder.fit(data_indexed)
data_encoded = encModel.transform(data_indexed)

# Split data into training and testing sets before fitting anything that
# depends on the distribution of the data.
train_raw, test_raw = data_encoded.randomSplit([0.8, 0.2], seed=42)

# Handle missing values in total_bedrooms column using the (approximate)
# median of the training split; the same value is applied to the test split.
bedrooms_median = train_raw.approxQuantile("total_bedrooms", [0.5], 0.001)[0]
train_filled = train_raw.fillna(bedrooms_median, subset=["total_bedrooms"])
test_filled = test_raw.fillna(bedrooms_median, subset=["total_bedrooms"])

# Select features
features = ["longitude", "latitude", "housing_median_age", "total_rooms", "total_bedrooms", "population", "households", "median_income", "ocean_proximity_encoded"]

# Create a VectorAssembler to combine features into a single vector.
# Assembly happens after imputation so the inputs contain no nulls in
# total_bedrooms.
assembler = VectorAssembler(inputCols=features, outputCol="features")
train_assembled = assembler.transform(train_filled)
test_assembled = assembler.transform(test_filled)

# Scale features: fit on the training split, apply to both splits.
scaler = StandardScaler(inputCol="features", outputCol="scaledFeatures")
scaler_model = scaler.fit(train_assembled)
train_data = scaler_model.transform(train_assembled)
test_data = scaler_model.transform(test_assembled)

In [None]:
# Candidate regression models, as (display name, unfitted estimator) pairs.
# All of them predict median_house_value from the scaled feature vector.
#
# AFTSurvivalRegression is intentionally not in this list: it is a survival
# model that requires a censor-indicator column (censorCol, default "censor"),
# which this dataset does not have, so fit() raises and aborts the whole
# comparison.
models = [
    ("Linear Regression", LinearRegression(labelCol="median_house_value", featuresCol="scaledFeatures")),
    ("Random Forest", RandomForestRegressor(labelCol="median_house_value", featuresCol="scaledFeatures")),
    ("Gradient-Boosted Tree", GBTRegressor(labelCol="median_house_value", featuresCol="scaledFeatures")),
    ("Decision Tree", DecisionTreeRegressor(labelCol="median_house_value", featuresCol="scaledFeatures")),
    ("Generalized Linear Regression", GeneralizedLinearRegression(labelCol="median_house_value", featuresCol="scaledFeatures")),
    # NOTE(review): with a vector featuresCol, IsotonicRegression uses only the
    # single feature selected by featureIndex (default 0) - confirm this
    # baseline is what is intended.
    ("Isotonic Regression", IsotonicRegression(labelCol="median_house_value", featuresCol="scaledFeatures")),
    ("Factorization Machines", FMRegressor(labelCol="median_house_value", featuresCol="scaledFeatures"))
]


In [None]:


# Fit every candidate on the training split and report MSE, RMSE and R-squared
# on the test split.
evaluator = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction")

for name, estimator in models:
    # Keep the unfitted estimator and the fitted model under separate names.
    model = estimator.fit(train_data)
    # Each evaluate() call runs a separate Spark job over `predictions`;
    # caching avoids recomputing the model's transform once per metric.
    predictions = model.transform(test_data).cache()
    mse = evaluator.evaluate(predictions, {evaluator.metricName: "mse"})
    rmse = evaluator.evaluate(predictions, {evaluator.metricName: "rmse"})
    r2 = evaluator.evaluate(predictions, {evaluator.metricName: "r2"})
    # Release the cached predictions before moving on to the next model.
    predictions.unpersist()
    print(f"Model: {name}")
    print(f"    MSE: {mse}, RMSE: {rmse}, R-squared: {r2}")

In [None]:
# Step 6: Measure the performance of the model
# Fit the linear model explicitly in this cell: no earlier cell binds the name
# `lr_model`, so referencing it without this fit raises NameError on a fresh
# kernel.
lr_model = LinearRegression(labelCol="median_house_value", featuresCol="scaledFeatures").fit(train_data)
predictions = lr_model.transform(test_data)
evaluator = RegressionEvaluator(labelCol="median_house_value", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data:", rmse)

# Print coefficients and intercept.
# The model was trained on the "scaledFeatures" column, so the coefficients
# apply to the scaled feature vector, not to the raw column values.
print("Coefficients:", lr_model.coefficients)
print("Intercept:", lr_model.intercept)

# Stop the SparkSession (no further Spark work can run after this point)
spark.stop()