# PySpark Machine Learning Example 🤖

This notebook demonstrates a simple machine learning workflow using PySpark MLlib.

In [None]:
import os

from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession

# Initialize Spark.
# The master URL can be overridden with the SPARK_MASTER_URL environment
# variable so the notebook is not tied to one cluster address. When the
# variable is unset or empty, the original standalone master address is used.
spark_master_url = os.environ.get('SPARK_MASTER_URL') or 'spark://spark-master:7077'

spark = (
    SparkSession.builder
    .appName('ML Example')
    .master(spark_master_url)
    .getOrCreate()
)

print('Spark session initialized!')

## Create Sample Dataset

Let's create a small sample dataset of houses and use four features (bedrooms, bathrooms, square footage, and age) to predict each house's price.

In [None]:
# Column names for the sample table; 'price' is the value the later cells
# use as the regression label.
columns = ['bedrooms', 'bathrooms', 'sqft', 'age', 'price']

# One tuple per house, with values in the same order as `columns`.
data = [
    (3, 2, 1_500, 10, 300_000),
    (4, 3, 2_000, 5, 450_000),
    (2, 1, 1_000, 20, 200_000),
    (5, 4, 3_000, 2, 650_000),
    (3, 2, 1_600, 12, 320_000),
    (4, 2, 1_800, 8, 380_000),
    (2, 2, 1_200, 15, 250_000),
    (3, 3, 1_700, 7, 350_000),
    (4, 3, 2_200, 4, 480_000),
    (5, 3, 2_500, 6, 550_000),
    (3, 2, 1_550, 11, 310_000),
    (4, 2, 1_900, 9, 400_000),
    (2, 1, 950, 25, 180_000),
    (6, 4, 3_500, 1, 750_000),
    (3, 2, 1_450, 13, 290_000),
]

df = spark.createDataFrame(data, schema=columns)

# Preview the rows, then the per-column summary statistics.
df.show()
df.describe().show()

## Feature Engineering

In [None]:
# Model inputs; 'price' is left out because it is the label.
feature_columns = ['bedrooms', 'bathrooms', 'sqft', 'age']

# Step 1: pack the individual feature columns into one vector column.
assembler = VectorAssembler(inputCols=feature_columns, outputCol='raw_features')
df_assembled = assembler.transform(df)

# Step 2: standardize the assembled vector, with both mean-centering and
# scaling by standard deviation switched on.
# NOTE(review): the scaler is fitted on the full DataFrame and the train/test
# split is taken afterwards from the scaled output, so test rows contribute to
# the scaling statistics. Consider fitting the scaler on training rows only.
scaler = StandardScaler(
    inputCol='raw_features', outputCol='features', withMean=True, withStd=True
)
scaler_model = scaler.fit(df_assembled)
df_scaled = scaler_model.transform(df_assembled)

# Show the first five scaled vectors next to their label, untruncated.
df_scaled.select('features', 'price').show(n=5, truncate=False)

## Split Data into Train and Test Sets

In [None]:
# Randomly partition the scaled rows using 0.8 / 0.2 weights (training / test).
# The seed is fixed so the same call is made on every run.
split_weights = [0.8, 0.2]
train_data, test_data = df_scaled.randomSplit(split_weights, seed=42)

# Report how many rows landed in each partition.
print(f'Training set size: {train_data.count()}')
print(f'Test set size: {test_data.count()}')

## Train Linear Regression Model

In [None]:
# Configure the estimator: inputs come from the scaled 'features' vector and
# the target is 'price'. regParam and elasticNetParam set the regularization
# (see the Spark ML LinearRegression docs for their exact meaning).
lr = LinearRegression(
    featuresCol='features', labelCol='price',
    maxIter=100, regParam=0.1, elasticNetParam=0.8,
)

# Fit on the training partition only.
lr_model = lr.fit(train_data)

# Report the fitted parameters.
print('Model trained successfully!')
print(f'Coefficients: {lr_model.coefficients}')
print(f'Intercept: {lr_model.intercept}')

## Make Predictions

In [None]:
# Score the test partition with the fitted model.
predictions = lr_model.transform(test_data)

# Show the inputs, the true price and the predicted price side by side.
shown_columns = ['bedrooms', 'bathrooms', 'sqft', 'age', 'price', 'prediction']
predictions.select(*shown_columns).show()

## Evaluate Model Performance

In [None]:
# A single evaluator is reused for every metric; the metric is selected per
# call by overriding the evaluator's metricName param.
evaluator = RegressionEvaluator(labelCol='price', predictionCol='prediction')

# Test-set metrics, computed in the order rmse, mae, r2.
rmse, mae, r2 = [
    evaluator.evaluate(predictions, {evaluator.metricName: metric})
    for metric in ('rmse', 'mae', 'r2')
]

print(f'Root Mean Squared Error (RMSE): ${rmse:,.2f}')
print(f'Mean Absolute Error (MAE): ${mae:,.2f}')
print(f'R-squared (R2): {r2:.4f}')

# Metrics from the fitted model's own summary, printed for comparison with the
# test-set numbers above.
training_summary = lr_model.summary
print(f'\nTraining RMSE: ${training_summary.rootMeanSquaredError:,.2f}')
print(f'Training R2: {training_summary.r2:.4f}')

## Make Predictions on New Data

In [None]:
# New house to score. Only the feature values are supplied: the price is
# unknown, and the assembler reads just the columns in `feature_columns`, so
# no placeholder label needs to be fabricated.
new_house = [
    (4, 3, 2100, 7)  # bedrooms, bathrooms, sqft, age
]

new_df = spark.createDataFrame(new_house, feature_columns)

# Apply the same fitted transformations that were used on the training data
new_assembled = assembler.transform(new_df)
new_scaled = scaler_model.transform(new_assembled)

# Predict
new_prediction = lr_model.transform(new_scaled)

# Read the single predicted value by column name rather than by position.
predicted_price = new_prediction.select('prediction').first()['prediction']
print(f'\nPredicted price for new house: ${predicted_price:,.2f}')

## Summary

In this notebook, we:
1. Created a sample dataset
2. Performed feature engineering (assembly and scaling)
3. Split data into training and test sets
4. Trained a Linear Regression model
5. Made predictions
6. Evaluated model performance
7. Used the model to predict prices for new houses

Next steps:
- Try other algorithms (Decision Trees, Random Forest, Gradient Boosting)
- Perform hyperparameter tuning
- Add more features
- Handle categorical variables