<a href="https://colab.research.google.com/github/SUTHARSHANARAM/SUTHARSHANARAM/blob/main/Spark%20program%20to%20perform%20regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row

# End-to-end linear-regression demo: build a tiny in-memory DataFrame, assemble
# the two feature columns into a vector, split into train/test, fit a
# LinearRegression model, then print predictions and test-set metrics.

# Initialize Spark session
spark = SparkSession.builder.appName("LinearRegressionExample").getOrCreate()

# Everything that uses the session runs inside try/finally so the session is
# stopped even when a step raises; otherwise a failed run would leave the
# session alive and a re-run's getOrCreate() would pick it up again.
try:
    # Sample data (replace or expand as needed)
    data = [
        Row(feature1=1.0, feature2=2.0, label=3.0),
        Row(feature1=2.0, feature2=1.0, label=2.5),
        Row(feature1=3.0, feature2=3.0, label=6.0),
        Row(feature1=4.0, feature2=5.0, label=9.0),
        Row(feature1=5.0, feature2=3.0, label=7.5),
        Row(feature1=6.0, feature2=6.0, label=12.0),
        Row(feature1=7.0, feature2=8.0, label=15.0),
    ]

    # Create DataFrame
    df = spark.createDataFrame(data)
    df.show()

    # Step 1: Assemble features into a single vector column
    assembler = VectorAssembler(
        inputCols=["feature1", "feature2"], outputCol="features"
    )
    df = assembler.transform(df)

    # Step 2: Prepare the data for training (only the columns the model reads)
    df = df.select("features", "label")

    # Step 3: Split the data into training and testing sets.
    # The seed makes the split repeatable between runs.
    train_data, test_data = df.randomSplit([0.8, 0.2], seed=123)

    # Step 4: Define and fit the linear regression model
    lr = LinearRegression(featuresCol="features", labelCol="label")
    lr_model = lr.fit(train_data)

    print_model = True  # coefficients are printed after the predictions table

    # NOTE(review): randomSplit does not promise exact 80/20 sizes, and with so
    # few rows the test side can come back empty if the data or seed changes.
    # Evaluating on an empty split yields no meaningful metrics, so it is
    # skipped explicitly instead of printing an empty table and undefined scores.
    has_test_rows = test_data.count() > 0

    if has_test_rows:
        # Step 5: Make predictions on the test set
        test_predictions = lr_model.transform(test_data)
        test_predictions.select("features", "label", "prediction").show()

    # Step 6: Evaluate the model
    if print_model:
        print("Coefficients:", lr_model.coefficients)
        print("Intercept:", lr_model.intercept)

    if has_test_rows:
        # Evaluate model performance on test data
        test_summary = lr_model.evaluate(test_data)
        print("RMSE:", test_summary.rootMeanSquaredError)
        print("R2:", test_summary.r2)
    else:
        print("Test split is empty; skipping predictions and RMSE/R2 evaluation.")
finally:
    # Stop Spark session (runs on success and on failure)
    spark.stop()


+--------+--------+-----+
|feature1|feature2|label|
+--------+--------+-----+
|     1.0|     2.0|  3.0|
|     2.0|     1.0|  2.5|
|     3.0|     3.0|  6.0|
|     4.0|     5.0|  9.0|
|     5.0|     3.0|  7.5|
|     6.0|     6.0| 12.0|
|     7.0|     8.0| 15.0|
+--------+--------+-----+

+---------+-----+------------------+
| features|label|        prediction|
+---------+-----+------------------+
|[3.0,3.0]|  6.0| 5.789473684210524|
|[6.0,6.0]| 12.0|11.828947368421058|
+---------+-----+------------------+

Coefficients: [0.825657894736843,1.1875000000000018]
Intercept: -0.2500000000000109
RMSE: 0.19180762811990992
R2: 0.9959122037550016
