In [2]:
# Importing the required packages
from sklearn.datasets import make_regression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession

In [3]:
# Initialising the spark session
spark = SparkSession.builder\
                    .appName("RegressionExample")\
                    .getOrCreate()

In [4]:
# Creating the regression dataset with 10_000 rows and 5 columns
X, y = make_regression(n_samples = 10_000, n_features = 4, noise = 10, random_state = 7)

data = [(float(X[i, 0]), float(X[i, 1]), float(X[i, 2]), float(X[i, 3]), float(y[i])) for i in range(len(X))]

In [5]:
# Creating the Spark Dataframe
spark_df = spark.createDataFrame(data, ["Feature 1", "Feature 2", "Feature 3", "Feature 4", "Target"])

In [6]:
# Printing the column names of the dataset
print(spark_df.columns)

['Feature 1', 'Feature 2', 'Feature 3', 'Feature 4', 'Target']


In [7]:
# Printing the schema of each columns
spark_df.printSchema()

root
 |-- Feature 1: double (nullable = true)
 |-- Feature 2: double (nullable = true)
 |-- Feature 3: double (nullable = true)
 |-- Feature 4: double (nullable = true)
 |-- Target: double (nullable = true)



In [8]:
# Printing the summary of the columns
spark_df.summary().show()

+-------+--------------------+--------------------+--------------------+--------------------+-------------------+
|summary|           Feature 1|           Feature 2|           Feature 3|           Feature 4|             Target|
+-------+--------------------+--------------------+--------------------+--------------------+-------------------+
|  count|               10000|               10000|               10000|               10000|              10000|
|   mean|0.002544132739425...|-0.01451103179417...|0.009994074793825022|-0.00527202077587...|-0.5945901595480387|
| stddev|   0.994910756457706|  0.9905901068781727|  0.9925380827115298|  0.9951669116152843| 113.06668726241749|
|    min|  -4.114495828487157| -3.6714407164475813| -3.8480111366237453| -3.4156615653390623| -448.1156168306303|
|    25%| -0.6694742894547767| -0.6718483340586358| -0.6582615623787684| -0.6710175711541245| -75.83562881697048|
|    50%|0.006222023298174971|-0.00436592371459...|0.007047689107622504| 0.0063156528099

In [9]:
# Randomly splitting the dataset into train and test dataset
train_data, test_data = spark_df.randomSplit([0.8, 0.2], seed = 7)

In [10]:
# Combining features into a single vector column
assembler = VectorAssembler(inputCols = ["Feature 1", "Feature 2", "Feature 3", "Feature 4"],
                            outputCol = "features")

train_data = assembler.transform(train_data)
test_data = assembler.transform(test_data)

In [11]:
# Training the linear regression model
lr = LinearRegression(featuresCol = "features", labelCol = "Target")
model = lr.fit(train_data)

In [12]:
# Making the predictions on the test data
predictions = model.transform(test_data)

In [13]:
# Evaluating the model using MSE and r2 score
evaluater = RegressionEvaluator(labelCol = "Target", predictionCol ="prediction", metricName = "mse")
mse = evaluater.evaluate(predictions)

print(f"MSE score for the model is: {mse}")

MSE score for the model is: 100.30719796366067


In [14]:
r2 = model.summary.r2

print(f"r2 score of the model is: {r2}")

r2 score of the model is: 0.9923060623080251


- **make_regression** function generates simulated regression data for model testing.
- **LinearRegression** represents a linear regression model in PySpark.
- **RegressionEvaluator** is used to evaluate the model's performance.
- **VectorAssembler** is used to assemble the input features into a single vector column.
- **SparkSession** is used to create a SparkSession, which is the entry point for working with DataFrames in PySpark.