In [3]:

import pyspark
from pyspark.sql import SparkSession

# Importing required modules

# SparkSession is the entry point to Spark functionality in PySpark.
# Configure a builder for an application named "linear_app", then obtain
# the session from it (getOrCreate hands back an existing session if one
# is already running instead of starting a second one).
session_builder = SparkSession.builder.appName("linear_app")
spark = session_builder.getOrCreate()

# Bare last expression so the notebook renders the session summary.
spark

In [19]:
# Load the CSV file into a DataFrame.
#   header      -> take column names from the first line of the file
#   inferSchema -> let Spark work out each column's type from the data
csv_options = {"header": True, "inferSchema": True}
df = spark.read.csv("Linear_regression_dataset.csv", **csv_options)

# Preview the first 5 rows without truncating cell contents.
df.show(n=5, truncate=False)

+-----+-----+-----+-----+-----+------+
|var_1|var_2|var_3|var_4|var_5|output|
+-----+-----+-----+-----+-----+------+
|734  |688  |81   |0.328|0.259|0.418 |
|700  |600  |94   |0.32 |0.247|0.389 |
|712  |705  |93   |0.311|0.247|0.417 |
|734  |806  |69   |0.315|0.26 |0.415 |
|613  |759  |61   |0.302|0.24 |0.378 |
+-----+-----+-----+-----+-----+------+
only showing top 5 rows



In [20]:

from pyspark.ml.linalg import Vector
from pyspark.ml.feature import VectorAssembler

# Importing required modules for vectorization operations.

# Drop the target column so that only the predictor columns remain.
# df itself is left unchanged: drop() returns a new DataFrame.
features = df.drop("output")
# The column names of this frame are what the assembler step uses as its inputs.

In [21]:

# Combine every predictor column into a single vector column called "features".
feature_names = features.columns
vector_assembler = VectorAssembler(inputCols=feature_names, outputCol="features")

# transform() is applied to the full df, so the result keeps the original
# columns (including "output") and gains the new "features" column.
features_df = vector_assembler.transform(df)

# Preview the first 5 rows without truncating cell contents.
features_df.show(n=5, truncate=False)

+-----+-----+-----+-----+-----+------+------------------------------+
|var_1|var_2|var_3|var_4|var_5|output|features                      |
+-----+-----+-----+-----+-----+------+------------------------------+
|734  |688  |81   |0.328|0.259|0.418 |[734.0,688.0,81.0,0.328,0.259]|
|700  |600  |94   |0.32 |0.247|0.389 |[700.0,600.0,94.0,0.32,0.247] |
|712  |705  |93   |0.311|0.247|0.417 |[712.0,705.0,93.0,0.311,0.247]|
|734  |806  |69   |0.315|0.26 |0.415 |[734.0,806.0,69.0,0.315,0.26] |
|613  |759  |61   |0.302|0.24 |0.378 |[613.0,759.0,61.0,0.302,0.24] |
+-----+-----+-----+-----+-----+------+------------------------------+
only showing top 5 rows



In [22]:
# Print the schema of the assembled DataFrame to confirm the inferred column
# types and that the new "features" column is a vector.

features_df.printSchema()

root
 |-- var_1: integer (nullable = true)
 |-- var_2: integer (nullable = true)
 |-- var_3: integer (nullable = true)
 |-- var_4: double (nullable = true)
 |-- var_5: double (nullable = true)
 |-- output: double (nullable = true)
 |-- features: vector (nullable = true)



In [23]:
# Keep only what the regression needs: the assembled feature vector and the
# target value. The result is stored as model_df.
model_columns = ["features", "output"]
model_df = features_df.select(*model_columns)

# Preview the first 5 rows without truncating cell contents.
model_df.show(n=5, truncate=False)

+------------------------------+------+
|features                      |output|
+------------------------------+------+
|[734.0,688.0,81.0,0.328,0.259]|0.418 |
|[700.0,600.0,94.0,0.32,0.247] |0.389 |
|[712.0,705.0,93.0,0.311,0.247]|0.417 |
|[734.0,806.0,69.0,0.315,0.26] |0.415 |
|[613.0,759.0,61.0,0.302,0.24] |0.378 |
+------------------------------+------+
only showing top 5 rows



In [25]:
# Splitting the model DataFrame into training (~70%) and testing (~30%) datasets.
# The weights are proportions, so the resulting row counts are approximate.
# A fixed seed makes the split -- and every metric computed from it --
# repeatable across kernel restarts; without it each run trains and tests on
# different rows.
# NOTE: outputs recorded from an earlier, unseeded run will not match exactly.

train_df, test_df = model_df.randomSplit([0.7, 0.3], seed = 42)

In [26]:
# Counting the number of rows in the train DataFrame.
# Expect roughly 70% of model_df; randomSplit weights are not exact quotas.

train_df.count()

866

In [27]:
# Counting the number of rows in the test DataFrame.
# Expect roughly 30% of model_df; randomSplit weights are not exact quotas.

test_df.count()

366

In [28]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: read inputs from the "features" vector column and
# the target from the "output" column.
lin_reg = LinearRegression(featuresCol="features", labelCol="output")

# Fit on the training split only; the test split stays unseen for evaluation.
lin_model = lin_reg.fit(train_df)

# Report the fitted parameters: first the intercept, then the coefficient
# vector (one coefficient per entry of the feature vector).
for fitted_parameter in (lin_model.intercept, lin_model.coefficients):
    print(fitted_parameter)

0.17558729802057205
[0.00033543706932609793,5.911525748678716e-05,0.00023829620124718822,-0.6403899225466363,0.4994986199998822]


In [29]:
# Score the fitted model on the held-out test split.
# Despite its name, test_pred is an evaluation summary object: the per-row
# predictions live in its .predictions DataFrame, and the metrics (r2,
# meanSquaredError, meanAbsoluteError) are attributes of the same object.
test_pred = lin_model.evaluate(test_df)

# Preview the first 5 rows (features, actual output, prediction) untruncated.
test_predictions = test_pred.predictions
test_predictions.show(n=5, truncate=False)

+------------------------------+------+-------------------+
|features                      |output|prediction         |
+------------------------------+------+-------------------+
|[468.0,746.0,52.0,0.285,0.225]|0.329 |0.31893929258936504|
|[470.0,509.0,76.0,0.289,0.23] |0.319 |0.31125489294339415|
|[486.0,610.0,61.0,0.293,0.233]|0.332 |0.3179550202098824 |
|[524.0,665.0,65.0,0.287,0.224]|0.336 |0.3342530047663171 |
|[531.0,734.0,55.0,0.291,0.235]|0.34  |0.3412299801355283 |
+------------------------------+------+-------------------+
only showing top 5 rows



In [39]:
# R-squared (R^2) of the linear regression model on the test dataset.
test_r2 = test_pred.r2
print("R-squared (R^2) value:", test_r2)

R-squared (R^2) value: 0.8585967053005741


In [40]:
# Mean squared error of the linear regression model on the test dataset.
test_mse = test_pred.meanSquaredError
print("Mean Squared Error:", test_mse)

Mean Squared Error: 0.00014411179122737384


In [41]:
# Mean absolute error of the linear regression model on the test dataset.
test_mae = test_pred.meanAbsoluteError
print("Mean Absolute Error:", test_mae)

Mean Absolute Error: 0.009375567695583134


In [42]:
# Evaluating the trained linear regression model on the training dataset,
# i.e. the same rows it was fitted on.
train_pred = lin_model.evaluate(train_df)

# Extracting the R-squared (R^2) value from the evaluation results
train_r2 = train_pred.r2

# Printing the training R^2 so it can be compared with the test R^2;
# a training score far above the test score would suggest overfitting.
print("R-squared (R^2) value on training dataset:", train_r2)

R-squared (R^2) value on training dataset: 0.8729995516932249


0.872530015076529
