In [2]:
#!pip install pyspark

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Regression_Spark").getOrCreate()

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=2fb46a7f426faa5c4be6d8fdb3ce1ff29f547ff1ca48291b4b9d36738118f03a
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [7]:
import random

additional_rows = []

for _ in range(60):  # Number of rows needed to reach 60
    name = ''.join(random.choices('ABCDEFGHIJKLMNOPQRSTUVWXYZ', k=2))
    age = random.randint(20, 60)
    salary = random.randint(40000, 90000)
    experience = random.randint(1, 30)
    parameters = (name, age, salary, experience)
    additional_rows.append(parameters)

data = spark.createDataFrame(additional_rows, ["name", "age", "salary", "experience"])
data.show()

+----+---+------+----------+
|name|age|salary|experience|
+----+---+------+----------+
|  UZ| 36| 67397|         5|
|  BK| 48| 52394|        27|
|  LW| 50| 73074|        30|
|  GC| 20| 49735|        14|
|  LB| 27| 69004|        13|
|  YR| 41| 60547|         7|
|  TV| 30| 69285|         4|
|  KH| 25| 57484|        26|
|  AB| 35| 64738|        11|
|  SR| 45| 45130|         6|
|  XM| 28| 83385|        30|
|  TQ| 56| 82785|         8|
|  PC| 57| 41324|        23|
|  JD| 20| 47820|         1|
|  SG| 39| 62691|        19|
|  DP| 34| 75992|        19|
|  TU| 29| 73819|        13|
|  TG| 46| 81376|        25|
|  BN| 46| 88994|        12|
|  YB| 50| 45185|         5|
+----+---+------+----------+
only showing top 20 rows



In [9]:
from pyspark.ml.feature import VectorAssembler

assembled = VectorAssembler(inputCols = ["age", "experience"],
                            outputCol = "Features")

output = assembled.transform(data)
output.show()

+----+---+------+----------+-----------+
|name|age|salary|experience|   Features|
+----+---+------+----------+-----------+
|  UZ| 36| 67397|         5| [36.0,5.0]|
|  BK| 48| 52394|        27|[48.0,27.0]|
|  LW| 50| 73074|        30|[50.0,30.0]|
|  GC| 20| 49735|        14|[20.0,14.0]|
|  LB| 27| 69004|        13|[27.0,13.0]|
|  YR| 41| 60547|         7| [41.0,7.0]|
|  TV| 30| 69285|         4| [30.0,4.0]|
|  KH| 25| 57484|        26|[25.0,26.0]|
|  AB| 35| 64738|        11|[35.0,11.0]|
|  SR| 45| 45130|         6| [45.0,6.0]|
|  XM| 28| 83385|        30|[28.0,30.0]|
|  TQ| 56| 82785|         8| [56.0,8.0]|
|  PC| 57| 41324|        23|[57.0,23.0]|
|  JD| 20| 47820|         1| [20.0,1.0]|
|  SG| 39| 62691|        19|[39.0,19.0]|
|  DP| 34| 75992|        19|[34.0,19.0]|
|  TU| 29| 73819|        13|[29.0,13.0]|
|  TG| 46| 81376|        25|[46.0,25.0]|
|  BN| 46| 88994|        12|[46.0,12.0]|
|  YB| 50| 45185|         5| [50.0,5.0]|
+----+---+------+----------+-----------+
only showing top

In [10]:
final_data = output.select(["Features", "Salary"])
final_data.show()

+-----------+------+
|   Features|Salary|
+-----------+------+
| [36.0,5.0]| 67397|
|[48.0,27.0]| 52394|
|[50.0,30.0]| 73074|
|[20.0,14.0]| 49735|
|[27.0,13.0]| 69004|
| [41.0,7.0]| 60547|
| [30.0,4.0]| 69285|
|[25.0,26.0]| 57484|
|[35.0,11.0]| 64738|
| [45.0,6.0]| 45130|
|[28.0,30.0]| 83385|
| [56.0,8.0]| 82785|
|[57.0,23.0]| 41324|
| [20.0,1.0]| 47820|
|[39.0,19.0]| 62691|
|[34.0,19.0]| 75992|
|[29.0,13.0]| 73819|
|[46.0,25.0]| 81376|
|[46.0,12.0]| 88994|
| [50.0,5.0]| 45185|
+-----------+------+
only showing top 20 rows



In [23]:
from pyspark.ml.regression import LinearRegression

train_data, test_data = final_data.randomSplit([0.8, 0.2])

regression = LinearRegression(featuresCol="Features", labelCol="Salary")
regression = regression.fit(train_data)

In [25]:
print(regression.intercept)
print(regression.coefficients)

60920.029034949985
[-66.68197899805746,351.39201623853097]


In [28]:
pred = regression.evaluate(test_data)
pred.predictions.show()

+-----------+------+------------------+
|   Features|Salary|        prediction|
+-----------+------+------------------+
|[25.0,26.0]| 57484| 68389.17198220035|
|[27.0,17.0]| 89754| 65093.27987805746|
| [56.0,8.0]| 82785| 59996.97434096701|
|[24.0,28.0]| 87103| 69158.63799367547|
|[33.0,11.0]| 44047| 62584.83590663793|
|[38.0,13.0]| 54629|  62954.2100441247|
|[41.0,13.0]| 75242|62754.164107130535|
|[49.0,29.0]| 50661| 67842.98053496257|
|[54.0,24.0]| 77297| 65752.61055877962|
|[55.0,25.0]| 56656|  66037.3205960201|
+-----------+------+------------------+

