In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=873247d96ed69f13305e97e28a4260d624d769f188af1c6d13ca0f6b8b3fc8b8
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from pyspark.sql import SparkSession

In [None]:
# Obtain the notebook's Spark session (an existing one is reused if present).
spark = (
    SparkSession.builder
    .appName('Missing')
    .getOrCreate()
)

In [None]:
# Read the shopping CSV: the first row supplies column names and Spark
# infers each column's type from the data.
training = spark.read.csv(
    'customer_shopping_data.csv',
    header=True,
    inferSchema=True,
)

In [None]:
# Preview the loaded data (first 20 rows, long cells truncated — the defaults).
training.show(n=20, truncate=True)

+----------+-----------+------+---+---------------+--------+-------+--------------+------------+-----------------+
|invoice_no|customer_id|gender|age|       category|quantity|  price|payment_method|invoice_date|    shopping_mall|
+----------+-----------+------+---+---------------+--------+-------+--------------+------------+-----------------+
|   I138884|    C241288|Female| 28|       Clothing|       5| 1500.4|   Credit Card|    5/8/2022|           Kanyon|
|   I317333|    C111565|  Male| 21|          Shoes|       3|1800.51|    Debit Card|  12/12/2021|   Forum Istanbul|
|   I127801|    C266599|  Male| 20|       Clothing|       1| 300.08|          Cash|   9/11/2021|        Metrocity|
|   I173702|    C988172|Female| 66|          Shoes|       5|3000.85|   Credit Card|  16/05/2021|     Metropol AVM|
|   I337046|    C189076|Female| 53|          Books|       4|   60.6|          Cash|  24/10/2021|           Kanyon|
|   I227836|    C657758|Female| 28|       Clothing|       5| 1500.4|   Credit Ca

In [None]:
# List the column names of the loaded DataFrame to pick features and label from.
training.columns

['invoice_no',
 'customer_id',
 'gender',
 'age',
 'category',
 'quantity',
 'price',
 'payment_method',
 'invoice_date',
 'shopping_mall']

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
# Assembler that packs the numeric predictors `age` and `price` into a single
# vector column named "input".
feature = VectorAssembler(
    inputCols=["age", "price"],
    outputCol="input",
)

In [None]:
# Apply the assembler: every original column is kept and the "input" vector
# column is appended.
output = feature.transform(training)

In [None]:
# Inspect the assembled frame (first 20 rows, long cells truncated — the defaults).
output.show(n=20, truncate=True)

+----------+-----------+------+---+---------------+--------+-------+--------------+------------+-----------------+--------------+
|invoice_no|customer_id|gender|age|       category|quantity|  price|payment_method|invoice_date|    shopping_mall|         input|
+----------+-----------+------+---+---------------+--------+-------+--------------+------------+-----------------+--------------+
|   I138884|    C241288|Female| 28|       Clothing|       5| 1500.4|   Credit Card|    5/8/2022|           Kanyon| [28.0,1500.4]|
|   I317333|    C111565|  Male| 21|          Shoes|       3|1800.51|    Debit Card|  12/12/2021|   Forum Istanbul|[21.0,1800.51]|
|   I127801|    C266599|  Male| 20|       Clothing|       1| 300.08|          Cash|   9/11/2021|        Metrocity| [20.0,300.08]|
|   I173702|    C988172|Female| 66|          Shoes|       5|3000.85|   Credit Card|  16/05/2021|     Metropol AVM|[66.0,3000.85]|
|   I337046|    C189076|Female| 53|          Books|       4|   60.6|          Cash|  24/10

In [None]:
# Keep only what the regression needs: the feature vector and the label column.
df = output.select(
    "input",
    "quantity",
)

In [None]:
# Preview the modelling frame (first 20 rows, long cells truncated — the defaults).
df.show(n=20, truncate=True)

+--------------+--------+
|         input|quantity|
+--------------+--------+
| [28.0,1500.4]|       5|
|[21.0,1800.51]|       3|
| [20.0,300.08]|       1|
|[66.0,3000.85]|       5|
|   [53.0,60.6]|       4|
| [28.0,1500.4]|       5|
|  [49.0,40.66]|       1|
| [32.0,600.16]|       2|
| [69.0,900.24]|       3|
| [60.0,600.16]|       2|
|  [36.0,10.46]|       2|
|  [29.0,15.15]|       1|
| [67.0,143.36]|       4|
| [25.0,600.16]|       2|
| [67.0,600.16]|       2|
|[24.0,3000.85]|       5|
|   [65.0,30.3]|       2|
|  [42.0,15.69]|       3|
| [46.0,600.16]|       2|
| [24.0,143.36]|       4|
+--------------+--------+
only showing top 20 rows



In [None]:
from pyspark.ml.regression import LinearRegression

In [None]:
# 80/20 train/test split. A fixed seed keeps the split — and therefore the
# fitted coefficients and error metrics below — stable across re-runs.
# NOTE(review): repeatability also assumes the input partitioning is stable.
train_data, test_data = df.randomSplit([0.8, 0.2], seed=42)

In [None]:
reg=LinearRegression(featuresCol="input",labelCol='quantity')

In [None]:
reg=reg.fit(train_data)

In [None]:
# Learned weights of the fitted model, one per assembled feature
# (the assembler's inputCols order is age, then price).
reg.coefficients

DenseVector([0.0007, 0.0005])

In [None]:
# Intercept (bias) term of the fitted model.
reg.intercept

2.5930375616089334

In [None]:
# Score the fitted model on the held-out split; the returned summary carries
# the predictions and the error metrics used below.
Pred_result = reg.evaluate(test_data)

In [None]:
# Show features, true quantity and predicted quantity side by side
# (first 20 rows, long cells truncated — the defaults).
Pred_result.predictions.show(n=20, truncate=True)

+-------------+--------+------------------+
|        input|quantity|        prediction|
+-------------+--------+------------------+
|  [18.0,5.23]|       1| 2.608541013119544|
|  [18.0,5.23]|       1| 2.608541013119544|
|  [18.0,5.23]|       1| 2.608541013119544|
| [18.0,10.46]|       2|2.6113080874891064|
| [18.0,15.69]|       3|2.6140751618586684|
| [18.0,15.69]|       3|2.6140751618586684|
| [18.0,20.92]|       4|2.6168422362282304|
| [18.0,23.46]|       2| 2.618186092614213|
|  [18.0,30.3]|       2|2.6218049814646536|
| [18.0,35.19]|       3|2.6243921695463284|
| [18.0,35.84]|       1|2.6247360698025837|
| [18.0,40.66]|       1|2.6272862224720464|
| [18.0,45.45]|       3|  2.62982050282199|
| [18.0,45.45]|       3|  2.62982050282199|
|  [18.0,60.6]|       4|2.6378360241793257|
| [18.0,75.75]|       5|2.6458515455366616|
| [18.0,81.32]|       2|2.6487985061941113|
| [18.0,81.32]|       2|2.6487985061941113|
| [18.0,81.32]|       2|2.6487985061941113|
|[18.0,162.64]|       4|2.691823

In [None]:
# Test-set error metrics as a (MAE, MSE) pair.
(
    Pred_result.meanAbsoluteError,
    Pred_result.meanSquaredError,
)

(1.1763180385095802, 1.842447741004744)