<a href="https://colab.research.google.com/github/Ricardo-Jaramillo/PySpark/blob/main/06_Logistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Logistic Regression in Spark

We use the logistic function to output a value between 0 and 1. Based on this probability, we assign a class.

We evaluate the logistic regression model with a confusion matrix.

Binary classification has some special classification metrics of its own. These include visualizations of metrics derived from the confusion matrix. The **Receiver Operating Characteristic (ROC)** curve was developed during World War II to help analyze radar data.

In [1]:
# Install pyspark
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=b0e4ee3e730d0cff32a1d7a001a1653295588102a92bdc68e87ccacc9c2afc14
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# Download the file
!wget https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/sample_libsvm_data.txt

--2023-10-03 18:05:15--  https://raw.githubusercontent.com/Ricardo-Jaramillo/PySpark/main/datasets/sample_libsvm_data.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 104736 (102K) [text/plain]
Saving to: ‘sample_libsvm_data.txt’


2023-10-03 18:05:15 (3.62 MB/s) - ‘sample_libsvm_data.txt’ saved [104736/104736]



In [6]:
# Import the libraries
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression

In [4]:
# Start (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName('mylogreg')
    .getOrCreate()
)

In [5]:
# Load the downloaded LIBSVM-formatted file into a DataFrame
my_data = (
    spark.read
    .format('libsvm')
    .load('sample_libsvm_data.txt')
)

In [7]:
# Preview the first 20 rows (the default row count, written out)
my_data.show(n=20)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|(692,[127,128,129...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[124,125,126...|
|  1.0|(692,[152,153,154...|
|  1.0|(692,[151,152,153...|
|  0.0|(692,[129,130,131...|
|  1.0|(692,[158,159,160...|
|  1.0|(692,[99,100,101,...|
|  0.0|(692,[154,155,156...|
|  0.0|(692,[127,128,129...|
|  1.0|(692,[154,155,156...|
|  0.0|(692,[153,154,155...|
|  0.0|(692,[151,152,153...|
|  1.0|(692,[129,130,131...|
|  0.0|(692,[154,155,156...|
|  1.0|(692,[150,151,152...|
|  0.0|(692,[124,125,126...|
|  0.0|(692,[152,153,154...|
|  1.0|(692,[97,98,99,12...|
|  1.0|(692,[124,125,126...|
+-----+--------------------+
only showing top 20 rows



## Create a Logistic Regression model

In [9]:
# Instantiate the estimator; the column names passed here are the library defaults
my_log_reg_model = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)

In [12]:
# Train the estimator on the full dataset
fitted_logreg = my_log_reg_model.fit(dataset=my_data)

In [13]:
# Keep a handle on the summary object attached to the fitted model;
# its .predictions DataFrame is inspected in the following cells
log_summary = fitted_logreg.summary

In [14]:
# Print the column names and types of the summary's predictions DataFrame
log_summary.predictions.printSchema()

root
 |-- label: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [15]:
# Display the model's predictions for the rows it was fitted on (first 20 rows)
log_summary.predictions.show(n=20)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[127,128,129...|[20.3777627514862...|[0.99999999858729...|       0.0|
|  1.0|(692,[158,159,160...|[-21.114014198867...|[6.76550380001560...|       1.0|
|  1.0|(692,[124,125,126...|[-23.743613234676...|[4.87842678715831...|       1.0|
|  1.0|(692,[152,153,154...|[-19.192574012719...|[4.62137287298722...|       1.0|
|  1.0|(692,[151,152,153...|[-20.125398874697...|[1.81823629113437...|       1.0|
|  0.0|(692,[129,130,131...|[20.4890549504187...|[0.99999999873608...|       0.0|
|  1.0|(692,[158,159,160...|[-21.082940212813...|[6.97903542824686...|       1.0|
|  1.0|(692,[99,100,101,...|[-19.622713503566...|[3.00582577441380...|       1.0|
|  0.0|(692,[154,155,156...|[21.1594863606570...|[0.99999999935352...|       0.0|
|  0.0|(692,[127

## Recreate the model this time with train and test data

In [18]:
# Split the data into a training set (~70%) and a test set (~30%).
# A fixed seed keeps the split, and therefore the metrics computed from it,
# the same across kernel restarts instead of changing on every run.
lr_train, lr_test = my_data.randomSplit([0.7, 0.3], seed=42)

In [19]:
# Build a fresh estimator for the train/test workflow (default column names written out)
final_model = LogisticRegression(
    featuresCol='features',
    labelCol='label',
)

In [20]:
# Train on the training split only
fit_final = final_model.fit(dataset=lr_train)

In [35]:
# Evaluate the fitted model on the held-out test split. The result is a summary
# object; its .predictions DataFrame holds each label next to the model's prediction.
prediction_and_labels = fit_final.evaluate(dataset=lr_test)

In [26]:
# Display the test-set rows together with their predicted values (first 20 rows)
prediction_and_labels.predictions.show(n=20)

+-----+--------------------+--------------------+--------------------+----------+
|label|            features|       rawPrediction|         probability|prediction|
+-----+--------------------+--------------------+--------------------+----------+
|  0.0|(692,[98,99,100,1...|[29.6833079991584...|[0.99999999999987...|       0.0|
|  0.0|(692,[122,123,124...|[17.7154635373606...|[0.99999997975713...|       0.0|
|  0.0|(692,[123,124,125...|[32.1792903550335...|[0.99999999999998...|       0.0|
|  0.0|(692,[124,125,126...|[21.2478159674730...|[0.99999999940817...|       0.0|
|  0.0|(692,[125,126,127...|[25.6108362437688...|[0.99999999999246...|       0.0|
|  0.0|(692,[126,127,128...|[26.8158102281239...|[0.99999999999774...|       0.0|
|  0.0|(692,[127,128,129...|[18.1487106634118...|[0.99999998687452...|       0.0|
|  0.0|(692,[128,129,130...|[18.2615423856376...|[0.99999998827499...|       0.0|
|  0.0|(692,[152,153,154...|[15.1233939839040...|[0.99999972960831...|       0.0|
|  0.0|(692,[152

In [27]:
## Evaluate the model

In [29]:
# Import necessary functions
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

In [30]:
# Build the evaluator; every argument below is the library default, spelled out
# so the reader can see which columns and which metric are used
my_eval = BinaryClassificationEvaluator(
    rawPredictionCol='rawPrediction',
    labelCol='label',
    metricName='areaUnderROC',
)

In [36]:
# Score the test-set predictions with the evaluator and display the resulting metric
my_final_roc = my_eval.evaluate(dataset=prediction_and_labels.predictions)
my_final_roc

0.9554655870445342