### DS In Pyspark
This notebook examplifies the fundamental skills in DS python integrated to `Pyspark`
- Column Work
- Data Cleansing
- SQL Functions
- Analysis 
- Data Pre-processing
- Visualization

In [181]:
import pandas as pd
import numpy as np
import datetime 
import torch
from datetime import date, datetime, timedelta
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, mean, stddev, variance, when
import pyspark.sql.functions as F
from pyspark.ml.regression import LinearRegression
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import RegressionEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, StandardScaler, MinMaxScaler,  VectorAssembler
from pyspark.sql.types import *

### Column work + DateTime


In [99]:
spark = SparkSession.builder.getOrCreate()
store_df = spark.read.csv('store_train.csv', header=True, inferSchema=True)
store_df.show(3)

+------+--------------+----------+----------+------------+-----------+---------------+---------+-------------+-----------+----------+-----------+------+---------------+---------------+------------+--------------------+------+
|Row ID|      Order ID|Order Date| Ship Date|   Ship Mode|Customer ID|  Customer Name|  Segment|      Country|       City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name| Sales|
+------+--------------+----------+----------+------------+-----------+---------------+---------+-------------+-----------+----------+-----------+------+---------------+---------------+------------+--------------------+------+
|     1|CA-2017-152156|08/11/2017|11/11/2017|Second Class|   CG-12520|    Claire Gute| Consumer|United States|  Henderson|  Kentucky|      42420| South|FUR-BO-10001798|      Furniture|   Bookcases|Bush Somerset Col...|261.96|
|     2|CA-2017-152156|08/11/2017|11/11/2017|Second Class|   CG-12520|    Claire Gute| Consumer|

In [100]:
#Change to date format 
store_df = store_df.withColumn('Order Date', F.to_date(F.col('Order Date'), 'd/M/y'))\
.withColumn('Ship Date', F.to_date(F.col('Ship Date'), 'd/M/y'))

store_df.printSchema()

root
 |-- Row ID: integer (nullable = true)
 |-- Order ID: string (nullable = true)
 |-- Order Date: date (nullable = true)
 |-- Ship Date: date (nullable = true)
 |-- Ship Mode: string (nullable = true)
 |-- Customer ID: string (nullable = true)
 |-- Customer Name: string (nullable = true)
 |-- Segment: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- State: string (nullable = true)
 |-- Postal Code: integer (nullable = true)
 |-- Region: string (nullable = true)
 |-- Product ID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Sub-Category: string (nullable = true)
 |-- Product Name: string (nullable = true)
 |-- Sales: string (nullable = true)



In [101]:
#Find time differences
store_df = store_df.withColumn('ship_time', F.datediff('Ship Date', 'Order Date'))\
.withColumn('time_pass', F.datediff(F.current_date(), 'Ship Date')) #MULTIPLY BY 1440, ETC TO FIND IN MINUTES OR OTHER UNIT
store_df.show()

+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+---------+---------+
|Row ID|      Order ID|Order Date| Ship Date|     Ship Mode|Customer ID|     Customer Name|    Segment|      Country|           City|         State|Postal Code| Region|     Product ID|       Category|Sub-Category|        Product Name|   Sales|ship_time|time_pass|
+------+--------------+----------+----------+--------------+-----------+------------------+-----------+-------------+---------------+--------------+-----------+-------+---------------+---------------+------------+--------------------+--------+---------+---------+
|     1|CA-2017-152156|2017-11-08|2017-11-11|  Second Class|   CG-12520|       Claire Gute|   Consumer|United States|      Henderson|      Kentucky|      42420|  South|FUR-BO-10001798|      Furniture|   Bookc

In [102]:
#TIME DIFF USING TIMEDELTA!!! --> NEED FOR NIKE

# Define the run date as a string
run_date = "2023-06-08"

# Parse the run date string into a datetime object
run_datetime = datetime.strptime(run_date, "%Y-%m-%d")

# Generate a series of dates by subtracting a time difference of 30*x days
date_series = [str(run_datetime - timedelta(days=30*x))[:10] for x in [0, 1, 2, 3, 4, 5, 6]]

print(date_series)
# Output: ['2023-06-08', '2023-05-09', '2023-04-09', '2023-03-10', '2023-02-08', '2023-01-08', '2022-12-09']


['2023-06-08', '2023-05-09', '2023-04-09', '2023-03-10', '2023-02-08', '2023-01-09', '2022-12-10']


In [103]:
#null values check
for col_ in store_df.columns:
  print(f'{col_} N/A:', store_df.filter(col('Customer ID').isNull()).count())

Row ID N/A: 0
Order ID N/A: 0
Order Date N/A: 0
Ship Date N/A: 0
Ship Mode N/A: 0
Customer ID N/A: 0
Customer Name N/A: 0
Segment N/A: 0
Country N/A: 0
City N/A: 0
State N/A: 0
Postal Code N/A: 0
Region N/A: 0
Product ID N/A: 0
Category N/A: 0
Sub-Category N/A: 0
Product Name N/A: 0
Sales N/A: 0
ship_time N/A: 0
time_pass N/A: 0


#### Filter Data

In [104]:
store_df.filter((F.col('city') == 'San Francisco') & (F.col('Ship Mode') == 'Second Class')).show(3)

+------+--------------+----------+----------+------------+-----------+------------------+--------+-------------+-------------+----------+-----------+------+---------------+---------------+------------+--------------------+------+---------+---------+
|Row ID|      Order ID|Order Date| Ship Date|   Ship Mode|Customer ID|     Customer Name| Segment|      Country|         City|     State|Postal Code|Region|     Product ID|       Category|Sub-Category|        Product Name| Sales|ship_time|time_pass|
+------+--------------+----------+----------+------------+-----------+------------------+--------+-------------+-------------+----------+-----------+------+---------------+---------------+------------+--------------------+------+---------+---------+
|    19|CA-2015-143336|2015-08-27|2015-09-01|Second Class|   ZD-21925|Zuschuss Donatelli|Consumer|United States|San Francisco|California|      94109|  West|OFF-AR-10003056|Office Supplies|         Art|          Newell 341|  8.56|        5|     2841|


In [105]:
#diff postal codes for second clss sf flights
store_df.filter((F.col('city') == 'San Francisco') & (F.col('Ship Mode') == 'Second Class')).select('Postal Code').distinct().show()

+-----------+
|Postal Code|
+-----------+
|      94109|
|      94122|
|      94110|
+-----------+



In [106]:
# Perform value counts on a column
value_counts = store_df.groupBy('Ship Mode').count()

# Display the value counts
value_counts.show()

+--------------+-----+
|     Ship Mode|count|
+--------------+-----+
|   First Class| 1501|
|      Same Day|  538|
|  Second Class| 1902|
|Standard Class| 5859|
+--------------+-----+



### Group and Agg data

In [108]:
store_df.select(['Ship Mode']).groupby('Ship Mode').count().show()

+--------------+-----+
|     Ship Mode|count|
+--------------+-----+
|   First Class| 1501|
|      Same Day|  538|
|  Second Class| 1902|
|Standard Class| 5859|
+--------------+-----+



In [112]:
store_df.select(['Ship Mode', 'ship_time']).groupby('Ship Mode').mean().show(truncate=True)

+--------------+-------------------+
|     Ship Mode|     avg(ship_time)|
+--------------+-------------------+
|   First Class|  2.179213857428381|
|      Same Day|0.04460966542750929|
|  Second Class| 3.2492113564668768|
|Standard Class|  5.008363201911589|
+--------------+-------------------+



### Build a logistic regression model

In [186]:
iris_df = spark.read.csv('Iris.csv', header=True, inferSchema=True)
iris_df.show(3)

+---+-------------+------------+-------------+------------+-----------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|    Species|
+---+-------------+------------+-------------+------------+-----------+
|  1|          5.1|         3.5|          1.4|         0.2|Iris-setosa|
|  2|          4.9|         3.0|          1.4|         0.2|Iris-setosa|
|  3|          4.7|         3.2|          1.3|         0.2|Iris-setosa|
+---+-------------+------------+-------------+------------+-----------+
only showing top 3 rows



In [187]:
#Encode to categorical!
indexer = StringIndexer(inputCol='Species', outputCol='Species_encoded').fit(iris_df)
iris_df = indexer.transform(iris_df)
iris_df = iris_df.drop('Species')

In [188]:
iris_df.show(3)

+---+-------------+------------+-------------+------------+---------------+
| Id|SepalLengthCm|SepalWidthCm|PetalLengthCm|PetalWidthCm|Species_encoded|
+---+-------------+------------+-------------+------------+---------------+
|  1|          5.1|         3.5|          1.4|         0.2|            0.0|
|  2|          4.9|         3.0|          1.4|         0.2|            0.0|
|  3|          4.7|         3.2|          1.3|         0.2|            0.0|
+---+-------------+------------+-------------+------------+---------------+
only showing top 3 rows



In [189]:
#Vectorize Data for Model
vector_df = VectorAssembler(inputCols=['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm'], outputCol='features').transform(iris_df)\
.select(['features', 'Species_encoded'])

vector_df.show(3)

+-----------------+---------------+
|         features|Species_encoded|
+-----------------+---------------+
|[5.1,3.5,1.4,0.2]|            0.0|
|[4.9,3.0,1.4,0.2]|            0.0|
|[4.7,3.2,1.3,0.2]|            0.0|
+-----------------+---------------+
only showing top 3 rows



In [190]:
#Scale Data
scaler = MinMaxScaler(min=0, max=1., inputCol='features', outputCol='features_scaled').fit(vector_df)
vector_df = scaler.transform(vector_df).select(['features_scaled', 'Species_encoded'])
vector_df.show(3)

+--------------------+---------------+
|     features_scaled|Species_encoded|
+--------------------+---------------+
|[0.22222222222222...|            0.0|
|[0.16666666666666...|            0.0|
|[0.11111111111111...|            0.0|
+--------------------+---------------+
only showing top 3 rows



In [191]:
vector_df.cache()

DataFrame[features_scaled: vector, Species_encoded: double]

In [192]:
#Split Data
train,test = vector_df.randomSplit([0.8,0.2], seed=0)

In [193]:
train.show()

+--------------------+---------------+
|     features_scaled|Species_encoded|
+--------------------+---------------+
|[0.0,0.4166666666...|            0.0|
|[0.02777777777777...|            0.0|
|[0.02777777777777...|            0.0|
|[0.02777777777777...|            0.0|
|[0.05555555555555...|            0.0|
|[0.08333333333333...|            0.0|
|[0.08333333333333...|            0.0|
|[0.08333333333333...|            0.0|
|[0.08333333333333...|            0.0|
|[0.11111111111111...|            0.0|
|[0.11111111111111...|            0.0|
|[0.13888888888888...|            0.0|
|[0.13888888888888...|            0.0|
|[0.13888888888888...|            0.0|
|[0.13888888888888...|            0.0|
|[0.13888888888888...|            0.0|
|[0.16666666666666...|            1.0|
|[0.16666666666666...|            2.0|
|[0.16666666666666...|            0.0|
|[0.16666666666666...|            0.0|
+--------------------+---------------+
only showing top 20 rows



In [194]:
#Develop model
lr = LogisticRegression(featuresCol='features_scaled', labelCol='Species_encoded', predictionCol='prediction')
lr_model = lr.fit(train)
pred = lr_model.transform(test)

In [200]:
#Multiclass evaluator

evaluator = MulticlassClassificationEvaluator(labelCol='Species_encoded', metricName='accuracy')


In [196]:
#GRIDSEARCHcv

#grid
param_grid = ParamGridBuilder() \
    .addGrid(lr_model.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr_model.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

In [201]:
#Gridsearch
cross_validator = CrossValidator(estimator=lr, #not lr_model since that is already fitted
                                 estimatorParamMaps=param_grid,
                                 evaluator = evaluator,
                                 numFolds=3)

cv_model = cross_validator.fit(train)

In [202]:
# Get the best model from CrossValidator

best_model = cv_model.bestModel #This is the Logit model w/ best hyperparameters***

# Access the best parameter values
best_reg_param = best_model.getOrDefault('regParam')
best_elastic_net_param = best_model.getOrDefault('elasticNetParam')

# Print the best parameter values
print("Best regParam:", best_reg_param)
print("Best elasticNetParam:", best_elastic_net_param)

Best regParam: 0.01
Best elasticNetParam: 0.0


In [205]:
pred = best_model.transform(test) #Predictions + Probability
pred.show(3)

+--------------------+---------------+--------------------+--------------------+----------+
|     features_scaled|Species_encoded|       rawPrediction|         probability|prediction|
+--------------------+---------------+--------------------+--------------------+----------+
|[0.16666666666666...|            0.0|[5.49350714942006...|[0.95179006999633...|       0.0|
|[0.16666666666666...|            0.0|[5.49350714942006...|[0.95179006999633...|       0.0|
|[0.19444444444444...|            0.0|[5.35218208902097...|[0.96302833364352...|       0.0|
+--------------------+---------------+--------------------+--------------------+----------+
only showing top 3 rows



In [212]:
evaluator = MulticlassClassificationEvaluator(labelCol='Species_encoded')

# Calculate precision
precision = evaluator.evaluate(pred, {evaluator.metricName: 'weightedPrecision'})

# Calculate recall
recall = evaluator.evaluate(pred, {evaluator.metricName: 'weightedRecall'})

# Calculate F1-score
f1_score = evaluator.evaluate(pred, {evaluator.metricName: 'f1'})

# Print the evaluation metrics
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

Precision: 1.0
Recall: 1.0
F1-score: 1.0


## A GOOD LOGISTIC REGRESSION
- Sigmoid Cutoff

In [243]:
df = spark.read.csv('bank.csv', header=True, inferSchema=True)
df.show(3)

+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
|age|       job|marital|education|default|balance|housing|loan|contact|day|month|duration|campaign|pdays|previous|poutcome|deposit|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
| 59|    admin.|married|secondary|     no|   2343|    yes|  no|unknown|  5|  may|    1042|       1|   -1|       0| unknown|    yes|
| 56|    admin.|married|secondary|     no|     45|     no|  no|unknown|  5|  may|    1467|       1|   -1|       0| unknown|    yes|
| 41|technician|married|secondary|     no|   1270|    yes|  no|unknown|  5|  may|    1389|       1|   -1|       0| unknown|    yes|
+---+----------+-------+---------+-------+-------+-------+----+-------+---+-----+--------+--------+-----+--------+--------+-------+
only showing top 3 rows



In [244]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- job: string (nullable = true)
 |-- marital: string (nullable = true)
 |-- education: string (nullable = true)
 |-- default: string (nullable = true)
 |-- balance: integer (nullable = true)
 |-- housing: string (nullable = true)
 |-- loan: string (nullable = true)
 |-- contact: string (nullable = true)
 |-- day: integer (nullable = true)
 |-- month: string (nullable = true)
 |-- duration: integer (nullable = true)
 |-- campaign: integer (nullable = true)
 |-- pdays: integer (nullable = true)
 |-- previous: integer (nullable = true)
 |-- poutcome: string (nullable = true)
 |-- deposit: string (nullable = true)



In [245]:
features = ['job', 'marital', 'education', 'default', 'housing', 'loan',
                                   'contact', 'month', 'poutcome', 'deposit']       

In [246]:
indexer = StringIndexer(inputCols=features, outputCols=[x+'_e' for x in features]).fit(df)


df = indexer.transform(df).select([x+'_e' for x in features])

In [247]:
df.show(3)

+-----+---------+-----------+---------+---------+------+---------+-------+----------+---------+
|job_e|marital_e|education_e|default_e|housing_e|loan_e|contact_e|month_e|poutcome_e|deposit_e|
+-----+---------+-----------+---------+---------+------+---------+-------+----------+---------+
|  3.0|      0.0|        0.0|      0.0|      1.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  3.0|      0.0|        0.0|      0.0|      0.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  2.0|      0.0|        0.0|      0.0|      1.0|   0.0|      1.0|    0.0|       0.0|      1.0|
+-----+---------+-----------+---------+---------+------+---------+-------+----------+---------+
only showing top 3 rows



In [249]:
# Vectorize Data
vector_df = VectorAssembler(inputCols=[x+'_e' for x in features], outputCol='features').transform(df)\
.select(['features', 'deposit_e'])

vector_df

DataFrame[features: vector, deposit_e: double]

In [250]:
vector_df.show(3)

+--------------------+---------+
|            features|deposit_e|
+--------------------+---------+
|(10,[0,4,6,9],[3....|      1.0|
|(10,[0,6,9],[3.0,...|      1.0|
|(10,[0,4,6,9],[2....|      1.0|
+--------------------+---------+
only showing top 3 rows



In [252]:
#Scale Data
scaler = MinMaxScaler(inputCol='features', outputCol='features_scaled').fit(vector_df)

vector_df = scaler.transform(vector_df).select(['features_scaled', 'deposit_e'])
vector_df.show(3)

+--------------------+---------+
|     features_scaled|deposit_e|
+--------------------+---------+
|(10,[0,4,6,9],[0....|      1.0|
|(10,[0,6,9],[0.27...|      1.0|
|(10,[0,4,6,9],[0....|      1.0|
+--------------------+---------+
only showing top 3 rows



In [255]:
#split data
train,test = vector_df.randomSplit([.8,.2], seed=0)

In [256]:
#Develop model
lr = LogisticRegression(featuresCol='features_scaled', labelCol='deposit_e', predictionCol='prediction')
lr_model = lr.fit(train) #fitted model
pred = lr_model.transform(test)

In [264]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Create a LogisticRegression instance
lr = LogisticRegression(featuresCol='features_scaled', labelCol='deposit_e', predictionCol='prediction')

# Train the logistic regression model
lr_model = lr.fit(train)

# Make predictions on the test data
predictions = lr_model.transform(test)

# Create a BinaryClassificationEvaluator with accuracy metric
evaluator = BinaryClassificationEvaluator(labelCol='deposit_e', metricName='areaUnderROC')

# Calculate the accuracy score
accuracy = evaluator.evaluate(predictions)

# Print the accuracy score
print("Accuracy:", accuracy)


Accuracy: 1.0


In [266]:
df.show()

+-----+---------+-----------+---------+---------+------+---------+-------+----------+---------+
|job_e|marital_e|education_e|default_e|housing_e|loan_e|contact_e|month_e|poutcome_e|deposit_e|
+-----+---------+-----------+---------+---------+------+---------+-------+----------+---------+
|  3.0|      0.0|        0.0|      0.0|      1.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  3.0|      0.0|        0.0|      0.0|      0.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  2.0|      0.0|        0.0|      0.0|      1.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  4.0|      0.0|        0.0|      0.0|      1.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  3.0|      0.0|        1.0|      0.0|      0.0|   0.0|      1.0|    0.0|       0.0|      1.0|
|  0.0|      1.0|        1.0|      0.0|      1.0|   1.0|      1.0|    0.0|       0.0|      1.0|
|  0.0|      0.0|        1.0|      0.0|      1.0|   1.0|      1.0|    0.0|       0.0|      1.0|
|  5.0|      2.0|        0.0|      0.0| 

In [267]:
#Find correlation with outcome var
def corr_(var):
  for col_ in df.columns:
    if col_ != var: continue
    for col2 in df.columns:
      if col_==col2 or df.select(F.col(col_)).dtypes[0][1] == 'string' or df.select(F.col(col2)).dtypes[0][1] == 'string': continue
      else:print('Correlation ({}, {}):'.format(col_, col2), df.select(F.corr(col_, col2)).first()[0])

corr_('deposit_e')

Correlation (deposit_e, job_e): 0.04114278586355233
Correlation (deposit_e, marital_e): 0.06832450727589358
Correlation (deposit_e, education_e): 0.012163805221309847
Correlation (deposit_e, default_e): -0.04068008708114991
Correlation (deposit_e, housing_e): -0.20388774933788276
Correlation (deposit_e, loan_e): -0.11057998365404831
Correlation (deposit_e, contact_e): -0.158744041418865
Correlation (deposit_e, month_e): 0.26334776846057434
Correlation (deposit_e, poutcome_e): 0.23869148422388253


In [268]:
### Another logit regress

### Another logit regress

In [270]:
df = spark.read.csv('winequality-red.csv', header=True, sep=';', 
                    inferSchema=True)
df.show(3)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.4|             0.7|        0.0|           1.9|    0.076|               11.0|                34.0| 0.9978|3.51|     0.56|    9.4|      5|
|          7.8|            0.88|        0.0|           2.6|    0.098|               25.0|                67.0| 0.9968| 3.2|     0.68|    9.8|      5|
|          7.8|            0.76|       0.04|           2.3|    0.092|               15.0|                54.0|  0.997|3.26|     0.65|    9.8|      5|
+-------------+----------------+-----------+--------------+---------+-------------------+-----------

In [274]:
df.groupBy('quality').count().show() #sampling technique

+-------+-----+
|quality|count|
+-------+-----+
|      6|  638|
|      3|   10|
|      5|  681|
|      4|   53|
|      8|   18|
|      7|  199|
+-------+-----+



In [277]:
# Vectorize Data
vector_df = VectorAssembler(inputCols=[d for d in df.columns[0:-1]], outputCol='features').transform(df)\
.select(['features', 'quality'])

vector_df.show(3)

+--------------------+-------+
|            features|quality|
+--------------------+-------+
|[7.4,0.7,0.0,1.9,...|      5|
|[7.8,0.88,0.0,2.6...|      5|
|[7.8,0.76,0.04,2....|      5|
+--------------------+-------+
only showing top 3 rows



In [278]:
#Scale Data
scaler = MinMaxScaler(inputCol='features', outputCol='features_scaled').fit(vector_df)

vector_df = scaler.transform(vector_df).select(['features_scaled', 'quality'])
vector_df.show(3)

+--------------------+-------+
|     features_scaled|quality|
+--------------------+-------+
|[0.24778761061946...|      5|
|[0.28318584070796...|      5|
|[0.28318584070796...|      5|
+--------------------+-------+
only showing top 3 rows



In [279]:
#SPLIT
train, test= vector_df.randomSplit([0.8, 0.2], seed=0)

In [280]:
#Develop model
lr = LogisticRegression(featuresCol='features_scaled', labelCol='quality', predictionCol='prediction')
lr_model = lr.fit(train) #fitted model
pred = lr_model.transform(test)

In [283]:
# Create a BinaryClassificationEvaluator with accuracy metric
evaluator = MulticlassClassificationEvaluator(labelCol='quality', metricName='accuracy')

# Calculate the accuracy score
accuracy = evaluator.evaluate(pred)

# Print the accuracy score
print("Accuracy:", accuracy)


Accuracy: 0.6005830903790087


### PRELIMINARY MODEL ABOVE^^

### THIS IS REAL MODEL W/OPTIMIZED HYPERPARMETERS

In [285]:
#Develop model
lr = LogisticRegression(featuresCol='features_scaled', labelCol='quality', predictionCol='prediction')
lr_model = lr.fit(train)
pred = lr_model.transform(test)

evaluator = MulticlassClassificationEvaluator(labelCol='quality', metricName='accuracy')


#grid
param_grid = ParamGridBuilder() \
    .addGrid(lr_model.regParam, [0.01, 0.1, 1.0]) \
    .addGrid(lr_model.elasticNetParam, [0.0, 0.5, 1.0]) \
    .build()

  #Gridsearch
cross_validator = CrossValidator(estimator=lr, #not lr_model since that is already fitted
                                 estimatorParamMaps=param_grid,
                                 evaluator = evaluator,
                                 numFolds=3)

cv_model = cross_validator.fit(train)

In [288]:
best_model = cv_model.bestModel
 
 # Make predictions using the best model
predictions = best_model.transform(test)

# Create a MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol='quality')

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: 'accuracy'})

# Calculate precision
precision = evaluator.evaluate(predictions, {evaluator.metricName: 'weightedPrecision'})

# Calculate recall
recall = evaluator.evaluate(predictions, {evaluator.metricName: 'weightedRecall'})

# Calculate F1-score
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: 'f1'})

# Print the evaluation metrics
print('Accuracy', accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_score)

Accuracy 0.5801749271137027
Precision: 0.5646232518981619
Recall: 0.5801749271137027
F1-score: 0.5422970347084131
