In [71]:
import findspark
findspark.init()



In [72]:
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline
from pyspark.sql.types import *

In [73]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("LogisticRegression").master("local[*]").getOrCreate()


In [74]:
PARAMETERS = {
    'INPUT_DATA_PATH': 'data/heart.csv',
    'TRAIN_DATA_PATH': 'data/train',
    'TEST_DATA_PATH': 'data/test',
    'INFER_SCHEMA': True,
    'HEADER': True,
    'TARGET_COLUMN': 'target',
    'TEST_SIZE': 0.2,
    'RANDOM_SEED': 42,
    'MAX_ITER': 10,
    'REG_PARAM': 0.3,
    'MODEL_PATH': 'models/logistic_regression',

    }





In [75]:
heart = spark.read.csv(PARAMETERS['INPUT_DATA_PATH'], 
                       inferSchema = PARAMETERS['INFER_SCHEMA'], 
                       header = PARAMETERS['HEADER'])
heart.show(3)

ORIGINAL_COLUMNS = heart.columns
COLUMNS_TO_SCALE = ['age','trestbps','chol','thalach','oldpeak']
NOT_TO_SCALE = [x for x in ORIGINAL_COLUMNS if x not in COLUMNS_TO_SCALE]

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 3 rows



In [76]:
heart.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: double (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [77]:
testDF, trainDF = heart.randomSplit([0.3, 0.7])
print(f"Train set length: {trainDF.count()}")
trainDF.show(3)

Train set length: 209
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 29|  1|  1|     130| 204|  0|      0|    202|    0|    0.0|    2|  0|   2|     1|
| 34|  1|  3|     118| 182|  0|      0|    174|    0|    0.0|    2|  0|   2|     1|
| 35|  1|  0|     120| 198|  0|      1|    130|    1|    1.6|    1|  0|   3|     0|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 3 rows



In [78]:
testDF.write.mode("overwrite").option("header", "true").csv(PARAMETERS['TEST_DATA_PATH'])
trainDF.write.mode("overwrite").option("header", "true").csv(PARAMETERS['TRAIN_DATA_PATH'])


In [79]:
trainDF_ = spark.read.csv(PARAMETERS['TRAIN_DATA_PATH'], inferSchema=PARAMETERS['INFER_SCHEMA'], header=True)
testDF = spark.read.csv(PARAMETERS['TEST_DATA_PATH'], inferSchema=PARAMETERS['INFER_SCHEMA'], header=True)
# Create the logistic regression model
lr = LogisticRegression(maxIter=PARAMETERS['MAX_ITER'], regParam= PARAMETERS['REG_PARAM'], labelCol=PARAMETERS['TARGET_COLUMN'])

In [80]:
# We scale our inputs
assembler_scale = VectorAssembler(inputCols=COLUMNS_TO_SCALE, outputCol="columns_to_scale")
scaler = MinMaxScaler(inputCol="columns_to_scale", outputCol="features_scaled")
columns = NOT_TO_SCALE + ['features_scaled']
# We create a second assembler for the scaled features.
assembler_train = VectorAssembler(inputCols=columns, outputCol="features")
                         


In [81]:
# Create stages list
myStages = [assembler_scale, scaler,  assembler_train, lr]

# Set up the pipeline
pipeline = Pipeline(stages= myStages)

In [82]:
# We fit the model using the training data.
pModel = pipeline.fit(trainDF)

# We transform the data.
trainingPred = pModel.transform(trainDF)

# # We select the actual label, probability and predictions
trainingPred.select('target','probability','prediction').show()

+------+--------------------+----------+
|target|         probability|prediction|
+------+--------------------+----------+
|     1|[0.11811290805099...|       1.0|
|     1|[0.09747661278283...|       1.0|
|     0|[0.76617856301984...|       0.0|
|     0|[0.68093490665240...|       0.0|
|     1|[0.12413018888222...|       1.0|
|     1|[0.18186659652377...|       1.0|
|     1|[0.18186659652377...|       1.0|
|     1|[0.07349765729741...|       1.0|
|     1|[0.12911017745246...|       1.0|
|     0|[0.66755096454978...|       0.0|
|     1|[0.14222147807076...|       1.0|
|     0|[0.80936526883131...|       0.0|
|     1|[0.20842148066763...|       1.0|
|     1|[0.11077611502264...|       1.0|
|     1|[0.12225122449727...|       1.0|
|     1|[0.14413629619967...|       1.0|
|     0|[0.54578947908463...|       0.0|
|     1|[0.14635697706560...|       1.0|
|     1|[0.11796814947501...|       1.0|
|     1|[0.17242473297768...|       1.0|
+------+--------------------+----------+
only showing top

In [83]:
import os

if os.path.exists(PARAMETERS['MODEL_PATH']):
    os.remove(PARAMETERS['MODEL_PATH'])

pModel.save(PARAMETERS['MODEL_PATH'])