In [32]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *

In [33]:
spark = SparkSession.builder.appName("demo").getOrCreate()

In [34]:
schema = StructType([
    StructField("age", IntegerType(),nullable=True),
    StructField("sex", IntegerType(),nullable=True),
    StructField("cp", IntegerType(),nullable=True),
    StructField("trestbps", IntegerType(),nullable=True),
    StructField("chol", IntegerType(),nullable=True),
    StructField("fbs", IntegerType(),nullable=True),
    StructField("restecg", IntegerType(),nullable=True),
    StructField("thalach", IntegerType(),nullable=True),
    StructField("exang", IntegerType(),nullable=True),
    StructField("oldpeak", FloatType(),nullable=True),
    StructField("slope", IntegerType(),nullable=True),
    StructField("ca", IntegerType(),nullable=True),
    StructField("thal", IntegerType(),nullable=True),
    StructField("target", IntegerType(),nullable=True),
])

In [35]:
path = "heart.csv"
df = spark.read.csv(path=path,schema=schema,header=True)

In [36]:
df.show(5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+
only showing top 5 rows



In [37]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- sex: integer (nullable = true)
 |-- cp: integer (nullable = true)
 |-- trestbps: integer (nullable = true)
 |-- chol: integer (nullable = true)
 |-- fbs: integer (nullable = true)
 |-- restecg: integer (nullable = true)
 |-- thalach: integer (nullable = true)
 |-- exang: integer (nullable = true)
 |-- oldpeak: float (nullable = true)
 |-- slope: integer (nullable = true)
 |-- ca: integer (nullable = true)
 |-- thal: integer (nullable = true)
 |-- target: integer (nullable = true)



In [38]:
for i in df.columns:
    print(i," ",":",df.stat.corr(i,"target"))

age   : -0.22543871587483838
sex   : -0.28093657550176687
cp   : 0.4337982615068946
trestbps   : -0.14493112849775
chol   : -0.08523910513756904
fbs   : -0.02804576027271302
restecg   : 0.1372295028737732
thalach   : 0.4217409338106742
exang   : -0.43675708335330315
oldpeak   : -0.4306960030062106
slope   : 0.34587707824172464
ca   : -0.39172399235125244
thal   : -0.34402926803830997
target   : 1.0


In [39]:
data = df[['age', 'trestbps', 'chol', 'thalach', 'oldpeak','sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal','target']]

In [40]:
data.show(5)

+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
|age|trestbps|chol|thalach|oldpeak|sex| cp|fbs|restecg|exang|slope| ca|thal|target|
+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
| 63|     145| 233|    150|    2.3|  1|  3|  1|      0|    0|    0|  0|   1|     1|
| 37|     130| 250|    187|    3.5|  1|  2|  0|      1|    0|    0|  0|   2|     1|
| 41|     130| 204|    172|    1.4|  0|  1|  0|      0|    0|    2|  0|   2|     1|
| 56|     120| 236|    178|    0.8|  1|  1|  0|      1|    0|    2|  0|   2|     1|
| 57|     120| 354|    163|    0.6|  0|  0|  0|      1|    1|    2|  0|   2|     1|
+---+--------+----+-------+-------+---+---+---+-------+-----+-----+---+----+------+
only showing top 5 rows



In [41]:
from pyspark.ml.feature import *

In [42]:
feature = VectorAssembler(inputCols = df.columns[:len(df.columns)-1],outputCol="features")
feature_vector= feature.transform(df)

In [43]:
feature_vector.show(5)

+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
|age|sex| cp|trestbps|chol|fbs|restecg|thalach|exang|oldpeak|slope| ca|thal|target|            features|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
| 63|  1|  3|     145| 233|  1|      0|    150|    0|    2.3|    0|  0|   1|     1|[63.0,1.0,3.0,145...|
| 37|  1|  2|     130| 250|  0|      1|    187|    0|    3.5|    0|  0|   2|     1|[37.0,1.0,2.0,130...|
| 41|  0|  1|     130| 204|  0|      0|    172|    0|    1.4|    2|  0|   2|     1|[41.0,0.0,1.0,130...|
| 56|  1|  1|     120| 236|  0|      1|    178|    0|    0.8|    2|  0|   2|     1|[56.0,1.0,1.0,120...|
| 57|  0|  0|     120| 354|  0|      1|    163|    1|    0.6|    2|  0|   2|     1|[57.0,0.0,0.0,120...|
+---+---+---+--------+----+---+-------+-------+-----+-------+-----+---+----+------+--------------------+
only showing top 5 rows



In [44]:
pandasDF = feature_vector.toPandas()

In [45]:
feature_vector_select = feature_vector.select(['features','target'])

In [46]:
(x_train, x_test) = feature_vector_select.randomSplit([0.8, 0.2])

In [47]:
type(x_train)

pyspark.sql.dataframe.DataFrame

In [48]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit

In [49]:
rf = RandomForestClassifier(labelCol='target',featuresCol="features")

In [50]:
model = rf.fit(x_train)

In [51]:
prediction = model.transform(x_test)
prediction.show(5)

In [52]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [53]:
print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(prediction))

In [31]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol='target',featuresCol="features")
model_lr = lr.fit(x_train)
prediction_lr = model_lr.transform(x_test)
print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(prediction_lr))

Accuracy:  0.8571428571428571


paramGrid = ParamGridBuilder()\
                                .addGrid(rf.maxDepth, [5, 10, 20]) \
                                .addGrid(rf.maxBins, [20, 32, 50]) \
                                .addGrid(rf.numTrees, [20, 40, 60 ]) \
                                .addGrid(rf.impurity, ["gini", "entropy"]) \
                                .addGrid(rf.minInstancesPerNode, [1, 5, 10]) \
                                .build()

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
tvs = TrainValidationSplit( estimator=rf
                           ,estimatorParamMaps=paramGrid
                           ,evaluator=MulticlassClassificationEvaluator(labelCol='target')
                           ,trainRatio=0.8)
model = tvs.fit(x_train)
model_predictions= model.transform(x_test)


print('Accuracy: ', MulticlassClassificationEvaluator(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(model_predictions))

In [54]:
import importlib
module = importlib.import_module("pyspark.ml.classification")
model_name = "RandomForestClassifier"
class_ref = getattr(module, model_name)
class_ref

pyspark.ml.classification.RandomForestClassifier

In [55]:
rf = class_ref(labelCol='target',featuresCol="features").fit(x_train)

In [56]:
module_eval = importlib.import_module("pyspark.ml.evaluation")
eval_name = "MulticlassClassificationEvaluator"
class_ref_eval = getattr(module_eval, eval_name)
class_ref_eval

pyspark.ml.evaluation.MulticlassClassificationEvaluator

In [57]:
print(class_ref_eval(labelCol='target',predictionCol="prediction",metricName='accuracy').evaluate(prediction))

0.8571428571428571


In [58]:
from Classification.constant import *

In [59]:
from Classification.utils.util import read_yaml_file

In [60]:
x = read_yaml_file(file_path="E:\PROJECTS\Spark-Project\config\model.yaml")

In [61]:
models = x['model_selection']["module_0"]
models

{'class': 'RandomForestClassifier',
 'module': 'pyspark.ml.classification',
 'params': {'featuresCol': 'features',
  'labelCol': 'target',
  'predictionCol': 'prediction'},
 'evaluation_module': 'pyspark.ml.evaluation',
 'evaluation_class': 'MulticlassClassificationEvaluator',
 'evaluationmetric': {'metricName': 'accuracy'}}

In [62]:
RandomForestClassifier_model_name = models[MODULE_CLASS_KEY]
RandomForestClassifier_model_library = models[MODULE_CLASS_MODULE_KEY]
RandomForestClassifier_modeleval_library = models[MODULE_EVAL_KEY]
RandomForestClassifier_modeleval_name = models[MODULE_EVAL_CLASS_KEY]
RandomForestClassifier_model_params_features = models[MODULE_PARAMS_KEY][MODULE_PARAMS_FEATURECOL_KEY]
RandomForestClassifier_model_params_labelcol = models[MODULE_PARAMS_KEY][MODULE_PARAMS_LABELCOL_KEY]
RandomForestClassifier_model_params_predictionCol = models[MODULE_PARAMS_KEY][MODULE_PARAMS_PREDICTIONCOL_KEY]
RandomForestClassifier_model_evaluation_metric = models[MODULE_EVAL_METRIC_KEY][MODULE_EVAL_METRIC_CLASS_KEY]

In [63]:
print(RandomForestClassifier_model_name)
print(RandomForestClassifier_modeleval_library)
print(RandomForestClassifier_model_library)
print(RandomForestClassifier_model_params_features)
print(RandomForestClassifier_modeleval_name)
print(RandomForestClassifier_model_params_labelcol)
print(RandomForestClassifier_model_evaluation_metric)
print(RandomForestClassifier_model_params_predictionCol)

RandomForestClassifier
pyspark.ml.evaluation
pyspark.ml.classification
features
MulticlassClassificationEvaluator
target
accuracy
prediction


In [64]:
model_lr = x['model_selection']["module_1"]

In [65]:
LogisticRegression_model_name = model_lr[MODULE_CLASS_KEY]
LogisticRegression_model_library = model_lr[MODULE_CLASS_MODULE_KEY]
LogisticRegression_model_params_features = model_lr[MODULE_PARAMS_KEY][MODULE_PARAMS_FEATURECOL_KEY]
LogisticRegression_model_params_labelcol = model_lr[MODULE_PARAMS_KEY][MODULE_PARAMS_LABELCOL_KEY]

In [71]:
module_lr = importlib.import_module(LogisticRegression_model_library)
class_ref = getattr(module_lr, LogisticRegression_model_name)
lr = class_ref(labelCol=LogisticRegression_model_params_labelcol,featuresCol=LogisticRegression_model_params_features).fit(x_train)
            

In [72]:
module_rf = importlib.import_module(RandomForestClassifier_model_library)
class_ref_rf = getattr(module_rf, RandomForestClassifier_model_name)
rf = class_ref_rf(labelCol=RandomForestClassifier_model_params_labelcol,featuresCol=RandomForestClassifier_model_params_features).fit(x_train)

In [74]:
x = [lr,rf]

In [75]:
x[1]

RandomForestClassificationModel: uid=RandomForestClassifier_611b59497fb2, numTrees=20, numClasses=2, numFeatures=13

In [76]:
x[0]

LogisticRegressionModel: uid=LogisticRegression_3c9d3d1fecc0, numClasses=2, numFeatures=13

In [8]:
import importlib
module = importlib.import_module(RandomForestClassifier_model_library)
class_ref = getattr(module, RandomForestClassifier_model_name)
class_ref

pyspark.ml.classification.RandomForestClassifier

In [58]:
rf = class_ref(labelCol=RandomForestClassifier_model_params_labelcol,featuresCol=RandomForestClassifier_model_params_features).fit(x_train)

In [59]:
module_eval = importlib.import_module(RandomForestClassifier_modeleval_library)
class_ref_eval = getattr(module_eval, RandomForestClassifier_modeleval_name)
class_ref_eval

pyspark.ml.evaluation.MulticlassClassificationEvaluator

In [60]:
print(class_ref_eval(labelCol= RandomForestClassifier_model_params_labelcol,
                    predictionCol=RandomForestClassifier_model_params_predictionCol,
                    metricName= RandomForestClassifier_model_evaluation_metric).evaluate(prediction))

0.8771929824561403


In [None]:
RandomForestClassifier_modeleval_library = models[MODULE_EVAL_KEY]
RandomForestClassifier_modeleval_name = models[MODULE_EVAL_CLASS_KEY]
RandomForestClassifier_model_params_predictionCol = models[MODULE_PARAMS_KEY][MODULE_PARAMS_PREDICTIONCOL_KEY]
RandomForestClassifier_model_evaluation_metric = models[MODULE_EVAL_METRIC_KEY][MODULE_EVAL_METRIC_CLASS_KEY]
