In [119]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression 
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [84]:
# Create (or reuse) the SparkSession used by every cell below
session_builder = SparkSession.builder.appName('Project')
spark = session_builder.getOrCreate()
spark

In [85]:
# Load the CSV: the first line is treated as the header and column types are inferred
csv_reader = spark.read.option('header', 'true')
df = csv_reader.csv('indian_liver_patient.csv', inferSchema=True)


In [86]:
# Check the schema: print each column's name, inferred type and nullability
df.printSchema()

root
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Total_Bilirubin: double (nullable = true)
 |-- Direct_Bilirubin: double (nullable = true)
 |-- Alkaline_Phosphotase: integer (nullable = true)
 |-- Alamine_Aminotransferase: integer (nullable = true)
 |-- Aspartate_Aminotransferase: integer (nullable = true)
 |-- Total_Protiens: double (nullable = true)
 |-- Albumin: double (nullable = true)
 |-- Albumin_and_Globulin_Ratio: double (nullable = true)
 |-- Dataset: integer (nullable = true)



In [87]:
# Show the first 5 rows of the loaded DataFrame
df.show(5)

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                      0.74|      1|
| 62|  Male|            7.3|             4.1|                 490|                      60|                        

In [88]:
# Summary statistics per column; show(3) prints only the first three summary rows
df.describe().show(3)

+-------+------------------+------+-----------------+------------------+--------------------+------------------------+--------------------------+------------------+-----------------+--------------------------+------------------+
|summary|               Age|Gender|  Total_Bilirubin|  Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|    Total_Protiens|          Albumin|Albumin_and_Globulin_Ratio|           Dataset|
+-------+------------------+------+-----------------+------------------+--------------------+------------------------+--------------------------+------------------+-----------------+--------------------------+------------------+
|  count|               583|   583|              583|               583|                 583|                     583|                       583|               583|              583|                       579|               583|
|   mean| 44.74614065180103|  null|3.298799313893652|1.4861063464837074|  290.576329

In [89]:
# List the column names of the loaded DataFrame
df.columns

['Age',
 'Gender',
 'Total_Bilirubin',
 'Direct_Bilirubin',
 'Alkaline_Phosphotase',
 'Alamine_Aminotransferase',
 'Aspartate_Aminotransferase',
 'Total_Protiens',
 'Albumin',
 'Albumin_and_Globulin_Ratio',
 'Dataset']

In [90]:
# Split the columns by Spark data type: string columns are treated as
# categorical, every other column (including the 'Dataset' label) as numerical.
categorical_columns = []
numerical_columns = []

for column in df.schema.fields:
    # Compare typeName() rather than str(dataType): the string form of a
    # DataType differs between PySpark versions ('StringType' vs 'StringType()'),
    # while typeName() is 'string' for StringType in all of them.
    if column.dataType.typeName() == 'string':
        categorical_columns.append(column.name)
    else:
        numerical_columns.append(column.name)

In [91]:
# Names of the columns classified as categorical (string-typed)
print(categorical_columns)

['Gender']


In [92]:
# Names of the columns classified as numerical (every non-string column)
print(numerical_columns)

['Age', 'Total_Bilirubin', 'Direct_Bilirubin', 'Alkaline_Phosphotase', 'Alamine_Aminotransferase', 'Aspartate_Aminotransferase', 'Total_Protiens', 'Albumin', 'Albumin_and_Globulin_Ratio', 'Dataset']


In [93]:
# Build one aggregate per column that counts the rows where that column is null
null_count_exprs = [
    count(when(col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
null_counts = df.select(null_count_exprs)

# Print the single-row result: one null count per column
null_counts.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|  0|     0|              0|               0|                   0|                       0|                         0|             0|      0|                         4|      0|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+



In [104]:
# Replace the missing Albumin_and_Globulin_Ratio values with 0.0.
# na.fill returns a new DataFrame instead of modifying df, so the result is
# assigned back; otherwise the nulls stay in df and reach the VectorAssembler.
# NOTE(review): 0.0 is unlikely to be a realistic ratio — consider imputing the
# column mean or median instead.
df = df.na.fill(0.0, 'Albumin_and_Globulin_Ratio')
df.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|                      0.74|      1|
| 62|  Male|            7.3|             4.1|                 490|                      60|                        

In [113]:
# Categorical encoding
indexer = StringIndexer(inputCol='Gender',outputCol='Gender_encoded')
indexed_df = indexer.fit(df).transform(df)


In [115]:
# VectorAssembler: pack the listed feature columns into one vector column
# NOTE(review): 'Albumin' is not in this list — confirm the omission is intentional
feature_columns = [
    'Age',
    'Total_Bilirubin',
    'Direct_Bilirubin',
    'Alkaline_Phosphotase',
    'Alamine_Aminotransferase',
    'Aspartate_Aminotransferase',
    'Total_Protiens',
    'Albumin_and_Globulin_Ratio',
    'Indexed_column',
]
featureassembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol='Independent Fatures',
)

In [116]:
# Output df: add the assembled feature-vector column to indexed_df and preview it
output = featureassembler.transform(indexed_df)
output.show()

+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+--------------+--------------------+
|Age|Gender|Total_Bilirubin|Direct_Bilirubin|Alkaline_Phosphotase|Alamine_Aminotransferase|Aspartate_Aminotransferase|Total_Protiens|Albumin|Albumin_and_Globulin_Ratio|Dataset|Indexed_column| Independent Fatures|
+---+------+---------------+----------------+--------------------+------------------------+--------------------------+--------------+-------+--------------------------+-------+--------------+--------------------+
| 65|Female|            0.7|             0.1|                 187|                      16|                        18|           6.8|    3.3|                       0.9|      1|           1.0|[65.0,0.7,0.1,187...|
| 62|  Male|           10.9|             5.5|                 699|                      64|                       100|           7.5|    3.2|       

In [117]:
# List the columns of the assembled DataFrame
output.columns

['Age',
 'Gender',
 'Total_Bilirubin',
 'Direct_Bilirubin',
 'Alkaline_Phosphotase',
 'Alamine_Aminotransferase',
 'Aspartate_Aminotransferase',
 'Total_Protiens',
 'Albumin',
 'Albumin_and_Globulin_Ratio',
 'Dataset',
 'Indexed_column',
 'Independent Fatures']

In [118]:
# Keep only what the model needs: the feature vector and the label column
model_columns = ['Independent Fatures', 'Dataset']
finalized_data = output.select(*model_columns)
finalized_data.show()

+--------------------+-------+
| Independent Fatures|Dataset|
+--------------------+-------+
|[65.0,0.7,0.1,187...|      1|
|[62.0,10.9,5.5,69...|      1|
|[62.0,7.3,4.1,490...|      1|
|[58.0,1.0,0.4,182...|      1|
|[72.0,3.9,2.0,195...|      1|
|[46.0,1.8,0.7,208...|      1|
|[26.0,0.9,0.2,154...|      1|
|[29.0,0.9,0.3,202...|      1|
|[17.0,0.9,0.3,202...|      2|
|[55.0,0.7,0.2,290...|      1|
|[57.0,0.6,0.1,210...|      1|
|[72.0,2.7,1.3,260...|      1|
|[64.0,0.9,0.3,310...|      2|
|[74.0,1.1,0.4,214...|      1|
|[61.0,0.7,0.2,145...|      1|
|[25.0,0.6,0.1,183...|      2|
|[38.0,1.8,0.8,342...|      1|
|[33.0,1.6,0.5,165...|      2|
|[40.0,0.9,0.3,293...|      1|
|[40.0,0.9,0.3,293...|      1|
+--------------------+-------+
only showing top 20 rows



In [120]:
# Randomly split into 80% train / 20% test; the fixed seed keeps the split repeatable
splits = finalized_data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits


In [124]:
# Model: logistic regression on the assembled feature vector.
# The column names are passed explicitly because train_data has no columns
# called 'features' or 'label' (the estimator's defaults).
# NOTE(review): 'Dataset' holds the values 1 and 2 rather than 0/1, so Spark
# presumably sizes the model for labels 0..2 — recode the label to 0/1 if a
# strictly binary model is wanted.
classifier = LogisticRegression(featuresCol='Independent Fatures', labelCol='Dataset')

# fit() returns the trained model; it is stored in `model` because the
# prediction cell calls model.transform(), which only the fitted model provides.
model = classifier.fit(train_data)


In [None]:
# Predictions: score the held-out rows with the fitted model
predictions = model.transform(test_data)

# BinaryClassificationEvaluator expects a 0/1 label and uses the class-1 score
# from rawPrediction. 'Dataset' holds 1/2, so derive an indicator that is 1.0
# for Dataset == 1 and 0.0 otherwise, matching the class the score refers to.
scored = predictions.withColumn(
    'label', when(col('Dataset') == 1, 1.0).otherwise(0.0)
)

evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')

# The evaluator's default metric is areaUnderROC, not classification accuracy;
# the name `accuracy` is kept so later cells that read it keep working.
accuracy = evaluator.evaluate(scored)
accuracy