In [1]:
!pip install pyspark

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/45/b0/9d6860891ab14a39d4bddf80ba26ce51c2f9dc4805e5c6978ac0472c120a/pyspark-3.1.1.tar.gz (212.3MB)
[K     |████████████████████████████████| 212.3MB 67kB/s 
[?25hCollecting py4j==0.10.9
[?25l  Downloading https://files.pythonhosted.org/packages/9e/b6/6a4fb90cd235dc8e265a6a2067f2a2c99f0d91787f06aca4bcf7c23f3f80/py4j-0.10.9-py2.py3-none-any.whl (198kB)
[K     |████████████████████████████████| 204kB 41.1MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.1.1-py2.py3-none-any.whl size=212767604 sha256=ce7465939119dc1bdcb59c523bfb088263236b58d0d7b9ed35dae354ce7df018
  Stored in directory: /root/.cache/pip/wheels/0b/90/c0/01de724414ef122bd05f056541fb6a0ecf47c7ca655f8b3c0f
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9 pyspark-3.1.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.functions import col
import numpy as np
import os

In [3]:
# winutils is only needed by Hadoop on Windows. Skip it on other platforms
# (the CSV in this notebook is read from /content/drive, i.e. a Linux runtime)
# and do not clobber a HADOOP_HOME that the environment already provides.
if os.name == "nt":
    os.environ.setdefault("HADOOP_HOME", "C:/winutils")

In [4]:
# Start (or reuse) the Spark session for this notebook, registered as "ICP7".
spark = (
    SparkSession.builder
    .appName("ICP7")
    .getOrCreate()
)

# Only surface errors so Spark's INFO/WARN chatter does not flood cell output.
spark.sparkContext.setLogLevel("ERROR")

In [5]:
# Read the comma-separated file: the first row supplies the column names and
# Spark infers each column's type from the values.
data = (
    spark.read.format("csv")
    .option("header", True)
    .option("inferSchema", True)
    .option("delimiter", ",")
    .load("/content/drive/MyDrive/diabetic_data.csv")
)

# Preview the first five rows.
data.show(5)

+------------+-----------+---------------+------+-------+------+-----------------+------------------------+-------------------+----------------+----------+--------------------+------------------+--------------+---------------+-----------------+----------------+----------------+------+------+------+----------------+-------------+---------+---------+-----------+-----------+--------------+-----------+-------------+---------+---------+-----------+------------+-------------+--------+--------+------------+----------+-------+-----------+-------+-------------------+-------------------+------------------------+-----------------------+----------------------+------+-----------+----------+
|encounter_id|patient_nbr|           race|gender|    age|weight|admission_type_id|discharge_disposition_id|admission_source_id|time_in_hospital|payer_code|   medical_specialty|num_lab_procedures|num_procedures|num_medications|number_outpatient|number_emergency|number_inpatient|diag_1|diag_2|diag_3|number_diagnos

In [6]:
# Print the column names and the types produced by inferSchema, to confirm the
# CSV was parsed as expected before selecting features.
data.printSchema()

root
 |-- encounter_id: integer (nullable = true)
 |-- patient_nbr: integer (nullable = true)
 |-- race: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- weight: string (nullable = true)
 |-- admission_type_id: integer (nullable = true)
 |-- discharge_disposition_id: integer (nullable = true)
 |-- admission_source_id: integer (nullable = true)
 |-- time_in_hospital: integer (nullable = true)
 |-- payer_code: string (nullable = true)
 |-- medical_specialty: string (nullable = true)
 |-- num_lab_procedures: integer (nullable = true)
 |-- num_procedures: integer (nullable = true)
 |-- num_medications: integer (nullable = true)
 |-- number_outpatient: integer (nullable = true)
 |-- number_emergency: integer (nullable = true)
 |-- number_inpatient: integer (nullable = true)
 |-- diag_1: string (nullable = true)
 |-- diag_2: string (nullable = true)
 |-- diag_3: string (nullable = true)
 |-- number_diagnoses: integer (nullable = true)
 |-

In [7]:
# Names of every column whose Spark type is exactly 'int'.
numeric_features = [name for name, dtype in data.dtypes if dtype == 'int']

# Summary statistics (count/mean/stddev/min/max) for those columns, transposed
# so that each column becomes one row of the displayed table.
data.select(numeric_features).describe().toPandas().T

Unnamed: 0,0,1,2,3,4
summary,count,mean,stddev,min,max
encounter_id,101766,1.652016456229782E8,1.0264029598345754E8,12522,443867222
patient_nbr,101766,5.4330400694947235E7,3.869635934653452E7,135,189502619
admission_type_id,101766,2.024006053102215,1.4454028297561141,1,8
discharge_disposition_id,101766,3.7156417664052825,5.280165509299276,1,28
admission_source_id,101766,5.754436648782501,4.064080834283895,1,25
time_in_hospital,101766,4.395986871843248,2.985107767471271,1,14
num_lab_procedures,101766,43.09564098028811,19.674362249142067,1,132
num_procedures,101766,1.339730361810428,1.7058069791211607,0,6
num_medications,101766,16.021844230882614,8.127566209167297,1,81


In [8]:
# Keep only the integer-typed columns; every other column is dropped from
# `data` from this point on.
data = data.select(
    [
        "encounter_id",
        "patient_nbr",
        "admission_type_id",
        "discharge_disposition_id",
        "admission_source_id",
        "time_in_hospital",
        "num_lab_procedures",
        "num_procedures",
        "num_medications",
        "number_outpatient",
        "number_emergency",
        "number_inpatient",
        "number_diagnoses",
    ]
)


In [9]:
# Pack the selected columns into a single "features" vector column.
#
# This cell rebinds `data`, so it must stay safe to re-run: once `data` already
# carries a "features" column, feeding it back in as an input and asking for it
# again as the output makes VectorAssembler fail. Excluding it from the inputs
# and dropping any earlier copy (a no-op on the first run) makes the cell
# idempotent without changing the first-run result.
#
# NOTE(review): encounter_id and patient_nbr look like identifiers rather than
# measurements, and their ~1e8 magnitude dominates the printed cluster centers.
# Consider excluding them and/or scaling the features - confirm intent.
assembler = VectorAssembler(
    inputCols=[name for name in data.columns if name != "features"],
    outputCol="features",
)
data = assembler.transform(data.drop("features"))

In [12]:
# Fit K-Means with 4 clusters; the fixed seed makes the run repeatable.
kmeans = KMeans(k=4, seed=1)
model = kmeans.fit(data)

In [13]:
# Fitted centroids, one array per cluster.
centers = model.clusterCenters()

# Apply the fitted model to the same rows it was trained on.
predictions = model.transform(data)

# Report the centroids, one per line.
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[2.52253565e+08 6.69269641e+07 1.75386082e+00 3.10166826e+00
 5.45081030e+00 4.18875119e+00 4.36344137e+01 1.31282173e+00
 1.66933746e+01 5.58198284e-01 2.97855100e-01 7.11058151e-01
 7.99285033e+00]
[6.37113388e+07 2.36018150e+07 2.28603305e+00 4.73503742e+00
 6.76842254e+00 4.60556419e+00 4.38627313e+01 1.39341901e+00
 1.53232029e+01 1.86216636e-01 9.04674481e-02 5.78675328e-01
 6.67843525e+00]
[1.54459882e+08 6.62242825e+07 2.02153280e+00 3.27142655e+00
 4.99793172e+00 4.37988384e+00 4.23099589e+01 1.29689758e+00
 1.61003258e+01 4.10454739e-01 2.23629409e-01 6.47967134e-01
 7.60481655e+00]
[3.77976343e+08 9.43735415e+07 1.67479675e+00 2.96886774e+00
 5.47392425e+00 4.14772953e+00 4.20316280e+01 1.35712869e+00
 1.68027960e+01 4.75609756e-01 2.76422764e-01 6.34840373e-01
 8.21108467e+00]
