In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('practice_pyspark').getOrCreate()

In [10]:
df_spark = spark.read.option('header', 'True').csv('healthcare-dataset-stroke-data.csv')
df_spark.show(2)

+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender|age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male| 67|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female| 61|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+---+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [11]:
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [12]:
spark

In [13]:
#Lets try to check the schema of the Dataframe
df_spark.printSchema()
#it is observed that all the features are by default considers as 'String'
#to correct it, Inferschema needs to be set to True

root
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: string (nullable = true)
 |-- hypertension: string (nullable = true)
 |-- heart_disease: string (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: string (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: string (nullable = true)



In [16]:
df_spark = spark.read.csv('healthcare-dataset-stroke-data.csv', header = True, inferSchema = True)
df_spark.printSchema()
#Now all the features are considered according to the feature itself

root
 |-- id: integer (nullable = true)
 |-- gender: string (nullable = true)
 |-- age: double (nullable = true)
 |-- hypertension: integer (nullable = true)
 |-- heart_disease: integer (nullable = true)
 |-- ever_married: string (nullable = true)
 |-- work_type: string (nullable = true)
 |-- Residence_type: string (nullable = true)
 |-- avg_glucose_level: double (nullable = true)
 |-- bmi: string (nullable = true)
 |-- smoking_status: string (nullable = true)
 |-- stroke: integer (nullable = true)



In [21]:
#how to get all the column nams
df_spark.columns
#how to select particular column or multiple columns
df_spark.select(['age', 'id']).show(2)

+----+-----+
| age|   id|
+----+-----+
|67.0| 9046|
|61.0|51676|
+----+-----+
only showing top 2 rows



In [27]:
#lets check whether describe works in pysaprk
#df_spark.describe().show()
df_spark.select(['age', 'id']).describe().show()

+-------+------------------+-----------------+
|summary|               age|               id|
+-------+------------------+-----------------+
|  count|              5110|             5110|
|   mean|43.226614481409015|36517.82935420744|
| stddev| 22.61264672311348|21161.72162482715|
|    min|              0.08|               67|
|    max|              82.0|            72940|
+-------+------------------+-----------------+



In [35]:
'''collect() can be used to collect all the distributed data to driver side as the local data in python.This can cause memeory
error if the dataset is too large. hence take() or tail() is used to avoid out of memeory exception.'''
df_spark.collect()
df_spark.take(1)

[Row(id=9046, gender='Male', age=67.0, hypertension=0, heart_disease=1, ever_married='Yes', work_type='Private', Residence_type='Urban', avg_glucose_level=228.69, bmi='36.6', smoking_status='formerly smoked', stroke=1)]

In [37]:
#pyspark has conversion back to pandas df to leavarage pandas API's
df_spark.toPandas()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24,never smoked,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5105,18234,Female,80.0,1,0,Yes,Private,Urban,83.75,,never smoked,0
5106,44873,Female,81.0,0,0,Yes,Self-employed,Urban,125.20,40,never smoked,0
5107,19723,Female,35.0,0,0,Yes,Self-employed,Rural,82.99,30.6,never smoked,0
5108,37544,Male,51.0,0,0,Yes,Private,Rural,166.29,25.6,formerly smoked,0


In [42]:
#adding columns
df_spark = df_spark.withColumn('age after 2 years', df_spark['age']+2)

In [45]:
#dropping columns
df_spark = df_spark.drop('age after 2 years')
df_spark

DataFrame[id: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: string, smoking_status: string, stroke: int]

In [47]:
# Rename columns
df_spark.withColumnRenamed('id','patient ID')

DataFrame[patient ID: int, gender: string, age: double, hypertension: int, heart_disease: int, ever_married: string, work_type: string, Residence_type: string, avg_glucose_level: double, bmi: string, smoking_status: string, stroke: int]

In [49]:
#dropping null values
df_spark.na.drop().show(2)
#it will drop all the rows whereve there us null value

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [52]:
#any = how drops the row where all the rows are null
df_spark.na.drop(how = 'any').show(2)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [57]:
df_spark.na.drop(how = 'any', thresh = 1).show(2)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [62]:
#dropping rows from specific values
df_spark.na.drop(how = 'any', subset =['bmi']).show(1)

+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
|  id|gender| age|hypertension|heart_disease|ever_married|work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
|9046|  Male|67.0|           0|            1|         Yes|  Private|         Urban|           228.69|36.6|formerly smoked|     1|
+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
only showing top 1 row



In [65]:
#filling the missing values
df_spark.na.fill('not Available').show(1)

+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
|  id|gender| age|hypertension|heart_disease|ever_married|work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
|9046|  Male|67.0|           0|            1|         Yes|  Private|         Urban|           228.69|36.6|formerly smoked|     1|
+----+------+----+------------+-------------+------------+---------+--------------+-----------------+----+---------------+------+
only showing top 1 row



In [69]:
#filter operations
df_spark.filter('age>=50').show(2)

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [71]:
df_spark.filter('age>=50').select(['id', 'age']).show(2)

+-----+----+
|   id| age|
+-----+----+
| 9046|67.0|
|51676|61.0|
+-----+----+
only showing top 2 rows



In [78]:
#how  to write multiple conditions
df_spark.filter((df_spark['age']>= 50) & (df_spark['avg_glucose_level'] >= 200)).show(2) 

+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
|   id|gender| age|hypertension|heart_disease|ever_married|    work_type|Residence_type|avg_glucose_level| bmi| smoking_status|stroke|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
| 9046|  Male|67.0|           0|            1|         Yes|      Private|         Urban|           228.69|36.6|formerly smoked|     1|
|51676|Female|61.0|           0|            0|         Yes|Self-employed|         Rural|           202.21| N/A|   never smoked|     1|
+-----+------+----+------------+-------------+------------+-------------+--------------+-----------------+----+---------------+------+
only showing top 2 rows



In [88]:
#Groupby 
df_spark.groupBy('age').count().show(5)

+----+-----+
| age|count|
+----+-----+
|67.0|   49|
|70.0|   45|
| 8.0|   58|
|69.0|   54|
| 7.0|   32|
+----+-----+
only showing top 5 rows



In [89]:
spark.stop

<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x000001B3708ED8D0>>