
## Overview

This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# File location and type
file_location = "/FileStore/tables/tuberculosis_xray_dataset.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

display(df)

Patient_ID,Age,Gender,Chest_Pain,Cough_Severity,Breathlessness,Fatigue,Weight_Loss,Fever,Night_Sweats,Sputum_Production,Blood_in_Sputum,Smoking_History,Previous_TB_History,Class
PID000001,69,Male,Yes,1,2,3,2.37,Moderate,Yes,Medium,Yes,Former,Yes,Normal
PID000002,32,Female,Yes,3,0,9,6.09,Moderate,No,Medium,No,Current,Yes,Normal
PID000003,89,Male,No,7,0,3,2.86,Mild,Yes,Medium,No,Current,No,Tuberculosis
PID000004,78,Female,Yes,2,0,6,4.57,Moderate,No,High,Yes,Never,Yes,Tuberculosis
PID000005,38,Male,No,7,2,5,13.86,High,Yes,Low,No,Never,Yes,Tuberculosis
PID000006,41,Female,Yes,1,3,7,2.23,Mild,No,Low,Yes,Current,No,Normal
PID000007,20,Male,Yes,0,1,0,7.11,Moderate,Yes,Low,No,Never,Yes,Normal
PID000008,39,Male,Yes,5,0,9,9.91,High,Yes,High,No,Never,No,Normal
PID000009,70,Male,Yes,7,0,1,13.77,Moderate,Yes,Medium,No,Never,No,Tuberculosis
PID000010,19,Female,No,8,2,1,13.85,Moderate,Yes,High,No,Former,No,Tuberculosis


In [0]:
# Create a view or table

temp_table_name = "tuberculosis_xray_dataset_csv"

df.createOrReplaceTempView(temp_table_name)

In [0]:
%sql

/* Query the created temp table in a SQL cell */

select * from `tuberculosis_xray_dataset_csv`

Patient_ID,Age,Gender,Chest_Pain,Cough_Severity,Breathlessness,Fatigue,Weight_Loss,Fever,Night_Sweats,Sputum_Production,Blood_in_Sputum,Smoking_History,Previous_TB_History,Class
PID000001,69,Male,Yes,1,2,3,2.37,Moderate,Yes,Medium,Yes,Former,Yes,Normal
PID000002,32,Female,Yes,3,0,9,6.09,Moderate,No,Medium,No,Current,Yes,Normal
PID000003,89,Male,No,7,0,3,2.86,Mild,Yes,Medium,No,Current,No,Tuberculosis
PID000004,78,Female,Yes,2,0,6,4.57,Moderate,No,High,Yes,Never,Yes,Tuberculosis
PID000005,38,Male,No,7,2,5,13.86,High,Yes,Low,No,Never,Yes,Tuberculosis
PID000006,41,Female,Yes,1,3,7,2.23,Mild,No,Low,Yes,Current,No,Normal
PID000007,20,Male,Yes,0,1,0,7.11,Moderate,Yes,Low,No,Never,Yes,Normal
PID000008,39,Male,Yes,5,0,9,9.91,High,Yes,High,No,Never,No,Normal
PID000009,70,Male,Yes,7,0,1,13.77,Moderate,Yes,Medium,No,Never,No,Tuberculosis
PID000010,19,Female,No,8,2,1,13.85,Moderate,Yes,High,No,Former,No,Tuberculosis


In [0]:
# With this registered as a temp view, it will only be available to this particular notebook. If you'd like other users to be able to query this table, you can also create a table from the DataFrame.
# Once saved, this table will persist across cluster restarts as well as allow various users across different notebooks to query this data.
# To do so, choose your table name and uncomment the bottom line.

permanent_table_name = "tuberculosis_xray_dataset_csv"

# df.write.format("parquet").saveAsTable(permanent_table_name)

In [0]:
#import packages
from pyspark.sql.types import *
import pyspark
from pyspark.sql.functions import *
from pyspark.sql import functions as F

In [0]:
df.printSchema()

root
 |-- Patient_ID: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Gender: string (nullable = true)
 |-- Chest_Pain: string (nullable = true)
 |-- Cough_Severity: integer (nullable = true)
 |-- Breathlessness: integer (nullable = true)
 |-- Fatigue: integer (nullable = true)
 |-- Weight_Loss: double (nullable = true)
 |-- Fever: string (nullable = true)
 |-- Night_Sweats: string (nullable = true)
 |-- Sputum_Production: string (nullable = true)
 |-- Blood_in_Sputum: string (nullable = true)
 |-- Smoking_History: string (nullable = true)
 |-- Previous_TB_History: string (nullable = true)
 |-- Class: string (nullable = true)



In [0]:
#Find the average Cough Severity for each Class:
s1 = df.groupBy("Class").agg(avg("Cough_Severity")).display()

Class,avg(Cough_Severity)
Tuberculosis,4.496282527881041
Normal,4.489277091322255


In [0]:
#Determine the count of patients with Chest Pain for each gender:
s2 = df.filter(col("Chest_Pain")=="Yes").groupBy("Gender").count().display()

Gender,count
Female,4897
Male,4898


In [0]:
#Identify the age group with the highest Breathlessness severity:

from pyspark.sql.functions import col
# Find the maximum value of Breathlessness
s3 = df.agg({"Breathlessness": "max"}).collect()[0][0]  # Collects the result and accesses the value
# Filter the rows where Breathlessness matches the maximum value
result = df.filter(col("Breathlessness") == s3).select("Age", "Breathlessness")
# Show the result
result.show()

+---+--------------+
|Age|Breathlessness|
+---+--------------+
| 77|             4|
| 32|             4|
| 79|             4|
| 79|             4|
| 68|             4|
| 38|             4|
| 21|             4|
| 77|             4|
| 88|             4|
| 61|             4|
| 52|             4|
| 21|             4|
| 71|             4|
| 80|             4|
| 31|             4|
| 89|             4|
| 41|             4|
| 62|             4|
| 80|             4|
| 52|             4|
+---+--------------+
only showing top 20 rows



In [0]:
#Find the percentage of patients with Smoking History categorized by Class:
s4 = df.filter(col("Smoking_History")=="Current")
s41 = s4.groupBy("Class").count().display()

Class,count
Tuberculosis,1919
Normal,4674


In [0]:
#Identify patients with Blood in Sputum and severe Fatigue (above a threshold) Sputum is Medium and Fatigue > 5

s5 = df.filter((col("Sputum_Production")=="Medium") & (col("Fatigue")>5)).groupby("Gender").count().display()

Gender,count
Female,1321
Male,1409


In [0]:
#Find the average Weight Loss for patients with Previous Tuberculosis History:
s6 = df.filter(col("Previous_TB_History")=="Yes").groupBy("Previous_TB_History").agg(avg("Weight_Loss")).display()

Previous_TB_History,avg(Weight_Loss)
Yes,7.504503824376658


In [0]:
#Find the maximum Cough Severity for patients with a history of Smoking:
s7 = df.filter((col("Smoking_History")=="Current")).agg(max("Cough_Severity")).display()

max(Cough_Severity)
9


In [0]:
#Count the number of patients who exhibit both Night Sweats and Fever by Gender
s8 =df.filter((col("Night_Sweats")=="Yes") & (col("Fever")=="High")).groupBy("Gender").count().display()

Gender,count
Female,1548
Male,1660


In [0]:
#Determine which Gender has a higher average Fatigue:
s9 = df.groupby("Gender").agg(avg("Fatigue")).display()

Gender,avg(Fatigue)
Female,4.516024010580934
Male,4.501130665617933
