In [8]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, DateType

In [9]:
# Start the Spark session for this notebook, or reuse one that is already running.
spark = SparkSession.builder.appName("LungCancerAnalysis").getOrCreate()

In [10]:
# Load the raw CSV: the first row supplies column names and Spark infers each column's type.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Lung Cancer.csv")
)

                                                                                

In [11]:
# Preview the first five rows as a list of Row objects.
df.take(5)

[Row(id=1, age=64.0, gender='Male', country='Sweden', diagnosis_date=datetime.date(2016, 4, 5), cancer_stage='Stage I', family_history='Yes', smoking_status='Passive Smoker', bmi=29.4, cholesterol_level=199, hypertension=0, asthma=0, cirrhosis=1, other_cancer=0, treatment_type='Chemotherapy', end_treatment_date=datetime.date(2017, 9, 10), survived=0),
 Row(id=2, age=50.0, gender='Female', country='Netherlands', diagnosis_date=datetime.date(2023, 4, 20), cancer_stage='Stage III', family_history='Yes', smoking_status='Passive Smoker', bmi=41.2, cholesterol_level=280, hypertension=1, asthma=1, cirrhosis=0, other_cancer=0, treatment_type='Surgery', end_treatment_date=datetime.date(2024, 6, 17), survived=1),
 Row(id=3, age=65.0, gender='Female', country='Hungary', diagnosis_date=datetime.date(2023, 4, 5), cancer_stage='Stage III', family_history='Yes', smoking_status='Former Smoker', bmi=44.0, cholesterol_level=268, hypertension=1, asthma=1, cirrhosis=0, other_cancer=0, treatment_type='Comb

In [13]:
# ===============================================
#  Task 1: Clean data
#  - Remove duplicates
#  - Ensure correct types for numeric/date columns
#  - Convert Yes/No fields to 1/0
# ===============================================
from pyspark.sql import functions as F

def clean_data(df):
    """Return a cleaned version of the patient DataFrame.

    Steps, in order:
      1. Drop exact duplicate rows.
      2. Parse ``diagnosis_date`` and ``end_treatment_date`` as dates using the
         ``yyyy-MM-dd`` pattern.
      3. Detect flag columns: columns with at least one non-null value whose
         non-null values, compared case-insensitively, are all one of
         yes / no / 1 / 0. Each detected column is rewritten as an integer
         column holding 1 (yes / 1) or 0 (no / 0); nulls stay null.

    Args:
        df: Spark DataFrame that contains the two date columns named above.

    Returns:
        A new Spark DataFrame with the transformations applied.

    Side effects:
        Runs one Spark job per column to probe its values and prints the list
        of detected flag columns.
    """
    df = df.dropDuplicates()

    # Convert date columns
    df = df.withColumn("diagnosis_date", F.to_date("diagnosis_date", "yyyy-MM-dd")) \
           .withColumn("end_treatment_date", F.to_date("end_treatment_date", "yyyy-MM-dd"))

    truthy, falsy = ("yes", "1"), ("no", "0")
    flag_tokens = set(truthy + falsy)

    # Detect potential Yes/No columns safely
    yes_no_cols = []
    for c in df.columns:
        token = F.lower(F.col(c).cast("string"))
        # One token more than the allowed set is enough to rule a column out, so
        # cap the result instead of pulling every distinct value (e.g. every id)
        # back to the driver.
        probe = (
            df.select(token.alias("token"))
              .where(F.col("token").isNotNull())
              .distinct()
              .limit(len(flag_tokens) + 1)
              .collect()
        )
        seen = {row["token"] for row in probe}
        # `seen` must be non-empty: an all-null column carries no evidence of
        # being a flag and is left untouched.
        if seen and seen <= flag_tokens:
            yes_no_cols.append(c)

    print("🟢 Detected Yes/No columns:", yes_no_cols)

    # Convert Yes/No -> 1/0
    for c in yes_no_cols:
        token = F.lower(F.col(c).cast("string"))
        # Every branch yields an integer and the result is cast explicitly, so
        # the column type never depends on Spark's coercion of mixed int/string
        # branches. Rows matching neither branch (only nulls, given the
        # detection above) become null.
        df = df.withColumn(
            c,
            F.when(token.isin(*truthy), F.lit(1))
             .when(token.isin(*falsy), F.lit(0))
             .cast("int"),
        )

    return df

# Rebind `df` to the cleaned frame; all later task cells read this cleaned version.
df = clean_data(df)

                                                                                

🟢 Detected Yes/No columns: ['family_history', 'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'survived']


In [14]:
# ===============================================
#  Task 2: Add treatment_duration_days
#  and get average per treatment_type
# ===============================================
# Per-patient number of days from diagnosis to the end of treatment.
df = df.withColumn(
    "treatment_duration_days",
    F.datediff("end_treatment_date", "diagnosis_date"),
)

# Mean of that duration for each treatment type, rounded to two decimals.
avg_duration = (
    df.groupBy("treatment_type")
    .agg(
        F.round(F.avg(F.col("treatment_duration_days")), 2)
        .alias("avg_treatment_duration_days")
    )
)

print("✅ Average Treatment Duration by Type:")
avg_duration.show(truncate=False)

✅ Average Treatment Duration by Type:


[Stage 60:====>                                                   (1 + 11) / 12]

+--------------+---------------------------+
|treatment_type|avg_treatment_duration_days|
+--------------+---------------------------+
|Radiation     |458.4                      |
|Chemotherapy  |458.4                      |
|Combined      |457.82                     |
|Surgery       |457.74                     |
+--------------+---------------------------+



                                                                                

In [15]:
# ===============================================
#  Task 3: Smoking_status group with highest survival rate
# ===============================================
# Survival rate per smoking status: mean of `survived` as a percentage
# (two decimals), highest rate first.
smoking_survival = (
    df.groupBy("smoking_status")
    .agg(F.round(F.avg("survived") * 100, 2).alias("survival_rate_percent"))
    .sort(F.col("survival_rate_percent").desc())
)

print("✅ Smoking status with highest survival rate:")
# Only the first row is shown: the group with the highest rate.
smoking_survival.show(n=1, truncate=False)

✅ Smoking status with highest survival rate:


[Stage 66:====>                                                   (1 + 11) / 12]

+--------------+---------------------+
|smoking_status|survival_rate_percent|
+--------------+---------------------+
|Never Smoked  |22.09                |
+--------------+---------------------+
only showing top 1 row


                                                                                

In [16]:
# ===============================================
#  Task 4: Top 3 countries with highest % of Stage IV diagnoses
# ===============================================
# For each country: (number of Stage IV rows / all rows) * 100, largest share first.
stage_iv = (
    df.groupBy("country")
    .agg(
        (
            F.sum(F.when(F.col("cancer_stage") == "Stage IV", 1).otherwise(0))
            / F.count("*")
            * 100
        ).alias("stage_IV_percentage")
    )
    .sort(F.col("stage_IV_percentage").desc())
)

print("✅ Top 3 countries with highest Stage IV percentage:")
# Display only the three countries with the largest share.
stage_iv.show(n=3, truncate=False)

✅ Top 3 countries with highest Stage IV percentage:




+--------------+-------------------+
|country       |stage_IV_percentage|
+--------------+-------------------+
|Greece        |25.50223889628464  |
|Croatia       |25.427002233085883 |
|Czech Republic|25.291166185190818 |
+--------------+-------------------+
only showing top 3 rows


                                                                                

In [17]:
# ===============================================
#  Task 5: Filter specific patient criteria
#  Conditions:
#   - Male
#   - Stage III or IV
#   - Family history == Yes (1)
#   - Current smoker
#   - BMI > 30
#   - Survived == 1
# Then return:
#   - Average age
#   - % who had hypertension
# ===============================================
# Narrow the data one criterion at a time; a row must pass every filter to remain.
filtered = (
    df.where(F.lower(F.col("gender")) == "male")
    .where(F.col("cancer_stage").isin(["Stage III", "Stage IV"]))
    # Accept either encoding of a positive family history: the text "yes" or the value 1.
    .where((F.lower(F.col("family_history")) == "yes") | (F.col("family_history") == 1))
    .where(F.lower(F.col("smoking_status")) == "current smoker")
    .where(F.col("bmi") > 30)
    .where(F.col("survived") == 1)
)

# Summarise the cohort: mean age, and mean of `hypertension` expressed as a
# percentage, both rounded to two decimals.
result = filtered.agg(
    F.round(F.mean("age"), 2).alias("average_age"),
    F.round(F.mean("hypertension") * 100, 2).alias("hypertension_percentage"),
)

print("✅ Task 5 Results:")
result.show()

✅ Task 5 Results:


[Stage 78:====>                                                   (1 + 11) / 12]

+-----------+-----------------------+
|average_age|hypertension_percentage|
+-----------+-----------------------+
|      55.18|                  74.77|
+-----------+-----------------------+



                                                                                

In [None]:
# Stop the Spark session once all tasks have run.
spark.stop()