In [None]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# read by path from the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#Setup Spark Environment:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"
import findspark
findspark.init()


In [None]:
# Initialize SparkContext:
from pyspark import SparkConf, SparkContext

# getOrCreate() hands back the context that is already running when there is one
# and creates a new one otherwise. `sc` is therefore always bound after this cell,
# including on a re-run or when the name was deleted while the context stayed alive
# (the previous try/except only printed a message in that case and left `sc` as it was).
spark_conf = SparkConf().setMaster("local").setAppName("HealthcareRecoveryAnalysis")
sc = SparkContext.getOrCreate(spark_conf)


In [None]:
# Load the Data and Verify:
# Each element of the RDD is one raw text line of the CSV export.
file_path = "/content/drive/MyDrive/Colab Notebooks/Data transformation/ExportCSV.csv"
data_rdd = sc.textFile(file_path)

# Pull the first five lines to the driver to confirm the structure of the file.
preview_rows = data_rdd.take(5)
print("Data preview:", preview_rows)


Data preview: ['Patient ID,Patient Name,Patient Last Name,Gender,Age group,Location,BMI,Diabetes,Blood pressure Systolic,Blood pressure Diastolic,Hear Rate,Smoking Status,Alcohol Use,Exercise level,Type of surgery,Surgery duration,Anaesthesia Type,Recovery Time,Length of hospital stay,Pain levels,Physical Therapy Sessions,Infection occurrence', '1,Percy,Olivier,Male,4,Lesotho,14.4346038063218,1,16,16.5702644365701,52,1,1,33.2785492368408,1,7.90176550666884,1,15,25,6,3,0', '2,Boris,Addis,Male,3,Uzbekistan,54.2754787841232,0,5,5.6898040360258,68,0,0,6.90878864233791,0,5.06706765902558,0,3,30,4,5,1', '3,Tony,Poulton,Male,1,Saudi Arabia,28.9115789341329,0,8,12.7164000383189,189,0,1,32.5247485854313,1,7.58874520081503,1,17,21,6,4,1', '4,Molly,Wild,Female,3,Tonga,24.8498306208522,0,14,4.92664265023854,167,0,0,11.7092604347082,1,6.19238860401902,0,13,9,3,4,0']


In [None]:
pip install pyspark



In [None]:
# Filter Out the Header and Parse Rows:
# Rebuild from the file instead of from `data_rdd` itself: reassigning
# `data_rdd = data_rdd.filter(...)` made this cell non-idempotent, because on a
# re-run `first()` was taken from the already-filtered RDD and returned a data row.
raw_rdd = sc.textFile(file_path)
header = raw_rdd.first()  # Extract the header row
# The header is bound as a default argument so the filter keeps comparing against
# this exact line even if the global name `header` is reassigned before the lazy
# RDD is evaluated.
data_rdd = raw_rdd.filter(lambda row, header_line=header: row != header_line)  # Filter out the header


In [None]:
# Extract Features and Handle Errors:
def extract_features(record):
    """Parse one CSV data line into ``(diabetes_status, age_group, recovery_time)``.

    The columns are read by position: index 4 ("Age group"), index 7
    ("Diabetes") and index 17 ("Recovery Time"), following the header row of
    the export. All three values are converted to ``int``.

    Args:
        record: A single line of the CSV file as a string.

    Returns:
        A ``(diabetes_status, age_group, recovery_time)`` tuple of ints, or
        ``None`` when the line has too few columns, a value is not an integer,
        or the line cannot be parsed as CSV. A message is printed for every
        skipped line.
    """
    # Function-scope import keeps the function self-contained wherever it is run.
    import csv

    try:
        # csv.reader honours quoting, so a quoted value containing a comma
        # (e.g. "Korea, Republic of") stays in one column instead of shifting
        # every later column the way a plain split(",") does.
        fields = next(csv.reader([record]), [])
        age_group = int(fields[4].strip())
        diabetes_status = int(fields[7].strip())
        recovery_time = int(fields[17].strip())
        return (diabetes_status, age_group, recovery_time)
    except (ValueError, IndexError, csv.Error) as e:
        print(f"Skipping row due to error: {e}")
        return None

# Parse every line, then keep only the rows the parser accepted
# (rejected rows come back as None).
parsed_rdd = data_rdd.map(extract_features)
features_rdd = parsed_rdd.filter(lambda parsed: parsed is not None)


In [None]:
# Aggregate and Calculate Averages:
# Step 1: key each record by (diabetes_status, age_group); the value carries the
# recovery time together with a 1 so sums and row counts accumulate side by side.
mapped_rdd = features_rdd.map(
    lambda rec: ((rec[0], rec[1]), (rec[2], 1))
)
# Step 2: add up recovery times and row counts per key.
reduced_rdd = mapped_rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)
# Step 3: average recovery time per key = summed time / number of rows.
average_recovery_rdd = reduced_rdd.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)


In [None]:
# Display Results
# Readable label for each age-group code (codes 0 through 4).
age_group_labels = dict(enumerate(["18-30", "31-45", "46-60", "61-75", "76+"]))
# Bring the per-group averages back to the driver as a list of (key, value) pairs.
results = average_recovery_rdd.collect()

for group_key, avg_recovery_time in results:
    diabetes_status, age_group = group_key
    # Only the value 1 is reported as diabetic; anything else is shown as non-diabetic.
    if diabetes_status == 1:
        diabetes_text = "Diabetic"
    else:
        diabetes_text = "Non-Diabetic"
    # Codes without a label fall back to "Unknown".
    age_group_text = age_group_labels.get(age_group, "Unknown")
    print(f"Diabetes Status: {diabetes_text}, Age Group: {age_group_text}, Average Recovery Time: {avg_recovery_time:.2f} days")


Diabetes Status: Diabetic, Age Group: 76+, Average Recovery Time: 15.56 days
Diabetes Status: Non-Diabetic, Age Group: 61-75, Average Recovery Time: 15.49 days
Diabetes Status: Non-Diabetic, Age Group: 31-45, Average Recovery Time: 15.56 days
Diabetes Status: Non-Diabetic, Age Group: 46-60, Average Recovery Time: 15.45 days
Diabetes Status: Diabetic, Age Group: 18-30, Average Recovery Time: 15.46 days
Diabetes Status: Diabetic, Age Group: 61-75, Average Recovery Time: 15.43 days
Diabetes Status: Non-Diabetic, Age Group: 18-30, Average Recovery Time: 15.54 days
Diabetes Status: Diabetic, Age Group: 31-45, Average Recovery Time: 15.51 days
Diabetes Status: Non-Diabetic, Age Group: 76+, Average Recovery Time: 15.50 days
Diabetes Status: Diabetic, Age Group: 46-60, Average Recovery Time: 15.57 days


In [None]:
# Shut down the SparkContext now that the analysis is finished; the RDD cells
# above cannot be re-run until a context has been created again.
sc.stop()
