In [74]:
# Mount Google Drive at /content/drive; the dataset path used later in this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [75]:
#Setup Spark Environment:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz
!pip install -q findspark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.1-bin-hadoop2.7"
import findspark
findspark.init()


In [92]:
# Initialize SparkContext:
from pyspark import SparkContext
try:
    # Start a local context named after this analysis.
    sc = SparkContext("local", "HealthcareRecoveryAnalysis")
except ValueError:
    # Constructing a second context raises ValueError. Reuse the active one so
    # `sc` is always bound to a live context, instead of being left undefined
    # (fresh namespace) or pointing at a stale object.
    sc = SparkContext.getOrCreate()
    print("SparkContext already exists.")

SparkContext already exists.


In [93]:
# Load the Data and Verify:
# Location of the exported CSV on the mounted Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/Data transformation/ExportCSV.csv"
# Each element of the RDD is one raw text line of the file.
data_rdd = sc.textFile(file_path)

# Pull the first five lines to the driver and show them as a sanity check
preview_rows = data_rdd.take(5)
print("Data preview:", preview_rows)


Data preview: ['Patient ID,Patient Name,Patient Last Name,Gender,Age group,Location,BMI,Diabetes,Blood pressure Systolic,Blood pressure Diastolic,Hear Rate,Smoking Status,Alcohol Use,Exercise level,Type of surgery,Surgery duration,Anaesthesia Type,Recovery Time,Length of hospital stay,Pain levels,Physical Therapy Sessions,Infection occurrence', '1,Percy,Olivier,Male,4,Lesotho,14.4346038063218,1,16,16.5702644365701,52,1,1,33.2785492368408,1,7.90176550666884,1,15,25,6,3,0', '2,Boris,Addis,Male,3,Uzbekistan,54.2754787841232,0,5,5.6898040360258,68,0,0,6.90878864233791,0,5.06706765902558,0,3,30,4,5,1', '3,Tony,Poulton,Male,1,Saudi Arabia,28.9115789341329,0,8,12.7164000383189,189,0,1,32.5247485854313,1,7.58874520081503,1,17,21,6,4,1', '4,Molly,Wild,Female,3,Tonga,24.8498306208522,0,14,4.92664265023854,167,0,0,11.7092604347082,1,6.19238860401902,0,13,9,3,4,0']


In [53]:
pip install pyspark



In [94]:
# Filter Out the Header and Parse Rows:
# Always start from the raw file rather than from `data_rdd` itself, so that
# re-running this cell does not treat the first *data* row as the header and
# silently drop it.
raw_rdd = sc.textFile(file_path)
header = raw_rdd.first()  # Extract the header row
# Bind the header text as a default argument: the predicate then compares
# against this exact value and does not depend on whatever the global
# `header` holds when the RDD is eventually evaluated.
data_rdd = raw_rdd.filter(lambda row, header_row=header: row != header_row)  # Filter out the header


In [95]:
# Define a function to extract features and ensure binary values
def extract_features(record):
    """Parse one CSV data row into a keyed ``(recovery_time, count)`` pair.

    The row is parsed with the ``csv`` module instead of ``str.split(",")`` so
    that a quoted field containing a comma (for example a location such as
    ``"Korea, Republic of"``) does not shift the column positions.

    Args:
        record: One text line of the CSV file (the header line is expected to
            have been removed already).

    Returns:
        ``((smoking_status, alcohol_use), (recovery_time, 1))`` when columns
        11, 12 and 17 parse and both flags are 0 or 1. ``None`` when the row
        is malformed, too short, or carries non-binary flags; a message is
        printed in those cases.
    """
    import csv  # function-scope import keeps the function self-contained

    try:
        # csv.reader yields nothing for an empty line; fall back to an empty
        # list so the index access below reports it as a short row.
        fields = next(csv.reader([record]), [])

        smoking_status = int(fields[11].strip())  # Smoking Status
        alcohol_use = int(fields[12].strip())  # Alcohol Use
        recovery_time = float(fields[17].strip())  # Recovery Time

        # Ensure binary values for Smoking Status and Alcohol Use
        if smoking_status in {0, 1} and alcohol_use in {0, 1}:
            # The trailing 1 is a row count, summed later to compute averages.
            return ((smoking_status, alcohol_use), (recovery_time, 1))
        else:
            print(f"Skipping row due to non-binary values: Smoking Status={smoking_status}, Alcohol Use={alcohol_use}")
            return None
    except (ValueError, IndexError, csv.Error) as e:
        print(f"Skipping row due to error: {e}")
        return None

# Parse every row and keep only those extract_features accepted (non-None)
features_rdd = data_rdd.map(extract_features).filter(lambda parsed: parsed is not None)

# Per (smoking, alcohol) key, add up the recovery times and the row counts
aggregated_rdd = features_rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Divide each total by its count to get the mean recovery time per key
average_recovery_rdd = aggregated_rdd.mapValues(
    lambda total_and_count: total_and_count[0] / total_and_count[1]
)

# Bring the per-key averages to the driver and report how many keys there are
results = average_recovery_rdd.collect()
print("Number of unique combinations:", len(results))

# Translate the 0/1 flags into readable labels and print one line per key
for (smoking_status, alcohol_use), avg_recovery_time in results:
    if smoking_status == 1:
        smoking_text = "Smoker"
    else:
        smoking_text = "Non-Smoker"
    if alcohol_use == 1:
        alcohol_text = "Alcohol User"
    else:
        alcohol_text = "Non-Alcohol User"
    print(f"Smoking Status: {smoking_text}, Alcohol Use: {alcohol_text}, Average Recovery Time: {avg_recovery_time:.2f} days")

# Shut down the SparkContext now that the analysis is finished
sc.stop()

Number of unique combinations: 4
Smoking Status: Smoker, Alcohol Use: Alcohol User, Average Recovery Time: 15.52 days
Smoking Status: Non-Smoker, Alcohol Use: Non-Alcohol User, Average Recovery Time: 15.51 days
Smoking Status: Non-Smoker, Alcohol Use: Alcohol User, Average Recovery Time: 15.52 days
Smoking Status: Smoker, Alcohol Use: Non-Alcohol User, Average Recovery Time: 15.48 days
