In [1]:
# RUN the following 2 installation lines only one time
!pip install pyspark
!pip install findspark

Collecting pyspark
  Downloading pyspark-3.5.3.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.3-py2.py3-none-any.whl size=317840625 sha256=6994e2f364d7ba4ee796c929e9f8c066adcfe83837cef788eaf8f909400229b2
  Stored in directory: /root/.cache/pip/wheels/1b/3a/92/28b93e2fbfdbb07509ca4d6f50c5e407f48dce4ddbda69a4ab
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.3
Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl.metadata (352 bytes)
Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
# Build (or reuse) a Spark session running in local mode under the app name 'Basics'
session_builder = SparkSession.builder.master('local[*]').appName('Basics')
spark = session_builder.getOrCreate()
# Property used to format output tables better
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [3]:
# Create the Spark context - this is like connecting to a Spark cluster (here, a local one)
from pyspark import SparkConf
from pyspark.context import SparkContext

# Reuse the already-running SparkContext if there is one; otherwise create
# one from this local-mode configuration.
local_conf = SparkConf().setMaster("local[*]")
sc = SparkContext.getOrCreate(local_conf)

In [5]:
# Location of the exported patient CSV
csv_path = '/content/drive/MyDrive/Pyspark/Project/ExportCSV.csv'
# Read the file as an RDD of raw text lines (one string per line)
data_rdd = sc.textFile(csv_path)
# Preview the first few lines (the header row comes first)
data_rdd.take(5)

['Patient ID,Patient Name,Patient Last Name,Gender,Age group,Location,BMI,Diabetes,Blood pressure Systolic,Blood pressure Diastolic,Hear Rate,Smoking Status,Alcohol Use,Exercise level,Type of surgery,Surgery duration,Anaesthesia Type,Recovery Time,Length of hospital stay,Pain levels,Physical Therapy Sessions,Infection occurrence',
 '1,Percy,Olivier,Male,4,Lesotho,14.4346038063218,1,16,16.5702644365701,52,1,1,33.2785492368408,1,7.90176550666884,1,15,25,6,3,0',
 '2,Boris,Addis,Male,3,Uzbekistan,54.2754787841232,0,5,5.6898040360258,68,0,0,6.90878864233791,0,5.06706765902558,0,3,30,4,5,1',
 '3,Tony,Poulton,Male,1,Saudi Arabia,28.9115789341329,0,8,12.7164000383189,189,0,1,32.5247485854313,1,7.58874520081503,1,17,21,6,4,1',
 '4,Molly,Wild,Female,3,Tonga,24.8498306208522,0,14,4.92664265023854,167,0,0,11.7092604347082,1,6.19238860401902,0,13,9,3,4,0']

In [8]:
# Filter out the header and keep only the data rows.
#
# `data_rdd` is reassigned here, so this cell must be safe to re-run: the first
# line is treated as the header only when its first field (the Patient ID
# column) is not numeric. Without this guard, a second run would take the
# first *data* row of the already-filtered RDD as the "header" and drop it.
first_line = data_rdd.first()
if not first_line.split(',', 1)[0].strip().isdigit():
    header = first_line  # Extract the header row
    # Bind the header value as a default argument so the filter keeps comparing
    # against this exact string even if `header` is reassigned later.
    data_rdd = data_rdd.filter(lambda row, header_line=header: row != header_line)
data_rdd.take(5)

['1,Percy,Olivier,Male,4,Lesotho,14.4346038063218,1,16,16.5702644365701,52,1,1,33.2785492368408,1,7.90176550666884,1,15,25,6,3,0',
 '2,Boris,Addis,Male,3,Uzbekistan,54.2754787841232,0,5,5.6898040360258,68,0,0,6.90878864233791,0,5.06706765902558,0,3,30,4,5,1',
 '3,Tony,Poulton,Male,1,Saudi Arabia,28.9115789341329,0,8,12.7164000383189,189,0,1,32.5247485854313,1,7.58874520081503,1,17,21,6,4,1',
 '4,Molly,Wild,Female,3,Tonga,24.8498306208522,0,14,4.92664265023854,167,0,0,11.7092604347082,1,6.19238860401902,0,13,9,3,4,0',
 '5,Cherish,Silva,Female,2,Belgium,50.5227091398662,0,18,3.65124117659928,123,0,0,5.58206343538224,0,7.59810593519272,1,20,6,5,6,1']

In [9]:
# Function to parse the CSV line
def parse_line(line):
    """Parse one CSV data row into the fields used by the recovery-time analysis.

    The row is parsed with the ``csv`` module rather than ``str.split(',')`` so
    that quoted fields containing commas (e.g. a location such as
    "Korea, Republic of") do not shift the column positions.

    Args:
        line: A single raw text line of the CSV file (not the header).

    Returns:
        A tuple ``(type_of_surgery, surgery_duration, anaesthesia_type,
        recovery_time)`` taken from columns 14-17, with the surgery duration
        rounded to the nearest integer, or ``None`` when the line has too few
        columns or a value cannot be converted.
    """
    import csv  # local import keeps the function self-contained

    try:
        # csv.reader expects an iterable of lines; an empty line yields no fields
        fields = next(csv.reader([line]), [])
        # Extracting values (Type of Surgery, Surgery Duration, Anesthesia Type, Recovery Time)
        # Note: "Surgery Duration" is rounded so that rows group into fewer,
        # more meaningful combinations (round() rounds exact halves to even).
        return (int(fields[14]), round(float(fields[15])), int(fields[16]), int(fields[17]))
    except (IndexError, ValueError, csv.Error) as e:
        print(f"Error parsing line: {line} - {e}")
        return None

def _is_parsed(record):
    """Return True for rows that parse_line converted successfully."""
    return record is not None


def _key_by_combination(record):
    """Turn a parsed row into ((surgery type, duration, anesthesia type), recovery time)."""
    surgery_type, duration, anesthesia_type, recovery_time = record
    return ((surgery_type, duration, anesthesia_type), recovery_time)


def _recovery_of(pair):
    """Return the recovery time of a (combination, recovery time) pair."""
    return pair[1]


# Parse every row and drop the ones that failed to parse
mapped_rdd = data_rdd.map(parse_line).filter(_is_parsed)

# Key each row by its (surgery type, duration, anesthesia type) combination
info_rdd = mapped_rdd.map(_key_by_combination)

# Keep the largest recovery time seen for each combination
max_recovery_rdd = info_rdd.reduceByKey(max)

# Bring the per-combination maxima back to the driver
results = max_recovery_rdd.collect()

# Pick the combination whose maximum recovery time is the largest
max_combination = max(results, key=_recovery_of)

# Report the winning combination
(best_surgery, best_duration, best_anesthesia), best_recovery = max_combination
print("Combination with Maximum Recovery Time:")
print(f"Type of Surgery: {best_surgery}, Surgery Duration: {best_duration}, Anesthesia Type: {best_anesthesia}, Max Recovery Time: {best_recovery} days")




Combination with Maximum Recovery Time:
Type of Surgery: 0, Surgery Duration: 5, Anesthesia Type: 0, Max Recovery Time: 30 days
