In [33]:
import os
# Find the latest version of spark 3.2 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.3'
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Hit:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:4 http://archive.ubuntu.com/ubuntu focal InRelease
Hit:5 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:6 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:7 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Hit:8 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:9 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Get:10 http://security.ubuntu.com/ubuntu focal-security/main amd64 Packages [2,593 kB]
Get:11 http://archive.ubuntu.com/ubuntu focal-updates/universe amd64 Packages [1,325 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Hit:13 http://ppa.launchpad.net/ubuntugis/ppa/ubuntu focal InRelease

In [2]:
# Download the PostgreSQL JDBC driver (version 42.2.16) that will allow Spark
# to interact with Postgres. wget saves the jar into the current working directory.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-04-13 22:47:38--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2023-04-13 22:47:39 (3.40 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession

# Create (or reuse) the Spark session with the Postgres JDBC jar on the driver classpath.
spark = (
    SparkSession.builder
    .appName("Final-Project")
    .config("spark.driver.extraClassPath", "/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

### Load Diabetes Key Indicators Data into Spark DataFrame

In [4]:
from pyspark import SparkFiles

# Load the diabetes key-indicators survey data from S3 into a Spark DataFrame.
url = "https://unbearable-1-project-bucket.s3.us-east-2.amazonaws.com/Diabetes_Key_Indicators.csv"
spark.sparkContext.addFile(url)

# Resolve the downloaded file by its name. SparkFiles.get("") would return the
# SparkFiles root directory instead, so the CSV reader would try to parse every
# file that has ever been added to this Spark context, not just this one.
csv_name = url.rsplit("/", 1)[-1]
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(csv_name), sep=",", header=True, inferSchema=True)
df.show()

+---+------+------+-----------+------+-----+-------+--------+-------------+-------------+--------+--------+--------+--------+--------+-------+-------+--------+-------+--------------------+--------+--------+--------+--------+--------+-------+-------+--------+--------+-------+--------+-------+------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+------+--------+-------+--------+--------+--------+--------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+--------+-------+-----+--------+--------+--------+--------+--------+-------+--------+-------+--------+-------+-------+----+-----+------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+--------+--------+--------+--------+-------+--------+------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+----

In [36]:
# Survey columns retained for the analysis; all other columns are dropped.
selected_columns = [
    "DIABETE4", "_RACE", "TOLDHI3", "BPHIGH6",
    "_BMI5", "SMOKE100", "_RFBING5", "EDUCA",
    "GENHLTH", "_AGEG5YR", "EXERANY2", "FRUIT2",
    "VEGETAB2", "_INCOMG1", "MEDCOST1", "_SEX",
]
new_df = df.select(selected_columns)
new_df.show()

+--------+-----+-------+-------+------+--------+--------+-----+-------+--------+--------+------+--------+--------+--------+----+
|DIABETE4|_RACE|TOLDHI3|BPHIGH6| _BMI5|SMOKE100|_RFBING5|EDUCA|GENHLTH|_AGEG5YR|EXERANY2|FRUIT2|VEGETAB2|_INCOMG1|MEDCOST1|_SEX|
+--------+-----+-------+-------+------+--------+--------+-----+-------+--------+--------+------+--------+--------+--------+----+
|     3.0|  1.0|    1.0|    3.0|1454.0|     1.0|     1.0|  4.0|    5.0|    11.0|     2.0| 101.0|   101.0|     3.0|     2.0| 2.0|
|     1.0|  2.0|    1.0|    1.0|  null|     2.0|     1.0|  6.0|    3.0|    10.0|     1.0| 101.0|   207.0|     9.0|     2.0| 2.0|
|     1.0|  2.0|    2.0|    1.0|2829.0|     2.0|     1.0|  4.0|    2.0|    11.0|     2.0| 101.0|   203.0|     2.0|     2.0| 2.0|
|     1.0|  1.0|    1.0|    1.0|3347.0|     2.0|     2.0|  4.0|    2.0|     9.0|     1.0| 203.0|   205.0|     5.0|     2.0| 2.0|
|     1.0|  7.0|    1.0|    4.0|2873.0|     2.0|     1.0|  3.0|    5.0|    12.0|     1.0| 101.0| 

### Clean up Columns

In [37]:
import pandas as pd

# Collect the selected Spark columns into pandas and discard every row
# that contains at least one missing value.
pandas_df = new_df.toPandas().dropna()
pandas_df

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,3.0,1.0,1.0,3.0,1454.0,1.0,1.0,4.0,5.0,11.0,2.0,101.0,101.0,3.0,2.0,2.0
2,1.0,2.0,2.0,1.0,2829.0,2.0,1.0,4.0,2.0,11.0,2.0,101.0,203.0,2.0,2.0,2.0
3,1.0,1.0,1.0,1.0,3347.0,2.0,2.0,4.0,2.0,9.0,1.0,203.0,205.0,5.0,2.0,2.0
4,1.0,7.0,1.0,4.0,2873.0,2.0,1.0,3.0,5.0,12.0,1.0,101.0,101.0,2.0,2.0,1.0
5,3.0,1.0,2.0,3.0,2437.0,1.0,1.0,5.0,3.0,13.0,2.0,202.0,201.0,4.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438687,1.0,8.0,1.0,1.0,2148.0,2.0,2.0,2.0,4.0,10.0,1.0,315.0,205.0,2.0,2.0,1.0
438688,2.0,9.0,2.0,1.0,2469.0,1.0,1.0,4.0,2.0,3.0,1.0,210.0,103.0,3.0,1.0,2.0
438690,3.0,2.0,1.0,3.0,3068.0,2.0,1.0,6.0,2.0,7.0,1.0,102.0,101.0,6.0,2.0,1.0
438691,3.0,9.0,2.0,1.0,2373.0,2.0,1.0,4.0,2.0,10.0,1.0,101.0,101.0,4.0,2.0,1.0


In [25]:
# List the distinct EDUCA codes so each one can be replaced with a label.
pandas_df["EDUCA"].unique()

array([4., 5., 6., 3., 2., 1., 9.])

In [44]:
# Swap each EDUCA code for the label given on the survey response form
# (code 9 becomes an empty string), then list the resulting values to confirm.
pandas_df["EDUCA"] = pandas_df["EDUCA"].replace(
    {
        4.0: "High School Grad",
        6.0: "College Grad",
        3.0: "Some High School",
        5.0: "Some College",
        2.0: "Elementary",
        1.0: "None",
        9.0: "",
    }
)
pandas_df["EDUCA"].unique()


array(['High School Grad', 'Some High School', 'Some College',
       'College Grad', 'Elementary', '', 'None'], dtype=object)

In [27]:
# List the distinct GENHLTH codes before recoding them.
pandas_df["GENHLTH"].unique()

array([2., 3., 4., 7., 1., 5., 9.])

In [45]:
# Swap each GENHLTH code for the label given on the survey response form
# (codes 7 and 9 become empty strings), then list the resulting values to confirm.
pandas_df["GENHLTH"] = pandas_df["GENHLTH"].replace(
    {
        1.0: "Excellent",
        2.0: "Very Good",
        3.0: "Good",
        4.0: "Fair",
        5.0: "Poor",
        7.0: "",
        9.0: "",
    }
)
pandas_df["GENHLTH"].unique()

array(['Poor', 'Very Good', 'Good', 'Fair', 'Excellent', ''], dtype=object)

In [38]:
# List the distinct _INCOMG1 codes before recoding them.
pandas_df["_INCOMG1"].unique()

array([3., 2., 5., 4., 9., 1., 6., 7.])

In [46]:
# Swap each _INCOMG1 code for the income bracket given on the survey response form
# (code 9 becomes an empty string), then list the resulting values to confirm.
pandas_df["_INCOMG1"] = pandas_df["_INCOMG1"].replace(
    {
        1.0: "Less than $15,000",
        2.0: "$15,000 to < $25,000",
        3.0: "$25,000 to < $35,000",
        4.0: "$35,000 to < $50,000",
        5.0: "$50,000 to < $100,000",
        6.0: "$100,000 to < $200,000",
        7.0: " > $200,000",
        9.0: "",
    }
)
pandas_df["_INCOMG1"].unique()

array(['$25,000 to < $35,000', '$15,000 to < $25,000',
       '$50,000 to < $100,000', '$35,000 to < $50,000', nan,
       'Less than $15,000', '$100,000 to < $200,000', ' > $200,000'],
      dtype=object)

In [43]:
# Display the DataFrame to inspect the current state of its columns.
pandas_df

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,3.0,1.0,1.0,3.0,1454.0,1.0,1.0,4.0,5.0,11.0,2.0,101.0,101.0,"$25,000 to < $35,000",2.0,2.0
2,1.0,2.0,2.0,1.0,2829.0,2.0,1.0,4.0,2.0,11.0,2.0,101.0,203.0,"$15,000 to < $25,000",2.0,2.0
3,1.0,1.0,1.0,1.0,3347.0,2.0,2.0,4.0,2.0,9.0,1.0,203.0,205.0,"$50,000 to < $100,000",2.0,2.0
4,1.0,7.0,1.0,4.0,2873.0,2.0,1.0,3.0,5.0,12.0,1.0,101.0,101.0,"$15,000 to < $25,000",2.0,1.0
5,3.0,1.0,2.0,3.0,2437.0,1.0,1.0,5.0,3.0,13.0,2.0,202.0,201.0,"$35,000 to < $50,000",2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438687,1.0,8.0,1.0,1.0,2148.0,2.0,2.0,2.0,4.0,10.0,1.0,315.0,205.0,"$15,000 to < $25,000",2.0,1.0
438688,2.0,9.0,2.0,1.0,2469.0,1.0,1.0,4.0,2.0,3.0,1.0,210.0,103.0,"$25,000 to < $35,000",1.0,2.0
438690,3.0,2.0,1.0,3.0,3068.0,2.0,1.0,6.0,2.0,7.0,1.0,102.0,101.0,"$100,000 to < $200,000",2.0,1.0
438691,3.0,9.0,2.0,1.0,2373.0,2.0,1.0,4.0,2.0,10.0,1.0,101.0,101.0,"$35,000 to < $50,000",2.0,1.0


In [41]:
# Keep every cell that is not an empty string; empty-string cells become NaN.
pandas_df = pandas_df.where(pandas_df != "")

In [42]:
# Count the missing values in each column.
pandas_df.isnull().sum()

DIABETE4        0
_RACE           0
TOLDHI3         0
BPHIGH6         0
_BMI5           0
SMOKE100        0
_RFBING5        0
EDUCA           0
GENHLTH         0
_AGEG5YR        0
EXERANY2        0
FRUIT2          0
VEGETAB2        0
_INCOMG1    51064
MEDCOST1        0
_SEX            0
dtype: int64