In [1]:
import os
# Find the latest version of spark 3.2 from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.2.3'
spark_version = 'spark-3.2.3'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://archive.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set Environment Variables
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop2.7"

# Start a SparkSession
import findspark
findspark.init()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
Get:2 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
Get:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease [1,581 B]
Get:5 http://archive.ubuntu.com/ubuntu focal-updates InRelease [114 kB]
Hit:6 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu focal InRelease
Get:7 http://archive.ubuntu.com/ubuntu focal-backports InRelease [108 kB]
Hit:8 http://ppa.launchpad.net/cran/libgit2/ubuntu focal InRelease
Get:9 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  Packages [993 kB]
Hit:10 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu focal InRelease
Get:11 http://security.ubuntu.com/ubuntu focal-security/restricted amd64 Packages [2,139 kB]
Hit:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu focal InRelease
Get:13 http://archive.ubuntu.com/ubuntu focal-updates/res

In [2]:
# Download the Postgres driver that will allow Spark to interact with Postgres.
!wget https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2023-04-20 15:37:31--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2023-04-20 15:37:31 (6.01 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Final-Project").config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar").getOrCreate()

### Load Amazon Data into Spark DataFrame

In [4]:
from pyspark import SparkFiles
# Load in the review data from S3 into the dataframe
url = "https://unbearable-1-project-bucket.s3.us-east-2.amazonaws.com/Diabetes_Key_Indicators.csv"
spark.sparkContext.addFile(url)
df = spark.read.option("encoding", "UTF-8").csv(SparkFiles.get(""), sep=",", header=True, inferSchema=True)
df.show()

+---+------+------+-----------+------+-----+-------+--------+-------------+-------------+--------+--------+--------+--------+--------+-------+-------+--------+-------+--------------------+--------+--------+--------+--------+--------+-------+-------+--------+--------+-------+--------+-------+------+-------+--------+--------+--------+--------+--------+--------+--------+--------+-------+------+--------+-------+--------+--------+--------+--------+-------+-------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+--------+--------+-------+-----+--------+--------+--------+--------+--------+-------+--------+-------+--------+-------+-------+----+-----+------+--------+--------+--------+--------+--------+-------+--------+-------+--------+--------+--------+--------+--------+--------+--------+-------+--------+------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+--------+-------+--------+-------+----

In [5]:
new_df = df.select(["DIABETE4",
                    "_RACE",
                    "TOLDHI3",
                    "BPHIGH6",
                    "_BMI5",
                    "SMOKE100",
                    "_RFBING5",
                    "EDUCA",
                    "GENHLTH",
                    "_AGEG5YR",
                    "EXERANY2",
                    "FRUIT2",
                    "VEGETAB2",
                    "_INCOMG1",
                    "MEDCOST1",
                    "_SEX"])
new_df.show()

+--------+-----+-------+-------+------+--------+--------+-----+-------+--------+--------+------+--------+--------+--------+----+
|DIABETE4|_RACE|TOLDHI3|BPHIGH6| _BMI5|SMOKE100|_RFBING5|EDUCA|GENHLTH|_AGEG5YR|EXERANY2|FRUIT2|VEGETAB2|_INCOMG1|MEDCOST1|_SEX|
+--------+-----+-------+-------+------+--------+--------+-----+-------+--------+--------+------+--------+--------+--------+----+
|     3.0|  1.0|    1.0|    3.0|1454.0|     1.0|     1.0|  4.0|    5.0|    11.0|     2.0| 101.0|   101.0|     3.0|     2.0| 2.0|
|     1.0|  2.0|    1.0|    1.0|  null|     2.0|     1.0|  6.0|    3.0|    10.0|     1.0| 101.0|   207.0|     9.0|     2.0| 2.0|
|     1.0|  2.0|    2.0|    1.0|2829.0|     2.0|     1.0|  4.0|    2.0|    11.0|     2.0| 101.0|   203.0|     2.0|     2.0| 2.0|
|     1.0|  1.0|    1.0|    1.0|3347.0|     2.0|     2.0|  4.0|    2.0|     9.0|     1.0| 203.0|   205.0|     5.0|     2.0| 2.0|
|     1.0|  7.0|    1.0|    4.0|2873.0|     2.0|     1.0|  3.0|    5.0|    12.0|     1.0| 101.0| 

### Clean up Columns

In [6]:
import pandas as pd
pandas_df = new_df.toPandas()
pandas_df = pandas_df.dropna()

pandas_df

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,3.0,1.0,1.0,3.0,1454.0,1.0,1.0,4.0,5.0,11.0,2.0,101.0,101.0,3.0,2.0,2.0
2,1.0,2.0,2.0,1.0,2829.0,2.0,1.0,4.0,2.0,11.0,2.0,101.0,203.0,2.0,2.0,2.0
3,1.0,1.0,1.0,1.0,3347.0,2.0,2.0,4.0,2.0,9.0,1.0,203.0,205.0,5.0,2.0,2.0
4,1.0,7.0,1.0,4.0,2873.0,2.0,1.0,3.0,5.0,12.0,1.0,101.0,101.0,2.0,2.0,1.0
5,3.0,1.0,2.0,3.0,2437.0,1.0,1.0,5.0,3.0,13.0,2.0,202.0,201.0,4.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438687,1.0,8.0,1.0,1.0,2148.0,2.0,2.0,2.0,4.0,10.0,1.0,315.0,205.0,2.0,2.0,1.0
438688,2.0,9.0,2.0,1.0,2469.0,1.0,1.0,4.0,2.0,3.0,1.0,210.0,103.0,3.0,1.0,2.0
438690,3.0,2.0,1.0,3.0,3068.0,2.0,1.0,6.0,2.0,7.0,1.0,102.0,101.0,6.0,2.0,1.0
438691,3.0,9.0,2.0,1.0,2373.0,2.0,1.0,4.0,2.0,10.0,1.0,101.0,101.0,4.0,2.0,1.0


In [7]:
# check for unique values to replace for encoding
pandas_df.EDUCA.unique()

array([4., 3., 5., 6., 2., 9., 1.])

In [8]:
# Replace Values based on responses provided in survey response form and confirm updated values
pandas_df = pandas_df.replace({"EDUCA":{4.:"High School Grad",
                                        6.:"College Grad",
                                        3.:"Some High School",
                                        5.:"Some College",
                                        2.:"Elementary",
                                        1.:"None",
                                        9.:""}})
pandas_df.EDUCA.unique()


array(['High School Grad', 'Some High School', 'Some College',
       'College Grad', 'Elementary', '', 'None'], dtype=object)

In [9]:
pandas_df.GENHLTH.unique()

array([5., 2., 3., 4., 1., 7., 9.])

In [10]:
# Replace Values based on responses provided in survey response form and confirm confirm updated values
pandas_df = pandas_df.replace({"GENHLTH":{1.:"Excellent",
                                          2.:"Very Good",
                                          3.:"Good",
                                          4.:"Fair",
                                          5.:"Poor",
                                          7.:"",
                                          9.:""}})
pandas_df.GENHLTH.unique()

array(['Poor', 'Very Good', 'Good', 'Fair', 'Excellent', ''], dtype=object)

In [11]:
pandas_df._INCOMG1.unique()

array([3., 2., 5., 4., 9., 1., 6., 7.])

In [12]:
# Replace Values based on responses provided in survey response form and confirm confirm updated values
pandas_df = pandas_df.replace({"_INCOMG1":{1.:"Less than $15,000",
                                           2.:"$15,000 to < $25,000",
                                           3.:"$25,000 to < $35,000",
                                           4.:"$35,000 to < $50,000",
                                           5.:"$50,000 to < $100,000",
                                           6.:"$100,000 to < $200,000",
                                           7.:" > $200,000",
                                           9.:""}})
pandas_df._INCOMG1.unique()

array(['$25,000 to < $35,000', '$15,000 to < $25,000',
       '$50,000 to < $100,000', '$35,000 to < $50,000', '',
       'Less than $15,000', '$100,000 to < $200,000', ' > $200,000'],
      dtype=object)

In [13]:
# Check unique values for Race
pandas_df._RACE.unique()

array([1., 2., 7., 9., 8., 3., 4., 6., 5.])

In [14]:
# Replace values based on responses in survey for Race
pandas_df = pandas_df.replace({"_RACE":{1.:"White",
                               2.:"Black",
                               3.:"American Indian or Alaskan Native",
                               4.:"Asian",
                               5.:" Native Hawaiian or other Pacific Islander",
                               6.:"Other",
                               7.:"Multiracial",
                               8.:"Hispanic",
                               9.:""}})
pandas_df._RACE.unique()

array(['White', 'Black', 'Multiracial', '', 'Hispanic',
       'American Indian or Alaskan Native', 'Asian', 'Other',
       ' Native Hawaiian or other Pacific Islander'], dtype=object)

In [15]:
# Check unique values for TOLDHI3 (high cholesterol)
pandas_df.TOLDHI3.unique()

array([1., 2., 7., 9.])

In [16]:
# Replace values based on responses in survey for cholesterol
pandas_df = pandas_df.replace({"TOLDHI3":{1.:"Yes",
                                          2.:"No",
                                          7.:"",
                                          9.:""}})
pandas_df.TOLDHI3.unique()

array(['Yes', 'No', ''], dtype=object)

In [17]:
# BMI Decimal Change
pandas_df['_BMI5'] = pandas_df['_BMI5'] / 100.0
pandas_df['_BMI5'] = round(pandas_df['_BMI5'], 2)

In [18]:
# Check unique values for SMOKE100 
pandas_df.SMOKE100.unique()

array([1., 2., 7., 9.])

In [19]:
# Replace values based on responses in survey for SMOKE100
pandas_df = pandas_df.replace({"SMOKE100":{1.:"Yes",
                                           2.:"No",
                                           7.:"",
                                           9.:""}})
pandas_df.SMOKE100.unique()

array(['Yes', 'No', ''], dtype=object)

In [20]:
# Check unique values for _RFBING5
pandas_df._RFBING5.unique()

array([1., 2., 9.])

In [21]:
# Replace values based on responses in survey for _RFBING5
pandas_df = pandas_df.replace({"_RFBING5":{1.:"Yes",
                                           2.:"No",
                                           9.:""}})
pandas_df._RFBING5.unique()

array(['Yes', 'No', ''], dtype=object)

In [22]:
# Check unique values for EXERANY2
pandas_df.EXERANY2.unique()

array([2., 1., 7., 9.])

In [23]:
# Replace values based on responses in survey for EXERANY2
pandas_df = pandas_df.replace({"EXERANY2":{1.:"Yes",
                                           2.:"No",
                                           7.:"",
                                           9.:""}})
pandas_df.EXERANY2.unique()

array(['No', 'Yes', ''], dtype=object)

In [24]:
# Check unique values for DIABETE4
pandas_df.DIABETE4.unique()

array([3., 1., 4., 2., 7., 9.])

In [25]:
# Replace values based on responses for DIABETE4
pandas_df = pandas_df.replace({"DIABETE4":{1.: "Yes", 
                                           2.: "Yes", 
                                           3.: "No",
                                           4.: "No", 
                                           7.: "", 
                                           9.:""}})


In [26]:
# Check unique values for FRUIT2
pandas_df.FRUIT2.unique()

array([101., 203., 202., 312., 204., 205., 302., 102., 301., 330., 555.,
       307., 300., 201., 305., 303., 306., 103., 777., 207., 308., 220.,
       206., 315., 310., 105., 304., 235., 320., 325., 314., 104., 106.,
       208., 107., 316., 309., 317., 390., 360., 999., 345., 210., 340.,
       221., 215., 110., 130., 399., 230., 199., 299., 335., 324., 214.,
       212., 328., 125., 327., 114., 336., 250., 218., 108., 209., 326.,
       350., 323., 318., 321., 342., 322., 115., 120., 109., 118., 123.,
       180., 216., 160., 150., 311., 113., 380., 192., 225., 228., 329.,
       332., 370., 240., 122., 294., 211., 112., 339., 375., 331., 392.,
       363., 127., 111., 333., 121., 384., 226., 131., 313., 241., 134.,
       344., 222., 116., 365., 355., 224., 338., 232., 397., 124., 227.,
       319., 188., 213., 223., 334., 293., 190., 193., 166., 398., 337.,
       217., 260., 396., 119., 117., 198., 341., 275., 261., 128., 191.,
       348., 139., 298., 255., 133., 189., 140., 17

In [27]:
import numpy as np
pandas_df = pandas_df.replace({"FRUIT2":{300.:500.}})
pandas_df['FRUIT2'] = np.where(pandas_df['FRUIT2'].between(0,400), "Yes", pandas_df['FRUIT2'])
pandas_df = pandas_df.replace({"FRUIT2":{'500.0': "No", '555.0': "No", '777.0': "", '999.0': ""}})
pandas_df.FRUIT2.unique()

array(['Yes', 'No', ''], dtype=object)

In [28]:
# Check unique values for VEGETEB2
pandas_df.VEGETAB2.unique()

array([101., 203., 205., 201., 202., 204., 303., 207., 330., 306., 102.,
       105., 555., 777., 307., 301., 310., 320., 315., 225., 329., 104.,
       304., 302., 206., 103., 214., 325., 312., 316., 300., 210., 308.,
       328., 305., 314., 360., 209., 107., 327., 212., 333., 318., 324.,
       365., 326., 317., 340., 999., 311., 309., 323., 220., 321., 106.,
       313., 215., 199., 345., 128., 399., 355., 130., 350., 125., 115.,
       356., 223., 114., 112., 332., 208., 331., 109., 322., 335., 250.,
       299., 375., 150., 390., 119., 230., 108., 120., 111., 110., 160.,
       240., 175., 354., 319., 218., 348., 191., 370., 233., 217., 228.,
       221., 127., 245., 129., 380., 165., 211., 260., 124., 123., 339.,
       140., 338., 222., 227., 121., 292., 336., 192., 280., 295., 213.,
       359., 226., 358., 216., 362., 342., 393., 134., 189., 133., 382.,
       122., 231., 166., 275., 116., 139., 131., 352., 294., 180., 117.,
       177., 395., 269., 296., 171., 118., 337., 35

In [29]:
#VEGETEB2!!!! This needs to be updated - see above for values
pandas_df = pandas_df.replace({"VEGETAB2":{300.:500.}})
pandas_df['VEGETAB2'] = np.where(pandas_df['VEGETAB2'].between(0,400), "Yes", pandas_df['VEGETAB2'])
pandas_df = pandas_df.replace({"VEGETAB2":{'500.0': "No", '555.0': "No", '777.0': "", '999.0': ""}})
pandas_df.VEGETAB2.unique()

array(['Yes', 'No', ''], dtype=object)

In [30]:
# Check unique values for MEDCOST1
pandas_df.MEDCOST1.unique()

array([2., 1., 7., 9.])

In [31]:
#MEDCOST1	updates with survey results
pandas_df = pandas_df.replace({"MEDCOST1":{1.:"Yes", 2.:"No", 7.:"", 9.:""}})
pandas_df.MEDCOST1.unique()

array(['No', 'Yes', ''], dtype=object)

In [32]:
# Check unique values for _SEX
pandas_df._SEX.unique()

array([2., 1.])

In [33]:
#_SEX
pandas_df = pandas_df.replace({"_SEX":{1.:"Male", 2.:"Female"}})

In [34]:
# Check unique values for BPHIGH6
pandas_df.BPHIGH6.unique()

array([3., 1., 4., 2., 7., 9.])

In [35]:
# Replace values based on responses for BPHIGH6
pandas_df = pandas_df.replace({"BPHIGH6": {1.: "Yes",
                                           2.: "Yes",
                                           3.: "No",
                                           4.: "No", 
                                           7.: "", 
                                           9.:""}})

In [36]:
pandas_df

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,11.0,No,Yes,Yes,"$25,000 to < $35,000",No,Female
2,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,11.0,No,Yes,Yes,"$15,000 to < $25,000",No,Female
3,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,9.0,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
4,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,12.0,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
5,No,White,No,No,24.37,Yes,Yes,Some College,Good,13.0,No,Yes,Yes,"$35,000 to < $50,000",No,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438687,Yes,Hispanic,Yes,Yes,21.48,No,No,Elementary,Fair,10.0,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
438688,Yes,,No,Yes,24.69,Yes,Yes,High School Grad,Very Good,3.0,Yes,Yes,Yes,"$25,000 to < $35,000",Yes,Female
438690,No,Black,Yes,No,30.68,No,Yes,College Grad,Very Good,7.0,Yes,Yes,Yes,"$100,000 to < $200,000",No,Male
438691,No,,No,Yes,23.73,No,Yes,High School Grad,Very Good,10.0,Yes,Yes,Yes,"$35,000 to < $50,000",No,Male


In [37]:
pandas_df = pandas_df.replace({"_AGEG5YR": {1.: "Age 18 to 24",
                                            2.: "Age 25 to 29 ",
                                            3.: "Age 30 to 34 ",
                                            4.: "Age 35 to 39 ",
                                            5.: "Age 40 to 44",
                                            6.: "Age 45 to 49",
                                            7.: "Age 50 to 54",
                                            8.: "Age 55 to 59 ",
                                            9.: "Age 60 to 64 ",
                                            10.: "Age 65 to 69 ",
                                            11.: "Age 70 to 74 ",
                                            12.: "Age 75 to 79",
                                            13.: "Age 80 or older",
                                            14.: ""}})

In [38]:
pandas_df=pandas_df.mask(pandas_df == "")

In [39]:
pandas_df.isna().sum()

DIABETE4      402
_RACE        5867
TOLDHI3      2425
BPHIGH6       842
_BMI5           0
SMOKE100     1814
_RFBING5     5305
EDUCA         689
GENHLTH       611
_AGEG5YR     3690
EXERANY2      477
FRUIT2       5299
VEGETAB2     4572
_INCOMG1    51064
MEDCOST1      660
_SEX            0
dtype: int64

In [40]:
pandas_df = pandas_df.dropna()

In [41]:
pandas_df.reset_index(drop=True)

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
1,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
2,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
3,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
4,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250523,No,Black,Yes,Yes,20.98,No,Yes,High School Grad,Good,Age 55 to 59,Yes,Yes,Yes,"$25,000 to < $35,000",No,Male
250524,No,Black,No,No,29.05,No,Yes,College Grad,Very Good,Age 25 to 29,Yes,Yes,Yes,"> $200,000",No,Male
250525,Yes,Hispanic,Yes,Yes,21.48,No,No,Elementary,Fair,Age 65 to 69,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
250526,No,Black,Yes,No,30.68,No,Yes,College Grad,Very Good,Age 50 to 54,Yes,Yes,Yes,"$100,000 to < $200,000",No,Male


In [42]:
# Identify outliers in BMI data, create summary statistics to identify quartiles
# urban_drivers = urban_cities_df['driver_count']
bmi_df = pandas_df["_BMI5"]
bmi_df

0         14.54
2         28.29
3         33.47
4         28.73
5         24.37
          ...  
438685    20.98
438686    29.05
438687    21.48
438690    30.68
438692    31.71
Name: _BMI5, Length: 250528, dtype: float64

In [43]:
#get summary statistics
bmi_df.describe()

count    250528.000000
mean         28.978447
std           6.574445
min          12.020000
25%          24.410000
50%          27.890000
75%          32.190000
max          99.330000
Name: _BMI5, dtype: float64

In [44]:
bmi_q1 = bmi_df.quantile(0.25)
bmi_q3 = bmi_df.quantile(0.75)
bmi_iqr = bmi_q3 - bmi_q1

In [45]:
filtered_bmi_df = pandas_df.loc[(pandas_df["_BMI5"] > (bmi_q1 - (1.5 * bmi_iqr))) & (pandas_df["_BMI5"] < (bmi_q3 + (1.5 * bmi_iqr)))]
filtered_bmi_df

Unnamed: 0,DIABETE4,_RACE,TOLDHI3,BPHIGH6,_BMI5,SMOKE100,_RFBING5,EDUCA,GENHLTH,_AGEG5YR,EXERANY2,FRUIT2,VEGETAB2,_INCOMG1,MEDCOST1,_SEX
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
2,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
3,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
4,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
5,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438685,No,Black,Yes,Yes,20.98,No,Yes,High School Grad,Good,Age 55 to 59,Yes,Yes,Yes,"$25,000 to < $35,000",No,Male
438686,No,Black,No,No,29.05,No,Yes,College Grad,Very Good,Age 25 to 29,Yes,Yes,Yes,"> $200,000",No,Male
438687,Yes,Hispanic,Yes,Yes,21.48,No,No,Elementary,Fair,Age 65 to 69,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
438690,No,Black,Yes,No,30.68,No,Yes,College Grad,Very Good,Age 50 to 54,Yes,Yes,Yes,"$100,000 to < $200,000",No,Male


In [46]:
filtered_bmi_df.rename(columns={"DIABETE4": "Diabetes_Status", 
                                "_RACE": "Ethnicity", 
                                "TOLDHI3": "High_Cholesterol", 
                                "BPHIGH6": "High_Blood_Pressure",
                                "_BMI5": "BMI", 
                                "SMOKE100": "Smoked_Cigarettes",
                                "_RFBING5": "Alcohol_Use_30_Days",
                                "EDUCA": "Education", 
                                "GENHLTH": "Gen_Health",
                                "_AGEG5YR": "Age_Group",
                                "EXERANY2": "Exercise_Last_30_Days", 
                                "FRUIT2": "Fruit_Consumption", 
                                "VEGETAB2": "Vegetable_Consumption",
                                "_INCOMG1": "Income", 
                                "MEDCOST1": "Unable_to_see_doctor",
                                "_SEX": "Gender"}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_bmi_df.rename(columns={"DIABETE4": "Diabetes_Status",


In [47]:
filtered_bmi_df

Unnamed: 0,Diabetes_Status,Ethnicity,High_Cholesterol,High_Blood_Pressure,BMI,Smoked_Cigarettes,Alcohol_Use_30_Days,Education,Gen_Health,Age_Group,Exercise_Last_30_Days,Fruit_Consumption,Vegetable_Consumption,Income,Unable_to_see_doctor,Gender
0,No,White,Yes,No,14.54,Yes,Yes,High School Grad,Poor,Age 70 to 74,No,Yes,Yes,"$25,000 to < $35,000",No,Female
2,Yes,Black,No,Yes,28.29,No,Yes,High School Grad,Very Good,Age 70 to 74,No,Yes,Yes,"$15,000 to < $25,000",No,Female
3,Yes,White,Yes,Yes,33.47,No,No,High School Grad,Very Good,Age 60 to 64,Yes,Yes,Yes,"$50,000 to < $100,000",No,Female
4,Yes,Multiracial,Yes,No,28.73,No,Yes,Some High School,Poor,Age 75 to 79,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
5,No,White,No,No,24.37,Yes,Yes,Some College,Good,Age 80 or older,No,Yes,Yes,"$35,000 to < $50,000",No,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
438685,No,Black,Yes,Yes,20.98,No,Yes,High School Grad,Good,Age 55 to 59,Yes,Yes,Yes,"$25,000 to < $35,000",No,Male
438686,No,Black,No,No,29.05,No,Yes,College Grad,Very Good,Age 25 to 29,Yes,Yes,Yes,"> $200,000",No,Male
438687,Yes,Hispanic,Yes,Yes,21.48,No,No,Elementary,Fair,Age 65 to 69,Yes,Yes,Yes,"$15,000 to < $25,000",No,Male
438690,No,Black,Yes,No,30.68,No,Yes,College Grad,Very Good,Age 50 to 54,Yes,Yes,Yes,"$100,000 to < $200,000",No,Male


In [48]:
# check summary statistics
filtered_bmi_df.describe()

Unnamed: 0,BMI
count,242802.0
mean,28.324162
std,5.420467
min,12.75
25%,24.39
50%,27.48
75%,31.64
max,43.85


In [49]:
filtered_bmi_df.dtypes

Diabetes_Status           object
Ethnicity                 object
High_Cholesterol          object
High_Blood_Pressure       object
BMI                      float64
Smoked_Cigarettes         object
Alcohol_Use_30_Days       object
Education                 object
Gen_Health                object
Age_Group                 object
Exercise_Last_30_Days     object
Fruit_Consumption         object
Vegetable_Consumption     object
Income                    object
Unable_to_see_doctor      object
Gender                    object
dtype: object

In [56]:
# Create second dataframe
visualization_df = filtered_bmi_df.drop(['Alcohol_Use_30_Days', 'Education', 'Gen_Health', 'Exercise_Last_30_Days', 'Fruit_Consumption', 'Vegetable_Consumption', 'Unable_to_see_doctor'], 1)


  visualization_df = filtered_bmi_df.drop(['Alcohol_Use_30_Days', 'Education', 'Gen_Health', 'Exercise_Last_30_Days', 'Fruit_Consumption', 'Vegetable_Consumption', 'Unable_to_see_doctor'], 1)


In [57]:
visualization_df.head()

Unnamed: 0,Diabetes_Status,Ethnicity,High_Cholesterol,High_Blood_Pressure,BMI,Smoked_Cigarettes,Age_Group,Income,Gender
0,No,White,Yes,No,14.54,Yes,Age 70 to 74,"$25,000 to < $35,000",Female
2,Yes,Black,No,Yes,28.29,No,Age 70 to 74,"$15,000 to < $25,000",Female
3,Yes,White,Yes,Yes,33.47,No,Age 60 to 64,"$50,000 to < $100,000",Female
4,Yes,Multiracial,Yes,No,28.73,No,Age 75 to 79,"$15,000 to < $25,000",Male
5,No,White,No,No,24.37,Yes,Age 80 or older,"$35,000 to < $50,000",Male


In [58]:
# convert dataframes to pyspark to load into AWS
filtered_bmi_df_py = spark.createDataFrame(filtered_bmi_df)
visualization_df_py = spark.createDataFrame(visualization_df)

  for column, series in pdf.iteritems():


In [59]:
# Configure settings for RDS
#from getpass import getpass
mode = "append"
jdbc_url='jdbc:postgresql://database-2.ch7jmx5vb0uq.us-east-2.rds.amazonaws.com:5432/team_final_project'
config = {"user":"postgres",
          "password": "Quizab56()",
          "driver":"org.postgresql.Driver"}

In [60]:
# Write review_id_df to table in RDS
filtered_bmi_df_py.write.jdbc(url=jdbc_url, table='filtered_bmi_df', mode=mode, properties=config)

In [61]:
# Write review_id_df to table in RDS
visualization_df_py.write.jdbc(url=jdbc_url, table='visualization_df', mode=mode, properties=config)