In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Import data

In [2]:
df = pd.read_csv('medical_examination.csv')
df.head(5)

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [3]:
df.columns

Index(['id', 'age', 'sex', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol',
       'gluc', 'smoke', 'alco', 'active', 'cardio'],
      dtype='object')

# Add 'overweight' column

In [4]:
df['height'] = df['height']/100
df['overweight'] = df['weight']/(df['height']*df['height'])
df.head(5)

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight
0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,21.96712
1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,34.927679
2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,23.507805
3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,28.710479
4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,23.011177


In [5]:
df['overweight'] = df['overweight'] > 25
df

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight
0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,False
1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,True
2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,False
3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,True
4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,1.68,76.0,120,80,1,1,1,0,1,0,True
69996,99995,22601,1,1.58,126.0,140,90,2,2,0,0,1,1,True
69997,99996,19066,2,1.83,105.0,180,90,3,1,0,1,0,1,True
69998,99998,22431,1,1.63,72.0,135,80,1,2,0,0,0,1,True


In [6]:
df['overweight'] = pd.get_dummies(df['overweight'])[True]
df

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight
0,0,18393,2,1.68,62.0,110,80,1,1,0,0,1,0,0
1,1,20228,1,1.56,85.0,140,90,3,1,0,0,1,1,1
2,2,18857,1,1.65,64.0,130,70,3,1,0,0,0,1,0
3,3,17623,2,1.69,82.0,150,100,1,1,0,0,1,1,1
4,4,17474,1,1.56,56.0,100,60,1,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,1.68,76.0,120,80,1,1,1,0,1,0,1
69996,99995,22601,1,1.58,126.0,140,90,2,2,0,0,1,1,1
69997,99996,19066,2,1.83,105.0,180,90,3,1,0,1,0,1,1
69998,99998,22431,1,1.63,72.0,135,80,1,2,0,0,0,1,1


# Normalize data by making 0 always good and 1 always bad.
# If the value of 'cholesterol' or 'gluc' is 1, make the value 0.
# If the value is more than 1, make the value 1.

In [7]:
df['cholesterol'] = df['cholesterol'] > 1
df['gluc'] = df['gluc'] > 1
df['cholesterol'] = pd.get_dummies(df['cholesterol'])[True]
df['gluc'] = pd.get_dummies(df['gluc'])[True]
df

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio,overweight
0,0,18393,2,1.68,62.0,110,80,0,0,0,0,1,0,0
1,1,20228,1,1.56,85.0,140,90,1,0,0,0,1,1,1
2,2,18857,1,1.65,64.0,130,70,1,0,0,0,0,1,0
3,3,17623,2,1.69,82.0,150,100,0,0,0,0,1,1,1
4,4,17474,1,1.56,56.0,100,60,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69995,99993,19240,2,1.68,76.0,120,80,0,0,1,0,1,0,1
69996,99995,22601,1,1.58,126.0,140,90,1,1,0,0,1,1,1
69997,99996,19066,2,1.83,105.0,180,90,1,0,0,1,0,1,1
69998,99998,22431,1,1.63,72.0,135,80,0,1,0,0,0,1,1


# Draw Categorical Plot

# Create DataFrame for cat plot using `pd.melt`
# using just the values from 'cholesterol', 'gluc', 'smoke', 'alco', 'active', and 'overweight'.

In [16]:
df_cat = df.melt(id_vars = ['cholesterol', 'gluc', 'smoke', 'alco', 'active','overweight'])
df_cat

Unnamed: 0,cholesterol,gluc,smoke,alco,active,overweight,variable,value
0,0,0,0,0,1,0,id,0.0
1,1,0,0,0,1,1,id,1.0
2,1,0,0,0,0,0,id,2.0
3,0,0,0,0,1,1,id,3.0
4,0,0,0,0,0,0,id,4.0
...,...,...,...,...,...,...,...,...
559995,0,0,1,0,1,1,cardio,0.0
559996,1,1,0,0,1,1,cardio,1.0
559997,1,0,0,1,0,1,cardio,1.0
559998,0,1,0,0,0,1,cardio,1.0


In [None]:
# Group and reformat the data to split it by 'cardio'. Show the counts of each feature.
# You will have to rename one of the columns for the catplot to work correctly.
df_cat = None

In [None]:
# Draw the catplot with 'sns.catplot()'



# Get the figure for the output
fig = None


# Do not modify the next two lines
fig.savefig('catplot.png')



# Draw Heat Map

# Clean the data
df_heat = None

# Calculate the correlation matrix
corr = None

# Generate a mask for the upper triangle
mask = None



# Set up the matplotlib figure
fig, ax = None

# Draw the heatmap with 'sns.heatmap()'



# Do not modify the next two lines
fig.savefig('heatmap.png')