 #                         Type 2 Diabetes Prediction Part 1 - Data Wrangling
===================================================================================================================

# Lectura del dataset <a class="anchor" id="1.5"></a>

In [13]:
# Load the raw survey extract and preview its first rows
import pandas as pd
raw_csv_path = '../data/interim/BFRSS2021.csv'
df = pd.read_csv(raw_csv_path, encoding='cp1252')
df.head()

Unnamed: 0.1,Unnamed: 0,X_STATE,FMONTH,IDATE,IMONTH,IDAY,IYEAR,DISPCODE,SEQNO,X_PSU,...,X_FRTRES1,X_VEGRES1,X_FRUTSU1,X_VEGESU1,X_FRTLT1A,X_VEGLT1A,X_FRT16A,X_VEG23A,X_FRUITE1,X_VEGETE1
0,1,1,1,1192021,1,19,2021,1100,2021000001,2021000001,...,1,1,100.0,214.0,1,1,1,1,0,0
1,2,1,1,1212021,1,21,2021,1100,2021000002,2021000002,...,1,1,100.0,128.0,1,1,1,1,0,0
2,3,1,1,1212021,1,21,2021,1100,2021000003,2021000003,...,1,1,100.0,71.0,1,2,1,1,0,0
3,4,1,1,1172021,1,17,2021,1100,2021000004,2021000004,...,1,1,114.0,165.0,1,1,1,1,0,0
4,5,1,1,1152021,1,15,2021,1100,2021000005,2021000005,...,1,1,100.0,258.0,1,1,1,1,0,0


# Wrangling del Dataset

Elegimos las variables de interés con base en el Codebook de BRFSS:

Codebook: https://www.cdc.gov/brfss/annual_data/2021/pdf/codebook21_llcp-v2-508.pdf

In [14]:
# Select the relevant columns from the raw frame and bind each one to a readable name.
# Every name below is a Series taken from `df`; later cells combine them into one table.

# --- Personal information ---
sex, age, race = df['SEXVAR'], df['X_AGEG5YR'], df['X_CRACE1']
height, weight = df['HEIGHT3'], df['WEIGHT2']
marital_status, metropolitan_status = df['MARITAL'], df['X_METSTAT']

# --- Education level ---
level_of_education_completed = df['X_EDUCAG']

# --- Economic situation ---
# income: "What is your annual household income from all sources?"
employed, income, own_or_rent_home = df['EMPLOY1'], df['INCOME3'], df['RENTHOM1']

# --- Lifestyle ---
# last_checkup: "About how long has it been since you last visited a doctor for a routine checkup?"
last_checkup = df['CHECKUP1']
phys_activity_past_30days, smoker = df['X_TOTINDA'], df['X_SMOKER3']
# Not selected (was commented out): hours_of_sleep_per_day from 'sleptim1'.

# --- Health status and diagnosed conditions ---
general_health, coronary_heart_disease = df['GENHLTH'], df['CVDCRHD4']
mental_health_past_30_days, phys_health_past_30_days = df['MENTHLTH'], df['PHYSHLTH']
kidney_disease, depression, blind = df['CHCKDNY2'], df['ADDEPEV3'], df['BLIND']
# Not selected (was commented out): health_special_equipment from 'useequip'
# ("Do you now have any health problem that requires you to use special equipment,
# such as a cane, a wheelchair, a special bed, or a special telephone?").
# cognitive_diff: "Because of a physical, mental, or emotional condition, do you have
# serious difficulty concentrating, remembering, or making decisions?"
cognitive_diff = df['DECIDE']
# The diabetes answer codes are collapsed later in this notebook.
diagnosed_diabetes = df['DIABETE4']
HTA, fruta_verdura = df['BPHIGH6'], df['FRUIT2']


# Arreglamos algunas variables

In [15]:
# Turn the special codes 7777 and 9999 into NaN in height and weight ahead of the BMI calculation
import numpy as np
hw_replace = dict.fromkeys((7777, 9999), np.nan)
metric = True      # read by the conversion loops below: enables their metric-coded branches
max_weight = 999   # the weight loop converts values below this from pounds
height, weight = height.replace(hw_replace), weight.replace(hw_replace)

In [16]:
# Convert the raw height codes to meters; values that make no sense become NaN.
def _height_code_to_meters(code, allow_metric=True):
    """Convert one raw height code to meters.

    Two encodings are recognised:

    * codes below 712 are read as ``FII``: a feet digit followed by two inch digits;
    * codes in ``[9000, 9999)`` are read as ``9MCC``: a meters digit followed by two
      centimeter digits. This branch is only used when ``allow_metric`` is true.

    Digits are extracted arithmetically (``divmod``) rather than by slicing ``str(code)``,
    so the result does not depend on whether the value arrives as ``506`` or ``506.0``.

    Returns NaN when the code is missing, below 1, has no feet digit, has more than
    12 inches, decodes to 0 meters, or matches neither encoding.
    """
    if np.isnan(code) or code < 1:  # a height of 0 is not real
        return np.nan

    if code < 712:
        feet, inches = divmod(code, 100)
        # feet < 1: the code is too short to carry a feet digit, so it cannot be split.
        # inches > 12: data-entry error, no one is 5' 13" tall.
        if feet < 1 or inches > 12:
            return np.nan
        return (inches + feet * 12) * 0.0254

    if allow_metric and 9000 <= code < 9999:
        whole_meters, centimeters = divmod(code - 9000, 100)
        meters = whole_meters + centimeters * 0.01
        return np.nan if meters == 0 else meters  # a height of 0 is not real

    return np.nan  # matches neither encoding


# `height` is read but not overwritten here; the Series cell below rebinds it.
new_height = [_height_code_to_meters(code, metric) for code in height.tolist()]

In [17]:
# Convert the raw weight codes to kilograms; values that make no sense become NaN.
def _weight_code_to_kg(code, allow_metric=True, pounds_limit=999):
    """Convert one raw weight code to kilograms.

    * codes in ``[10, pounds_limit)`` are pounds and are multiplied by 0.453592;
    * codes in ``[9000, 9999)`` are read as ``9KKK`` (kilograms after the leading 9),
      only when ``allow_metric`` is true.

    The metric branch requires the 9000 prefix, the same test the height conversion
    uses. Without it, any code from ``pounds_limit`` up to 8999 would be treated as
    metric and lose its first digit (1500 would become 500 kg, 999 would become 99 kg).

    Returns NaN when the code is missing, below 10 (in either unit), or matches
    neither encoding.
    """
    if np.isnan(code) or code < 10:  # no adult respondent should weigh under 10 lbs
        return np.nan

    if code < pounds_limit:
        return code * 0.453592

    if allow_metric and 9000 <= code < 9999:
        kg = float(code - 9000)
        # Same cutoff of 10 as for pounds, even though 10 lbs != 10 kg.
        return np.nan if kg < 10 else kg

    return np.nan  # matches neither encoding


# `weight` is read but not overwritten here; the Series cell below rebinds it.
new_weight = [_weight_code_to_kg(code, metric, max_weight) for code in weight.tolist()]

In [18]:
# BMI = weight (kg) / height (m) squared; values outside [10, 200] are replaced by NaN.
# A NaN height or weight yields a NaN ratio, which fails the range test and stays NaN.
raw_ratios = (
    weight_kg / (height_m * height_m)
    for height_m, weight_kg in zip(new_height, new_weight)
)
bmi = [ratio if 10 <= ratio <= 200 else np.nan for ratio in raw_ratios]

In [19]:
# Wrap the converted lists as Series so they can be added to the table.
# The lists were built in the row order of `df`, so they reuse df's index explicitly:
# pd.concat(axis=1) aligns on index labels, and a fresh 0..n-1 index would only line up
# with the other columns while `df` happens to carry the default index.
height = pd.Series(new_height, index=df.index)
weight = pd.Series(new_weight, index=df.index)
bmi = pd.Series(bmi, index=df.index)

Creamos el dataframe con las variables seleccionadas

In [20]:
# Assemble the analysis table from the selected variables.
# Each key is the output column name and each value is the Series stored under it,
# so a name and its data cannot drift apart.
df_diabetes = pd.concat(
    {
        'sex': sex,
        'age': age,
        'race': race,
        'HTA': HTA,
        'fruta_verdura': fruta_verdura,
        'marital_status': marital_status,
        'metropolitan_status': metropolitan_status,
        'height': height,
        'weight': weight,
        'bmi': bmi,
        'level_of_education_completed': level_of_education_completed,
        'employed': employed,
        'income': income,
        'own_or_rent_home': own_or_rent_home,
        'last_checkup': last_checkup,
        'phys_activity_past_30days': phys_activity_past_30days,
        'smoker': smoker,
        'general_health': general_health,
        'coronary_heart_disease': coronary_heart_disease,
        'mental_health_past_30_days': mental_health_past_30_days,
        'phys_health_past_30_days': phys_health_past_30_days,
        'kidney_disease': kidney_disease,
        'depression': depression,
        'blind': blind,
        'cognitive_diff': cognitive_diff,
        'diagnosed_diabetes': diagnosed_diabetes,
    },
    axis=1,
)

Chequeamos si todos los valores de columnas son relevantes

In [21]:
# Recode the answers treated as uninformative to one missing marker (-1), drop the
# youngest respondents, renumber the age groups, collapse the diabetes answer and
# keep only rows with a usable diabetes value.
# NOTE: this cell rebinds `df_diabetes`. Re-run the cell that builds the table before
# re-running this one, otherwise the age and diabetes recodes are applied twice.

MISSING = -1

# Age categories 1 and 2 are respondents under 30. That group probably has type 1
# diabetes, so it is removed to keep it from interfering with the type 2 analysis.
# .copy() makes the filtered frame independent, so the column assignments below do
# not raise SettingWithCopyWarning.
df_diabetes = df_diabetes.loc[~df_diabetes['age'].isin([1, 2])].copy()

# Codes replaced by the missing marker, per column (chosen from the codebook).
MISSING_CODES = {
    'HTA': [2, 4, 7, 9],
    'fruta_verdura': [777, 999],
    'race': [6, 7, 77, 99],
    'marital_status': [9],
    'level_of_education_completed': [9],
    'employed': [9],
    'income': [99, 77],
    'own_or_rent_home': [3, 7, 9],
    'last_checkup': [7, 9],
    'phys_activity_past_30days': [9],
    'smoker': [9],
    'general_health': [7, 9],
    'coronary_heart_disease': [7, 9],
    # NOTE(review): any other special code in the two *_past_30_days columns
    # (e.g. 88) is left unchanged. Confirm against the codebook whether it should
    # be recoded.
    'mental_health_past_30_days': [77, 99],
    'phys_health_past_30_days': [77, 99],
    'kidney_disease': [7, 9],
    'depression': [7, 9],
    'blind': [7, 9],
    'cognitive_diff': [7, 9],
}

# Columns whose valid codes are renumbered as well.
RECODES = {
    # The 11 remaining age groups become 0..10 ("30-34" is 0, "80 or older" is 10).
    # Code 14 is "Don't know/Refused/Missing" and becomes the missing marker.
    'age': {**{code: code - 3 for code in range(3, 14)}, 14: MISSING},
    # Original answers: 1 Yes; 2 Yes, but female told only during pregnancy; 3 No;
    # 4 No, pre-diabetes or borderline diabetes.
    # Answers 2 and 4 are merged into 1, answer 3 becomes 2, and 7/9 become missing.
    'diagnosed_diabetes': {2: 1, 4: 1, 3: 2, 7: MISSING, 9: MISSING},
}

for column, codes in MISSING_CODES.items():
    df_diabetes[column] = df_diabetes[column].replace(dict.fromkeys(codes, MISSING))

for column, mapping in RECODES.items():
    df_diabetes[column] = df_diabetes[column].replace(mapping)

# Remaining NaN values (including unparseable height/weight/bmi) also become the
# missing marker. Only rows whose diabetes answer is missing are dropped; -1 values
# in the other columns stay in the table.
df_diabetes = df_diabetes.fillna(MISSING)
df_diabetes = df_diabetes[df_diabetes['diagnosed_diabetes'] != MISSING]

==============================================================================================================================================

# Guardamos el dataset final <a class="anchor" id="3.2"></a>

In [22]:
# Persist the cleaned table as CSV (to_csv's default also writes the index as the first column)
processed_csv_path = "../data/processed/BRFSS2021T.csv"
df_diabetes.to_csv(processed_csv_path)