In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import re

from sklearn.preprocessing import OneHotEncoder

# Problem 3. Data cleaning and preprocessing (1 point)
This should be self-explanatory. In an appropriate notebook, explore different ways to clean and preprocess the dataset.

This is still part of your research. That is, don't be afraid to _try out different approaches to the same problem_. E.g., if you have a lot of missing values, you may not know right away how to handle them. Experimenting with several approaches will give you a better indication what works well for your data and goals.

In [2]:
# Load the raw asthma dataset; the path is relative to this notebook's folder.
asthma_df = pd.read_csv(filepath_or_buffer="../data/asthma_disease_data.csv")

## Initial Cleaning

### Convert column names to `snake_case`

In [3]:
# Show the column names as they come from the CSV, before any renaming.
asthma_df.columns

Index(['PatientID', 'Age', 'Gender', 'Ethnicity', 'EducationLevel', 'BMI',
       'Smoking', 'PhysicalActivity', 'DietQuality', 'SleepQuality',
       'PollutionExposure', 'PollenExposure', 'DustExposure', 'PetAllergy',
       'FamilyHistoryAsthma', 'HistoryOfAllergies', 'Eczema', 'HayFever',
       'GastroesophagealReflux', 'LungFunctionFEV1', 'LungFunctionFVC',
       'Wheezing', 'ShortnessOfBreath', 'ChestTightness', 'Coughing',
       'NighttimeSymptoms', 'ExerciseInduced', 'Diagnosis', 'DoctorInCharge'],
      dtype='object')

In [4]:
def to_snake_case(name):
    """Convert a CamelCase / PascalCase identifier to ``snake_case``.

    Word boundaries are detected in two passes: first before a capitalised
    word (``FunctionFEV`` -> ``Function_FEV`` is handled by the second pass,
    ``LungFunction`` -> ``Lung_Function`` by the first), then between a
    lowercase letter or digit and a following uppercase letter. Runs of
    whitespace or hyphens are treated as word separators, and consecutive
    underscores are collapsed so that names which already contain an
    underscore (e.g. ``Family_History``) do not end up with a doubled one.

    Parameters
    ----------
    name : str
        The identifier to convert.

    Returns
    -------
    str
        The lowercase, underscore-separated form of ``name``.

    Examples
    --------
    >>> to_snake_case("PatientID")
    'patient_id'
    >>> to_snake_case("LungFunctionFEV1")
    'lung_function_fev1'
    >>> to_snake_case("Family_History")
    'family_history'
    """
    # Normalise explicit separators first so they are not kept verbatim.
    normalized = re.sub(r'[\s\-]+', '_', name.strip())
    # Underscore before a capitalised word that follows any character.
    s1 = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', normalized)
    # Underscore between a lowercase letter/digit and an uppercase letter
    # (splits acronyms such as "ID" or "FEV1" from the preceding word).
    s2 = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', s1)
    # The first pass inserts "_" even when one is already present; collapse
    # any resulting run of underscores to a single one.
    return re.sub(r'_+', '_', s2).lower()

In [5]:
# Rename every column by passing its label through to_snake_case.
asthma_df.columns = asthma_df.columns.map(to_snake_case)

In [6]:
# Check the column names after the snake_case conversion.
asthma_df.columns

Index(['patient_id', 'age', 'gender', 'ethnicity', 'education_level', 'bmi',
       'smoking', 'physical_activity', 'diet_quality', 'sleep_quality',
       'pollution_exposure', 'pollen_exposure', 'dust_exposure', 'pet_allergy',
       'family_history_asthma', 'history_of_allergies', 'eczema', 'hay_fever',
       'gastroesophageal_reflux', 'lung_function_fev1', 'lung_function_fvc',
       'wheezing', 'shortness_of_breath', 'chest_tightness', 'coughing',
       'nighttime_symptoms', 'exercise_induced', 'diagnosis',
       'doctor_in_charge'],
      dtype='object')

### Delete unnecessary column

> **Confidential Information**
> 
> * DoctorInCharge: This column contains confidential information about the doctor in charge, with "Dr_Confid" as the value for all patients.

Since all the values of this variable are the same, I don't need it. 

In [7]:
# The column holds one constant placeholder value, so it carries no information.
asthma_df = asthma_df.drop(columns=["doctor_in_charge"])

## Create a separate dataframe for visualization

For the purpose of the logistic regression, I keep the original binary variables as 1/0. For the purpose of visualization, I create a copy where I clean the categorical variables.

In [8]:
# Independent copy for visualization; edits to it leave asthma_df untouched.
asthma_vis = asthma_df.copy(deep=True)

### Transform categorical variables 

Initial Exploration (Problem 1) showed that many variables that should be categories are instead of type `int`. Most of them are binary but there are also variables that include more categories, like `ethnicity`. Let's convert them to categorical. Descriptions of variable mappings are found at the [data page](https://www.kaggle.com/datasets/rabieelkharoua/asthma-disease-dataset) 

I'll start with binary variables and follow-up with variables with 3+ categories. 

#### Gender

From description:

> **Gender**: Gender of the patients, where 0 represents Male and 1 represents Female.


In [9]:
# Decode gender codes (0 -> "male", 1 -> "female") and store as a categorical.
asthma_vis["gender"] = (
    asthma_vis["gender"]
    .map({0: "male", 1: "female"})
    .astype("category")
)

#### Binary Variables with Yes/No values

> **Lifestyle Factors**
> * Smoking: Smoking status, where 0 indicates No and 1 indicates Yes.
>
> **Environmental and Allergy Factors**
> * PetAllergy: Pet allergy status, where 0 indicates No and 1 indicates Yes.
>   
> **Medical History Variables**
> * FamilyHistoryAsthma: Family history of asthma, where 0 indicates No and 1 indicates Yes.
> * HistoryOfAllergies: History of allergies, where 0 indicates No and 1 indicates Yes.
> * Eczema: Presence of eczema, where 0 indicates No and 1 indicates Yes.
> * HayFever: Presence of hay fever, where 0 indicates No and 1 indicates Yes.
> * GastroesophagealReflux: Presence of gastroesophageal reflux, where 0 indicates No and 1 indicates Yes.
>  
> **Symptoms**
> * Wheezing: Presence of wheezing, where 0 indicates No and 1 indicates Yes.
> * ShortnessOfBreath: Presence of shortness of breath, where 0 indicates No and 1 indicates Yes.
> * ChestTightness: Presence of chest tightness, where 0 indicates No and 1 indicates Yes.
> * Coughing: Presence of coughing, where 0 indicates No and 1 indicates Yes.
> * NighttimeSymptoms: Presence of nighttime symptoms, where 0 indicates No and 1 indicates Yes.
> * ExerciseInduced: Presence of symptoms induced by exercise, where 0 indicates No and 1 indicates Yes.
> 
> **Diagnosis Information**
> * Diagnosis: Diagnosis status for Asthma, where 0 indicates No and 1 indicates Yes.

I'll convert all these to category, in one shot, with the exception of `diagnosis`, which is my outcome variable and is better kept as is. 

In [10]:
# Columns coded 0 = No / 1 = Yes that get relabelled for visualization.
# `diagnosis` is deliberately left out of this list.
binary_cols = [
    # lifestyle
    'smoking',
    # environmental and allergy factors
    'pet_allergy',
    # medical history
    'family_history_asthma',
    'history_of_allergies',
    'eczema',
    'hay_fever',
    'gastroesophageal_reflux',
    # symptoms
    'wheezing',
    'shortness_of_breath',
    'chest_tightness',
    'coughing',
    'nighttime_symptoms',
    'exercise_induced',
]

In [11]:
# Relabel each binary column (0 -> 'no', 1 -> 'yes') and store it as a categorical.
asthma_vis[binary_cols] = asthma_vis[binary_cols].apply(
    lambda flags: flags.map({0: 'no', 1: 'yes'}).astype('category')
)

In [12]:
# Confirm that gender and the binary columns are now of dtype `category`.
asthma_vis.dtypes

patient_id                    int64
age                           int64
gender                     category
ethnicity                     int64
education_level               int64
bmi                         float64
smoking                    category
physical_activity           float64
diet_quality                float64
sleep_quality               float64
pollution_exposure          float64
pollen_exposure             float64
dust_exposure               float64
pet_allergy                category
family_history_asthma      category
history_of_allergies       category
eczema                     category
hay_fever                  category
gastroesophageal_reflux    category
lung_function_fev1          float64
lung_function_fvc           float64
wheezing                   category
shortness_of_breath        category
chest_tightness            category
coughing                   category
nighttime_symptoms         category
exercise_induced           category
diagnosis                     int64
dtype: object

#### Ethnicity

> **Ethnicity**: The ethnicity of the patients, coded as follows:
> * 0: Caucasian
> * 1: African American
> * 2: Asian
> * 3: Other

In [13]:
# Replace the integer ethnicity codes with their labels and store as a categorical.
asthma_vis['ethnicity'] = (
    asthma_vis['ethnicity']
    .map({0: 'caucasian', 1: 'african american', 2: 'asian', 3: 'other'})
    .astype('category')
)

#### Education Level 

> **Education Level**: The education level of the patients, coded as follows:
> * 0: None
> * 1: High School
> * 2: Bachelor's
> * 3: Higher

I'm not sure here because this is an ordered variable.

In [14]:
# Education level is ordinal (none < high school < bachelors < higher), so it is
# stored as an *ordered* categorical. With a plain 'category' dtype the levels
# would be sorted alphabetically, which misorders them in sorting, grouping and
# plotting.
asthma_vis['education_level'] = asthma_vis['education_level'].map({
    0: 'none',
    1: 'high school',
    2: 'bachelors',
    3: 'higher'
}).astype(
    pd.CategoricalDtype(
        categories=['none', 'high school', 'bachelors', 'higher'],
        ordered=True,
    )
)

#### Patient ID

Convert to `object`.

In [15]:
# Store the patient identifier with dtype `object` instead of as an integer.
asthma_vis["patient_id"] = asthma_vis["patient_id"].astype(dtype='object')

In [16]:
# Final dtype check of the visualization frame before it is saved.
asthma_vis.dtypes

patient_id                   object
age                           int64
gender                     category
ethnicity                  category
education_level            category
bmi                         float64
smoking                    category
physical_activity           float64
diet_quality                float64
sleep_quality               float64
pollution_exposure          float64
pollen_exposure             float64
dust_exposure               float64
pet_allergy                category
family_history_asthma      category
history_of_allergies       category
eczema                     category
hay_fever                  category
gastroesophageal_reflux    category
lung_function_fev1          float64
lung_function_fvc           float64
wheezing                   category
shortness_of_breath        category
chest_tightness            category
coughing                   category
nighttime_symptoms         category
exercise_induced           category
diagnosis                     int64
dtype: object

### Save to csv

In [17]:
# Write the visualization frame without the row index. `index` is documented
# as a bool, so pass False explicitly instead of relying on None being falsy.
# Note: CSV stores plain text, so categorical dtypes are not preserved on reload.
asthma_vis.to_csv("../data/asthma_disease_data_vis.csv", index=False)

## Analysis Dataframe

### Remove `patient_id`, I think I don't need it for the analysis.

In [18]:
# Remove the patient identifier from the analysis frame.
asthma_df = asthma_df.drop(columns=["patient_id"])

### Convert multi-categorical variables to categories

In [19]:
# Replace the integer ethnicity codes with their labels and store as a categorical.
asthma_df['ethnicity'] = (
    asthma_df['ethnicity']
    .map({0: 'caucasian', 1: 'african american', 2: 'asian', 3: 'other'})
    .astype('category')
)

In [20]:
# Replace the integer education codes with their labels and store as a categorical.
asthma_df['education_level'] = (
    asthma_df['education_level']
    .map({0: 'none', 1: 'high school', 2: 'bachelors', 3: 'higher'})
    .astype('category')
)

### Do one-hot encoding of `ethnicity` and `education_level`

In [21]:
# One-hot encode the two multi-level categoricals as 0/1 integer columns.
# drop_first=True omits one level per variable to serve as the reference;
# in the resulting frame these are 'african american' and 'bachelors'.
asthma_df = pd.get_dummies(
    data=asthma_df,
    columns=['ethnicity', 'education_level'],
    drop_first=True,
    dtype=int,
)

In [22]:
# Display the encoded analysis frame to check the new dummy columns.
asthma_df

Unnamed: 0,age,gender,bmi,smoking,physical_activity,diet_quality,sleep_quality,pollution_exposure,pollen_exposure,dust_exposure,...,coughing,nighttime_symptoms,exercise_induced,diagnosis,ethnicity_asian,ethnicity_caucasian,ethnicity_other,education_level_high school,education_level_higher,education_level_none
0,63,0,15.848744,0,0.894448,5.488696,8.701003,7.388481,2.855578,0.974339,...,0,0,1,0,0,0,0,0,0,1
1,26,1,22.757042,0,5.897329,6.341014,5.153966,1.969838,7.457665,6.584631,...,1,1,1,0,1,0,0,0,0,0
2,57,0,18.395396,0,6.739367,9.196237,6.840647,1.460593,1.448189,5.445799,...,0,1,1,0,1,0,0,1,0,0
3,40,1,38.515278,0,1.404503,5.826532,4.253036,0.581905,7.571845,3.965316,...,1,1,0,0,1,0,0,1,0,0
4,61,0,19.283802,0,4.604493,3.127048,9.625799,0.980875,3.049807,8.260605,...,0,0,1,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2387,43,1,29.059613,0,3.019854,6.119637,8.300960,2.483829,7.314582,3.425445,...,0,0,1,1,0,1,0,0,0,0
2388,18,1,20.740850,0,5.805180,4.386992,7.731192,7.733983,2.279073,6.467701,...,1,1,0,1,0,1,0,1,0,0
2389,54,0,37.079560,0,4.735169,8.214064,7.483521,2.794847,3.055139,9.484013,...,1,0,1,1,0,0,1,0,0,0
2390,46,1,23.444712,0,9.672637,7.362861,6.717272,9.448862,7.712584,5.051405,...,0,1,1,0,0,1,0,0,0,0


### Save to csv

In [23]:
# Write the analysis frame without the row index. `index` is documented as a
# bool, so pass False explicitly instead of relying on None being falsy.
asthma_df.to_csv("../data/asthma_disease_data_analysis.csv", index=False)