Oral Insulin Phase II: Clinical Trial



In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

## Gather

In [2]:
# Read datasets
patients = pd.read_csv('../input/phase-ii-oral-insulin/patients.csv')
treatments = pd.read_csv('../input/phase-ii-oral-insulin/treatments.csv')
adverse_reactions = pd.read_csv('../input/phase-ii-oral-insulin/adverse_reactions.csv')
treatments_cut = pd.read_csv('../input/phase-ii-oral-insulin/treatments_cut.csv')

In [None]:
# Confirm if the datasets have downloaded successfully
patients

In [None]:
treatments

In [None]:
treatments.info()

In [None]:
treatments[treatments.hba1c_change.isnull()]

In [None]:
treatments_cut

In [None]:
adverse_reactions

## Assess

> Programmatically and visually assess the three tables

In [None]:
patients.info()

In [None]:
patients

In [None]:
patients.describe()

In [None]:
plt.hist(patients['weight']);

# Investigate how many invalid values are there
patients.weight.sort_values()

#### Quality
##### `patients` table
- Errorneous datatypes (assigned_sex, state, zip_code, birthdate) columns
- Some `zip_code` has only 4 digits 
- Inconsistencies in `state` where some entries are abbreviation and others are full name
- min() height is 27 which is invalid
- min() weight 48.8kgs --> lbs 
- Typo Dsvid Gustafsson
- Missing demographic information in `address`, `contact` columns which cannot be arbitrarily filled
- Mulpitle phone number formats (e.g. +12, (xxx), xxx)
- Default John Doe data

#### `treatments` table
- Missing records: should be 280 --> 350
- Missing hba1c changes
- 'u' in start & end dose for auralin and novodra
- Lowercase given_names and surname (inconsistent with other tables)
- Erroneous datatypes (`auralin` and `novodra`)
- Inaccurate hba1c change: 9s (invalidly high change) --> 4s
- Dashes (-) representing nulls in `auralin`, `novodra`
 

#### Tidiness
##### `patients` table
- `contact` column in `patients` table contains two pieces of info and should be split into two columns (`email`, `phone_number`)
- Three variables are included in two columns in `treatments` table (treatment, start dose, end dose)
- `adverse_reactions` table is non-necessary and should be merged into `treatments` table
- Duplicated columns (`given_name`, `surname`) in `patients` table and `treatments` table

## Clean
> Clean for missing values and structural issues (tidiness) first and then move on to data quality issues

In [13]:
# Before any cleaning, make copies of dataset
patients_clean = patients.copy()
treatments_clean = treatments.copy()
reactions_clean = adverse_reactions.copy()

#### Clean 1
##### Define
* Concatenate `treatment_cut` into `treatment` table to include missing entries and make it 280-->350 entries: .concat()
* Calculate missing and inaccurate hba1c (e.g. 9s) in `hba1c_change` column: `hba1c_start`-`hba1c_end`

In [None]:
# Before concatenating, check if the columns in two dataframes match
treatments_clean.columns == treatments_cut.columns

In [14]:
# See the number of duplicates patients to determine if the two tables are seperate record of patients
# If there is high number of duplicates, it indicates the records are duplicated

full_name1 = pd.DataFrame(treatments['given_name'] + ' ' + treatments['surname'])
full_name2 = pd.DataFrame(treatments_cut['given_name'] + ' ' + treatments_cut['surname'])

full_name = pd.concat([full_name1, full_name2], ignore_index=True)

full_name[full_name.duplicated()]

# Since there is only 1 duplicate, it is highly certain that the two tables are not duplicated record of patients.

Unnamed: 0,0
136,joseph day


In [16]:
# Concatenate the tables
treatments_clean = pd.concat([treatments_clean, treatments_cut], ignore_index=True)

In [17]:
# Calculate hba1c change
treatments_clean.hba1c_change = treatments_clean.hba1c_start - treatments_clean.hba1c_end

In [18]:
# Check if concatenation and calculation is successful
treatments_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 420 entries, 0 to 419
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   given_name    420 non-null    object 
 1   surname       420 non-null    object 
 2   auralin       420 non-null    object 
 3   novodra       420 non-null    object 
 4   hba1c_start   420 non-null    float64
 5   hba1c_end     420 non-null    float64
 6   hba1c_change  420 non-null    float64
dtypes: float64(3), object(4)
memory usage: 23.1+ KB


#### Clean 2
##### Define
* Split the `contact` column in `patients_clean` table by extracting 'email' and 'phone number':
    * .str.extract() / .split(expand=True) / .drop()

In [20]:
# Assess the messy contact details
patients_clean.contact.sample(50)

122                   TaHaBoi@superrito.com231-607-3625
422             641-475-9654GyorfyJazmin@jourrapide.com
341     FatimahAqilahKhoury@superrito.com1 949 290 0728
358         AmalieJChristensen@einrot.com1 203 235 1076
4                       334-515-7487TimNeudorf@cuvox.de
314                908-884-4247RenzoLucchese@dayrep.com
110       MiroslavStepanek@teleworm.us+1 (214) 637-0855
425                908-751-4255TaylahMobourne@rhyta.com
297              724-759-0310CsonkaBodor@jourrapide.com
238           AkselHVestergaard@armyspy.com215-528-2193
193                 BornaSlezinger@cuvox.de203-933-3979
239              228-378-1355KhalidJohnsrud@teleworm.us
447           708-691-2195LuizCavalcantiGomes@rhyta.com
329         HerczeghCsilla@jourrapide.com1 813 434 8122
28                 RobertWolf@fleckens.hu1 267 895 7462
231            StefanieHerman@fleckens.hu1 252 583 5410
260             EmilyNHenriksen@armyspy.com925-757-6139
323                  PolakNancsi@cuvox.de1 513 4

In [None]:
# Extract the two different information
patients_clean['phone_number'] = patients_clean.contact.str.extract('((?:\+?\d{1,2}[\s.-])?
                                                                    \(?\d{3}\)?[\s.-]?
                                                                    \d{3}[\s.-]?\d{4})', expand=True)
patients_clean['email'] = patients_clean.contact.str.extract('([a-zA-Z][a-zA-Z0-9_.+-]+@
                                                             [a-zA-Z0-9-]+\.[a-zA-Z0-9-.][a-zA-Z]+)', expand=True)

# Drop the `contact` column
patients_clean.drop(columns = 'contact', inplace=True)

In [None]:
patients_clean

#### Clean 3
##### Define
* Melt the `auralin` and `novodra` columns to a `treatment` and `dose` column: pd.melt()
* Then, split the `dose` column into `start_dose` and `end_dose` with '-' as a seperator: str.split()
* Drop the original dose column: .drop()

* Remove 'u' from start_dose and end_dose, and convert datatype to integer

In [None]:
# Melt the columns
treatments_clean = pd.melt(treatments_clean, id_vars=['given_name', 'surname', 'hba1c_start', 'hba1c_end', 'hba1c_change'], value_vars = ['auralin', 'novodra'], var_name ='treatment', value_name='dose')

In [None]:
# Drop rows with dose value of '-'
treatments_clean = treatments_clean[treatments_clean.dose != '-']

In [27]:
# Split into start / end dose columns
treatments_clean[['dose_start', 'dose_end']] = treatments_clean.dose.str.split(' - ', 1, expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [28]:
# Drop the original column
treatments_clean.drop('dose', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [31]:
treatments_clean.dose_start = treatments_clean.dose_start.str[:-1].astype(int)
treatments_clean.dose_end = treatments_clean.dose_end.str[:-1].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [37]:
# Confirm
treatments_clean.dtypes

given_name       object
surname          object
hba1c_start      object
hba1c_end        object
hba1c_change    float64
treatment        object
dose_start        int64
dose_end          int64
dtype: object

#### Clean 4
##### Define
* Merge the `adverse_reactions` table to the `treatments` table: pd.merge()

In [40]:
# Merge two tables 
pd.merge(treatments_clean, reactions_clean, 
         on=['given_name', 'surname'], how='left')

# Now that the 'adverse reaction' information is merged into the 
## treatment table, we no longer need the adverse reactions table

Unnamed: 0,given_name,surname,hba1c_start,hba1c_end,hba1c_change,treatment,dose_start,dose_end,adverse_reaction
0,veronika,jindrová,41u,48u,0.43,auralin,41,48,
1,skye,gormanston,33u,36u,0.35,auralin,33,36,
2,sophia,haugen,37u,42u,0.38,auralin,37,42,
3,eddie,archer,31u,38u,0.34,auralin,31,38,
4,asia,woźniak,30u,36u,0.39,auralin,30,36,
...,...,...,...,...,...,...,...,...,...
415,christopher,woodward,55u,51u,0.45,novodra,55,51,nausea
416,maret,sultygov,26u,23u,0.37,novodra,26,23,
417,lixue,hsueh,22u,23u,0.41,novodra,22,23,injection site discomfort
418,jakob,jakobsen,28u,26u,0.45,novodra,28,26,hypoglycemia


#### Clean 5
##### Define
* In order to remove duplicated columns `given_name` and `surname`, isolate the `patient_id` and names in the patients table and join with treatments table:
    * make sure to lowercase the names to match the treatments table: str.lower()
* Then, drop those name columns from the treatments table

In [None]:
# Isolate and lowercase
id_name = patients_clean[['patient_id', 'given_name', 'surname']]
id_name.given_name = id_name.given_name.str.lower()
id_name.surname = id_name.surname.str.lower()

id_name

In [None]:
# Merge the id_name table to treatments
treatments_clean = pd.merge(treatments_clean, id_name, on=['given_name', 'surname'])

# As we now have the identifier column `patient_id` in the table, drop the name columns 
treatments_clean.drop(['given_name', 'surname'], axis=1, inplace=True)

In [None]:
# Confirm: patient_id should only be the duplicate column
all_col = pd.Series(list(patients_clean) + list(treatments_clean))
all_col[all_col.duplicated()]

In [None]:
# Optional: save the dataframes
patients_clean.to_csv('patients_clean.csv', index=False)
treatments_clean.to_csv('treatments_clean.csv', index=False)

In [None]:
import pandas as pd, numpy as np

patients = pd.read_csv('patients_clean.csv')
treatments = pd.read_csv('treatments_clean.csv')

#### Quality
* Errorneous datatypes (assigned_sex, state, zip_code, birthdate) columns
* Some zip_code has only 4 digits
* Inconsistencies in state where some entries are abbreviation and others are full name
* min() height is 27 which is invalid
* min() weight 48.8kgs --> lbs
* Typo Dsvid Gustafsson
* Missing demographic information in address, contact columns which cannot be arbitrarily filled
* Mulpitle phone number formats (e.g. +12, (xxx), xxx)
* Default John Doe data
* Multiple records for Jakobsen, Gersten, Taylor

#### Clean
##### Define
* Convert dtypes: ['assigned_sex', 'state']: category / ['zip_code']: / 'birthdate': to_datetime

In [None]:
# assigned_sex and state
patients.assigned_sex = patients.assigned_sex.astype('category')
patients.state = patients.state.astype('category')

In [None]:
# Convert the zip code column's data type from a float to a string using astype, remove the '.0' using string slicing
patients.zip_code = patients.zip_code.astype(str).str[:-2]

# Then pad four digit zip codes with a leading 0
patients.zip_code = patients.zip_code.str.pad(5, side='left', fillchar='0') 

# Reconvert NaN entries that were converted to 0000n due to the code above
patients.zip_code = patients.zip_code.replace('0000n', np.nan)

In [None]:
# Convert birthdate
patients.birthdate = pd.to_datetime(patients['birthdate'])

In [None]:
# Confirm the changes
patients.dtypes

patients.tail()

#### Clean
##### Define
* Inconsistencies in state where some entries are abbreviation and others are full name

In [None]:
# Check the state entries to see what states are in full name
patients.state.value_counts()

# Create a dictionary for California, New York, Illinois, Florida, Nebraska
state_abbrev = {'California': 'CA', 'New York': 'NY',
         'Illinois': 'IL', 'Florida': 'FL',
         'Nebraska': 'NE'}

In [None]:
# Create a abbreviation function
def abbrev_state(patient):
    if patient['state'] in state_abbrev.keys():
        abbrev = state_abbrev[patient['state']]
        return abbrev
    else:
        return patient['state']

# Apply the function to the table
patients['state'] = patients.apply(abbrev_state, axis=1)

In [None]:
# Confirm
patients.state.value_counts()

* min() height is 27 which is invalid
* min() weight 48.8kgs --> lbs
* Typo Dsvid Gustafsson
* Missing demographic information in address, contact columns which cannot be arbitrarily filled (cannot clean yet)
* Default John Doe data
* Strip all " ", "-", "(", ")", and "+" and store each number without any formatting. Pad the phone number with a 1 if the length of the number is 10 digits (we want country code).

In [None]:
# Convert the invalid '27' height to 72
patients.height = patients.height.replace(27, 72)

In [None]:
# Convert '48.8kg' weight to lbs
# First, get the info about patient with minimum weight 48.8kg
patients[patients['weight'] == patients.weight.min()]

In [None]:
# mask
kg = patients.weight.min()
mask = patients.surname == 'Zaitseva'
column_name = 'weight'

# Calculate weight(kg)*2.20462 to get weight(lbs)
patients.loc[mask, column_name] = kg*2.20462

# Recalculate bmi with amended lbs weight
lbs = patients[patients.surname=='Zaitseva'].weight
height = patients[patients.surname=='Zaitseva'].height
column_name = 'bmi'
patients.loc[mask, column_name] = 703 * lbs / (height * height)

In [None]:
# Confirm
patients.query("surname == 'Zaitseva'")

In [None]:
mask

#### Clean
##### Define
* Make phone_number consistent by stripping all " ", "-", "(", ")", "+" 
* Store each number without any formatting
* Pad 10 digit phone_number with a 1 (the US country code) 

In [None]:
patients.phone_number.dtypes

In [None]:
patients.phone_number = patients.phone_number.str.replace(r'\D+','').str.pad(11, fillchar='1')

In [None]:
patients

#### Clean
##### Define
* Remove 'u' from `dose_start` and `dose_end`
* Then, convert the datatype to integer

In [None]:
treatments.dose_start = treatments.dose_start.str.strip('u').astype(int)
treatments.dose_end = treatments.dose_end.str.strip('u').astype(int)

In [None]:
treatments.head()

In [None]:
# Confirm
treatments.dtypes

#### Clean
##### Define
* Fix typo Dsvid Gustafsson --> David: .replace()

In [None]:
patients.given_name = patients.given_name.replace('Dsvid', 'David')

# Confirm
patients.query('surname == "Gustafsson"')

#### Clean
##### Define
* Eliminate the non-recoverable records of John Doe from the `patients` table

In [None]:
patients = patients[patients.surname != 'Doe']

# Confirm
patients.query('surname =="Doe"')

#### Clean
##### Define
* Remove multiple records

In [None]:
patients = patients[~((patients.address.duplicated()) & patients.address.notnull())]