In [23]:
import pandas as pd
import numpy as np

In [24]:
df = pd.read_csv("dataset.csv")

In [25]:
columns_to_drop = list(df.loc[:, "CVA_tenderness":"chief_complaint"].columns) + \
                  list(df.loc[:, "arrival":"VITAMINS"].columns) + \
                  ['lang', 'employStatus', 'insurance_status', 'disposition', 'dispo', 
                   'UTI_diag', 'split', 'alt_diag', 'maritalStatus', 'race']

# Drop all the columns at once, except Urinary_tract_infections
df = df.drop(columns=[col for col in columns_to_drop if col != "Urinary_tract_infections"])

In [26]:
df = df.drop(df[df['ua_clarity'] == 'not_reported'].index)
df = df.drop(df[df['ua_ph'] == 'not_reported'].index)
df['ua_urobili'] = df['ua_urobili'].replace('not_reported', 'negative')
df['ua_bacteria'] = df['ua_bacteria'].replace('none', 'negative')
df = df.replace('not_reported', 'negative')
df = df[df['age'] <= 60]
df.shape

(39491, 24)

In [27]:
columns_to_check = ['ua_blood', 'ua_clarity', 'ua_color', 'ua_epi', 'ua_glucose', 
                    'ua_ketones', 'ua_leuk', 'ua_nitrite', 'ua_ph', 'ua_protein', 
                    'ua_rbc', 'ua_wbc', 'ua_bili']

# Filter the DataFrame to exclude rows where any of the columns has the value 'other'
clean_df = df[~df[columns_to_check].isin(['other']).any(axis=1)]


clean_df.shape

(38913, 24)

In [28]:
# Binary Encoding for binary columns
binary_columns = {
    'UCX_abnormal': {'yes': 1, 'no': 0},
    'abxUTI': {'yes': 1, 'no': 0},
    'ua_urobili': {'positive': 1, 'negative': 0},
    'Urinary_tract_infections': {'Yes': 1, 'No': 0} 
}

for col, mapping in binary_columns.items():
    clean_df[col] = clean_df[col].map(mapping)

# Ordinal Encoding for columns with natural order
ordinal_mappings = {
    'ua_bacteria': ['negative', 'few', 'moderate', 'marked', 'many'],
    'ua_bili': ['negative', 'small', 'moderate', 'large'],
    'ua_blood': ['negative', 'small', 'moderate', 'large'],
    'ua_clarity': ['clear', 'not_clear'],
    'ua_color': ['colorless', 'yellow', 'amber', 'orange', 'red'],
    'ua_epi': ['negative', 'small', 'moderate', 'large'],
    'ua_glucose': ['negative', 'small', 'moderate', 'large'],
    'ua_ketones': ['negative', 'small', 'moderate', 'large', '4+'],
    'ua_leuk': ['negative', 'small', 'moderate', 'large'],
    'ua_nitrite': ['negative', 'positive', 'other'],
    # 'ua_ph': ['5.0', '5.5', '6.0', '6.5', '7.0', '7.5', '8.0', '8.5', '9.0'],
    'ua_protein': ['negative', 'small', 'moderate', 'large'],
    'ua_rbc': ['negative', 'small', 'moderate', 'large'],
    'ua_wbc': ['negative', 'small', 'moderate', 'large']

}

for col, order in ordinal_mappings.items():
    clean_df[col] = pd.Categorical(clean_df[col], categories=order, ordered=True).codes


# One-Hot Encoding for categorical columns without a natural order
one_hot_cols = ['ethnicity']

# Use pd.get_dummies to one-hot encode the specified columns
clean_df = pd.get_dummies(clean_df, columns=one_hot_cols)


clean_df['Female'] = clean_df['gender'].apply(lambda x: True if x == 'Female' else False)
clean_df = clean_df.drop(columns=['gender'])


clean_df.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[col] = clean_df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[col] = clean_df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  clean_df[col] = clean_df[col].map(mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,ID,PATID,UCX_abnormal,ua_bacteria,ua_bili,ua_blood,ua_clarity,ua_color,ua_epi,ua_glucose,...,ua_wbc,age,Urinary_tract_infections,abxUTI,ethnicity_Hispanic or Latino,ethnicity_Non-Hispanic,ethnicity_Patient Refused,ethnicity_Unknown,ethnicity_negative,Female
4,5,4,0,2,1,0,0,3,0,0,...,1,55,0,0,False,True,False,False,False,False
5,6,4,1,1,1,3,1,1,3,0,...,1,47,0,0,False,True,False,False,False,True
10,11,9,0,1,0,0,0,1,1,0,...,2,31,0,1,True,False,False,False,False,True
11,12,10,1,1,0,1,1,1,2,0,...,2,54,0,0,False,True,False,False,False,True
14,15,12,0,0,0,0,0,1,0,0,...,0,24,0,1,False,True,False,False,False,False
15,16,13,0,0,0,0,0,1,0,0,...,0,24,0,1,False,True,False,False,False,False
16,17,13,1,3,0,1,1,1,3,0,...,2,36,0,1,True,False,False,False,False,True
17,18,14,0,1,0,2,1,1,1,0,...,1,46,0,0,False,True,False,False,False,True
19,20,16,0,4,1,3,1,4,2,0,...,1,52,0,1,False,True,False,False,False,True
21,22,18,1,4,0,0,0,3,2,0,...,1,35,0,1,False,True,False,False,False,True


In [29]:
clean_df.to_csv('fullSet.csv', index=False)