In [13]:
import pandas as pd

# --- Load the FBC dataset and the MIMIC-III tables needed to derive age ---
fbc_df = pd.read_csv('final_cbc_diagnoses_dataset_with_labels1.csv')
patients_df = pd.read_csv('PATIENTS.csv')
admissions_df = pd.read_csv('ADMISSIONS.csv')

# --- Calculate age at FIRST admission: exactly one AGE value per subject ---
# Keeping one row per admission (as a plain patients x admissions merge does)
# would duplicate every FBC row once per admission in the merge below.
first_admit_df = (
    admissions_df[['subject_id', 'admittime']]
    .assign(admittime=lambda d: pd.to_datetime(d['admittime'], errors='coerce'))
    .groupby('subject_id', as_index=False)['admittime']
    .min()
)
birth_df = (
    patients_df[['subject_id', 'dob']]
    .assign(dob=lambda d: pd.to_datetime(d['dob'], errors='coerce'))
    .drop_duplicates(subset='subject_id')
)
patients_admissions_df = (
    pd.merge(birth_df, first_admit_df, on='subject_id')
    .dropna(subset=['dob', 'admittime'])  # unparseable dates cannot yield an age
)

admit = patients_admissions_df['admittime']
dob = patients_admissions_df['dob']
# Work on calendar components instead of subtracting timestamps: a ~300-year
# gap (see the cap below) would overflow a nanosecond-resolution Timedelta.
birthday_pending = (admit.dt.month < dob.dt.month) | (
    (admit.dt.month == dob.dt.month) & (admit.dt.day < dob.dt.day)
)
age = admit.dt.year - dob.dt.year - birthday_pending.astype(int)

# MIMIC-III de-identification shifts the DOB of patients older than 89 so they
# appear ~300 years old; report them as AGE_CAP instead of the artefact value.
AGE_CAP = 90
age = age.where(age < AGE_CAP, AGE_CAP)

patients_admissions_df = patients_admissions_df.assign(AGE=age)
age_df = patients_admissions_df[['subject_id', 'AGE']]

# --- Merge the DataFrames ---
# validate= raises if age_df ever contains a repeated subject_id, which is
# the condition that would silently multiply FBC rows.
merged_df = pd.merge(fbc_df, age_df, on='subject_id', how='left',
                     validate='many_to_one')

# --- Display and save the merged DataFrame ---
print(merged_df)
merged_df.to_excel('fbc_with_age.xlsx', index=False)
print("Data saved to fbc_with_age.xlsx")

      subject_id  Hemoglobin  Eosinophils  Lymphocytes  Monocytes  Basophils  \
0          10006         NaN          NaN          NaN        NaN        0.3   
1          10006         NaN          NaN          NaN        NaN        0.3   
2          10006         NaN          NaN          NaN        NaN        0.3   
3          10006         NaN          NaN          NaN        NaN        0.3   
4          10006         NaN          NaN          NaN        NaN        0.3   
...          ...         ...          ...          ...        ...        ...   
2967       44228    9.366667          NaN          NaN        NaN        0.1   
2968       44228    9.366667          NaN          NaN        NaN        0.1   
2969       44228    9.366667          NaN          NaN        NaN        0.1   
2970       44228    9.366667          NaN          NaN        NaN        0.1   
2971       44228    9.366667          NaN          NaN        NaN        0.1   

      Eosinophils.1  Hematocrit  Hemogl

In [17]:
import pandas as pd

# --- Load the FBC data produced by the age step ---
# The age step saves 'fbc_with_age.xlsx' (no CSV is written), so read that
# file directly instead of a hand-converted CSV copy.
fbc_df = pd.read_excel('fbc_with_age.xlsx')

# --- Load the necessary CSV files from MIMIC-III ---
labevents_df = pd.read_csv('LABEVENTS.csv')

# --- Extract Cholesterol Levels ---
# In MIMIC-III D_LABITEMS, itemid 50907 is "Cholesterol, Total";
# 50902 (used previously) is Chloride.
TOTAL_CHOLESTEROL_ITEMID = 50907
cholesterol_df = (
    labevents_df.loc[labevents_df['itemid'] == TOTAL_CHOLESTEROL_ITEMID,
                     ['subject_id', 'valuenum']]
    .rename(columns={'valuenum': 'CHOLESTEROL'})
    # One row per subject: the highest total cholesterol ever recorded.
    .groupby('subject_id', as_index=False)['CHOLESTEROL']
    .max()
)

# --- Merge the DataFrames ---
# cholesterol_df has unique subject_id values, so the left merge keeps the
# FBC row count unchanged; subjects without a measurement get NaN.
merged_df = pd.merge(fbc_df, cholesterol_df, on='subject_id', how='left')

# --- Display and save the merged DataFrame ---
print(merged_df)
merged_df.to_excel('fbc_with_cholesterol.xlsx', index=False)
print("Data saved to fbc_with_cholesterol.xlsx")

      subject_id  Hemoglobin  Eosinophils  Lymphocytes  Monocytes  Basophils  \
0          10006         NaN          NaN          NaN        NaN        0.3   
1          10006         NaN          NaN          NaN        NaN        0.3   
2          10006         NaN          NaN          NaN        NaN        0.3   
3          10006         NaN          NaN          NaN        NaN        0.3   
4          10006         NaN          NaN          NaN        NaN        0.3   
...          ...         ...          ...          ...        ...        ...   
2967       44228    9.366667          NaN          NaN        NaN        0.1   
2968       44228    9.366667          NaN          NaN        NaN        0.1   
2969       44228    9.366667          NaN          NaN        NaN        0.1   
2970       44228    9.366667          NaN          NaN        NaN        0.1   
2971       44228    9.366667          NaN          NaN        NaN        0.1   

      Eosinophils.1  Hematocrit  Hemogl

In [23]:
import pandas as pd

# --- Load the FBC data produced by the cholesterol step ---
# That step saves 'fbc_with_cholesterol.xlsx' (no CSV is written).
fbc_df = pd.read_excel('fbc_with_cholesterol.xlsx')

# --- Load the necessary CSV files from MIMIC-III ---
# Only three columns are used below; restricting the read avoids holding the
# free-text/value columns of this very large table in memory.
chartevents_df = pd.read_csv(
    'CHARTEVENTS.csv',
    usecols=['subject_id', 'itemid', 'valuenum'],
    dtype={'valuenum': float},
)

# --- Extract Blood Pressure ---
# Maps the chart itemids selected by this analysis to their output columns.
BP_ITEMIDS = {220179: 'SYSTOLIC_BP', 220180: 'DIASTOLIC_BP'}

# Per-subject maximum of each measurement. Aggregating per (subject, itemid)
# and pivoting gives the same result as pairing readings by charttime first,
# without the row multiplication an outer merge on timestamps can cause.
bp_df = (
    chartevents_df[chartevents_df['itemid'].isin(list(BP_ITEMIDS))]
    .groupby(['subject_id', 'itemid'])['valuenum']
    .max()
    .unstack('itemid')
    .reindex(columns=list(BP_ITEMIDS))  # keep both columns even if one itemid is absent
    .rename(columns=BP_ITEMIDS)
    .rename_axis(columns=None)
    .reset_index()
)

# --- Merge the DataFrames ---
# Drop any BP columns left over from a previous run so the merge never
# produces SYSTOLIC_BP_x / SYSTOLIC_BP_y duplicates.
fbc_df = fbc_df.drop(columns=list(BP_ITEMIDS.values()), errors='ignore')
merged_df = pd.merge(fbc_df, bp_df, on='subject_id', how='left')

# --- Display and save the merged DataFrame ---
print(merged_df)
merged_df.to_excel('fbc_with_blood_pressure1.xlsx', index=False)
print("Data saved to fbc_with_blood_pressure1.xlsx")

      subject_id  Hemoglobin  Eosinophils  Lymphocytes  Monocytes  Basophils  \
0          10006         NaN          NaN          NaN        NaN        0.3   
1          10006         NaN          NaN          NaN        NaN        0.3   
2          10006         NaN          NaN          NaN        NaN        0.3   
3          10006         NaN          NaN          NaN        NaN        0.3   
4          10006         NaN          NaN          NaN        NaN        0.3   
...          ...         ...          ...          ...        ...        ...   
2967       44228    9.366667          NaN          NaN        NaN        0.1   
2968       44228    9.366667          NaN          NaN        NaN        0.1   
2969       44228    9.366667          NaN          NaN        NaN        0.1   
2970       44228    9.366667          NaN          NaN        NaN        0.1   
2971       44228    9.366667          NaN          NaN        NaN        0.1   

      Eosinophils.1  Hematocrit  Hemogl

In [25]:
import pandas as pd

# --- Load the FBC data produced by the blood-pressure step ---
# That step saves 'fbc_with_blood_pressure1.xlsx' (no CSV is written).
fbc_df = pd.read_excel('fbc_with_blood_pressure1.xlsx')

# --- Load the necessary CSV files from MIMIC-III ---
labevents_df = pd.read_csv('LABEVENTS.csv')

# --- Extract Cholesterol Levels ---
# In MIMIC-III D_LABITEMS, itemid 50907 is "Cholesterol, Total";
# 50902 (used previously) is Chloride.
TOTAL_CHOLESTEROL_ITEMID = 50907
cholesterol_df = (
    labevents_df.loc[labevents_df['itemid'] == TOTAL_CHOLESTEROL_ITEMID,
                     ['subject_id', 'valuenum']]
    .rename(columns={'valuenum': 'CHOLESTEROL'})
    # One row per subject: the highest total cholesterol ever recorded.
    .groupby('subject_id', as_index=False)['CHOLESTEROL']
    .max()
)

# --- Merge the DataFrames ---
# The input may already carry a CHOLESTEROL column from an earlier step;
# drop it first so the merge yields a single CHOLESTEROL column rather than
# CHOLESTEROL_x / CHOLESTEROL_y.
fbc_df = fbc_df.drop(columns=['CHOLESTEROL'], errors='ignore')
merged_df = pd.merge(fbc_df, cholesterol_df, on='subject_id', how='left')

# --- Display and save the merged DataFrame ---
# NOTE(review): this overwrites 'fbc_with_cholesterol.xlsx', the same file
# name the earlier cholesterol step writes - confirm that is intended.
print(merged_df)
merged_df.to_excel('fbc_with_cholesterol.xlsx', index=False)
print("Data saved to fbc_with_cholesterol.xlsx")

      subject_id  Hemoglobin  Eosinophils  Lymphocytes  Monocytes  Basophils  \
0          10006         NaN          NaN          NaN        NaN        0.3   
1          10006         NaN          NaN          NaN        NaN        0.3   
2          10006         NaN          NaN          NaN        NaN        0.3   
3          10006         NaN          NaN          NaN        NaN        0.3   
4          10006         NaN          NaN          NaN        NaN        0.3   
...          ...         ...          ...          ...        ...        ...   
2967       44228    9.366667          NaN          NaN        NaN        0.1   
2968       44228    9.366667          NaN          NaN        NaN        0.1   
2969       44228    9.366667          NaN          NaN        NaN        0.1   
2970       44228    9.366667          NaN          NaN        NaN        0.1   
2971       44228    9.366667          NaN          NaN        NaN        0.1   

      Eosinophils.1  Hematocrit  Hemogl

In [29]:
import pandas as pd

# --- Load the FBC data produced by the blood-pressure step ---
# That step saves 'fbc_with_blood_pressure1.xlsx' (no CSV is written).
fbc_df = pd.read_excel('fbc_with_blood_pressure1.xlsx')

# --- Load the necessary CSV files from MIMIC-III ---
# Force icd9_code to str: the prefix test below needs string values, and
# numeric inference would strip leading zeros from the codes.
diagnoses_icd_df = pd.read_csv('DIAGNOSES_ICD.csv', dtype={'icd9_code': str})

# --- Extract Diabetes Information ---
# Select diagnoses whose code starts with '250' (missing codes count as no match).
is_diabetes = diagnoses_icd_df['icd9_code'].str.startswith('250', na=False)
diabetes_df = (
    diagnoses_icd_df.loc[is_diabetes, ['subject_id']]
    .drop_duplicates()       # one row per diabetic subject
    .assign(DIABETES=1)      # assign() avoids SettingWithCopyWarning on the slice
)

# --- Merge the DataFrames ---
# Drop a DIABETES column left by a previous run so the merge cannot create
# DIABETES_x / DIABETES_y.
fbc_df = fbc_df.drop(columns=['DIABETES'], errors='ignore')
merged_df = pd.merge(fbc_df, diabetes_df, on='subject_id', how='left')
# Subjects with no matching diagnosis get 0; keep the flag as an integer.
merged_df['DIABETES'] = merged_df['DIABETES'].fillna(0).astype(int)

# --- Display and save the merged DataFrame ---
print(merged_df)
merged_df.to_excel('fbc_with_diabetes.xlsx', index=False)
print("Data saved to fbc_with_diabetes.xlsx")

      subject_id  Hemoglobin  Eosinophils  Lymphocytes  Monocytes  Basophils  \
0          10006         NaN          NaN          NaN        NaN        0.3   
1          10006         NaN          NaN          NaN        NaN        0.3   
2          10006         NaN          NaN          NaN        NaN        0.3   
3          10006         NaN          NaN          NaN        NaN        0.3   
4          10006         NaN          NaN          NaN        NaN        0.3   
...          ...         ...          ...          ...        ...        ...   
2967       44228    9.366667          NaN          NaN        NaN        0.1   
2968       44228    9.366667          NaN          NaN        NaN        0.1   
2969       44228    9.366667          NaN          NaN        NaN        0.1   
2970       44228    9.366667          NaN          NaN        NaN        0.1   
2971       44228    9.366667          NaN          NaN        NaN        0.1   

      Eosinophils.1  Hematocrit  Hemogl

In [33]:
import pandas as pd

# Load CSV file
file_path = "dataset.csv"  # Change this to your actual file path
df = pd.read_csv(file_path)

# Handling missing values
# 'number' covers every numeric dtype (int32, float32, ...), not only the
# 64-bit ones, so no numeric column is silently left un-imputed.
# NOTE(review): identifier and 0/1 label columns are numeric as well and will
# be mean-filled if they contain NaN - confirm that is intended.
numeric_cols = df.select_dtypes(include='number').columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())  # Fill numerical columns with mean

categorical_cols = df.select_dtypes(include=['object']).columns
# mode() returns an empty frame when there are no object columns (or they are
# entirely NaN); indexing row 0 of it would raise IndexError, so guard first.
categorical_modes = df[categorical_cols].mode()
if not categorical_modes.empty:
    df[categorical_cols] = df[categorical_cols].fillna(categorical_modes.iloc[0])  # Fill categorical columns with mode

# Save preprocessed data
df.to_csv("preprocessed_data.csv", index=False)

print("Preprocessing completed. Preprocessed data saved as 'preprocessed_data.csv'.")


Preprocessing completed. Preprocessed data saved as 'preprocessed_data.csv'.


In [10]:
import pandas as pd

# Load CSV file
file_path = "dataset.csv"  # Update with your file path
output_path = "preprocessed_data1.csv"
df = pd.read_csv(file_path)

# Columns to remove from the dataset (each name listed once).
# NOTE(review): 'Heamoglobin' looks like a misspelling; with errors='ignore'
# a name that matches no column is skipped silently - confirm the intended column.
columns_to_drop = [
    'Eosinophils', 'Eosinophils.1', 'Eosinophils.2', 'Eosinophils.3',
    'Lymphocytes', 'Lymphocytes.1', 'Lymphocytes.2', 'Lymphocytes.3', 'Lymphocytes.4',
    'Monocytes', 'Monocytes.1', 'Monocytes.2', 'Monocytes.3',
    'Basophils', 'Neutrophils', 'White Blood Cells', 'Heamoglobin',
    'dod', 'dob', 'dod_hosp', 'dod_ssn', 'expire_flag',
]

# Report requested names that are absent so typos do not go unnoticed,
# while still dropping on a best-effort basis.
not_found = [col for col in columns_to_drop if col not in df.columns]
if not_found:
    print(f"Columns not found (skipped): {not_found}")

# Drop the columns, then remove duplicate rows.
df = df.drop(columns=columns_to_drop, errors='ignore').drop_duplicates()

# Handling missing values
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_cols] = df[numeric_cols].fillna(0)  # Missing numeric values become 0

categorical_cols = df.select_dtypes(include=['object']).columns
# mode() returns an empty frame when there are no object columns (or they are
# entirely NaN); indexing row 0 of it would raise IndexError, so guard first.
categorical_modes = df[categorical_cols].mode()
if not categorical_modes.empty:
    df[categorical_cols] = df[categorical_cols].fillna(categorical_modes.iloc[0])  # Fill categorical columns with mode

# Save preprocessed data
df.to_csv(output_path, index=False)

# Report the file that was actually written (the message previously named
# 'preprocessed_data.csv' while the data went to 'preprocessed_data1.csv').
print(f"Preprocessing completed. Preprocessed data saved as '{output_path}'.")


Preprocessing completed. Preprocessed data saved as 'preprocessed_data.csv'.
