In [None]:
# %% 

import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import random
random.seed(42)
from IPython.display import display, HTML
from scipy.stats import chi2_contingency

# Function to fill empty cells with the mode value
def fill_empty_cells(df):
    """Return a copy of ``df`` whose missing cells are filled with each column's mode.

    Empty strings are treated as missing values. The input frame is left
    untouched; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(filled, summary)`` where ``filled`` is the cleaned copy and
        ``summary`` has one row per column that contained missing cells, with
        the columns 'Column', 'Number Of Missing Cells',
        'Percentage Of Null Values', 'Missing Indices' and 'Fill Summary'.
        ``summary`` is empty (but keeps those columns) when nothing was missing.
    """
    summary_columns = ['Column', 'Number Of Missing Cells', 'Percentage Of Null Values',
                       'Missing Indices', 'Fill Summary']

    # replace() returns a new frame, so the caller's data is never modified.
    df_copy = df.replace('', np.nan)
    total_rows = len(df_copy)

    # Collect one record per affected column and build the summary once at the
    # end, instead of re-concatenating a DataFrame on every iteration.
    records = []

    for col in df_copy.columns:
        missing_mask = df_copy[col].isnull()
        null_indices = df_copy.index[missing_mask]
        if null_indices.size == 0:
            continue

        num_missing = len(null_indices)
        percentage_missing = (num_missing / total_rows) * 100

        modes = df_copy[col].mode()
        if modes.empty:
            # Every cell in the column is missing, so there is no mode to fill with.
            fill_summary = "no mode available (column is entirely empty); left unfilled"
        else:
            fill_value = modes.iloc[0]
            fill_summary = f"filled with mode: {fill_value}"
            # Assign the result back: calling fillna(..., inplace=True) on
            # df_copy[col] acts on a temporary object and does not update
            # df_copy when pandas Copy-on-Write is active.
            df_copy[col] = df_copy[col].fillna(fill_value)

        records.append({
            'Column': col,
            'Number Of Missing Cells': num_missing,
            'Percentage Of Null Values': f'{percentage_missing:.2f}%',
            'Missing Indices': null_indices.tolist(),
            'Fill Summary': fill_summary
        })

    summary_df = pd.DataFrame(records, columns=summary_columns)

    return df_copy, summary_df

def empty_cells(df, dataset_name="Dataset"):
    """Report whether ``df`` has empty cells and, if so, fill them.

    A cell counts as empty when it is null or equal to the empty string.
    An HTML heading naming ``dataset_name`` and stating the outcome is
    displayed in either case.

    Returns a ``(frame, summary)`` pair: the result of ``fill_empty_cells(df)``
    when empty cells exist, otherwise a copy of ``df`` together with an empty
    DataFrame.
    """
    # The empty-string comparison is only evaluated when no nulls were found.
    needs_filling = df.isnull().values.any() or (df == '').any().any()

    if not needs_filling:
        display(HTML(f'<h2>{dataset_name} Data Set Does Not Contain Empty Cells</h2>'))
        return df.copy(), pd.DataFrame()

    display(HTML(f'<h2>{dataset_name} Data Set Contains Empty Cells</h2>'))
    filled_frame, fill_report = fill_empty_cells(df)
    return filled_frame, fill_report

def translate_hospitalization1_dataset(df):
    """Translate the hospitalization1 frame's Hebrew headers and labels to English.

    Renames the Hebrew column headers, then replaces the Hebrew category
    labels in the 'Receipt_Type', 'Patient_Origin' and 'Release_Type' columns.
    Values without a translation are left as they are.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to translate. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after translation.

    Raises
    ------
    KeyError
        If one of the three label columns is absent after renaming.
    """
    # Rename columns
    df.rename(columns={
        "סוג קבלה": "Receipt_Type",
        "מהיכן המטופל הגיע": "Patient_Origin",
        "אבחנות בקבלה": "Admission_Diagnoses",
        "ימי אשפוז": "Hospitalization_Days",
        "אבחנות בשחרור": "Release_Diagnoses",
        "רופא משחרר-קוד": "Release_Doctor_Code"
    }, inplace=True)

    # Hebrew -> English label translations, keyed by (already renamed) column.
    value_translations = {
        "Receipt_Type": {
            "דחוף": "Urgent",
            "מוזמן": "Invited",
            "אשפוז יום": "Day_Hospitalization"
        },
        "Patient_Origin": {
            "מביתו": "Home",
            "ממוסד": "Institution",
            "אחר": "Other",
            "ממרפאה": "Clinic",
            "מבית חולים אחר": "Different_Hospital"
        },
        "Release_Type": {
            "שוחרר לביתו": "Home_Released",
            "שוחרר למוסד": "Institution_Released"
        },
    }

    for column, mapping in value_translations.items():
        # Assign the result back: df[column].replace(..., inplace=True) acts on
        # a temporary Series and does not update df when pandas Copy-on-Write
        # is active.
        df[column] = df[column].replace(mapping)

    return df

In [None]:
# %% 

# Load Dataset
# NOTE(review): absolute, machine-specific Windows path — this cell only runs
# where that drive and folder exist; consider a configurable data directory.
path = "F:\\לימודים\\תואר שני\\סמסטר ב\\Data Science 2\\DS2-Final Project\\hospitalization1.xlsx"
hptl1 = pd.read_excel(path)

# Show a heading followed by the first five rows of the frame as loaded.
display(HTML('<h2>Hospitalization1 Dataset</h2>'))
display(hptl1.head(5))

Unnamed: 0,Patient,unitName1,Admission_Medical_Record,Admission_Entry_Date,Release_Date,סוג קבלה,מהיכן המטופל הגיע,Release_Type,רופא משחרר-קוד,ימי אשפוז,אבחנות בקבלה,אבחנות בשחרור
0,62051,1,5207766,2021-09-04 22:21:04.440,2021-09-08 14:43:00,דחוף,מביתו,שוחרר לביתו,6878.0,4,42731,42731
1,951769,2,5605505,2022-02-15 14:39:02.220,2022-02-16 13:37:00,דחוף,אחר,שוחרר לביתו,12615.0,1,Z3601,Z3601
2,863707,2,6755794,2023-05-15 04:21:09.137,2023-05-15 15:59:00,דחוף,מביתו,שוחרר לביתו,12599.0,0,7895,7895
3,884200,3,6964052,2023-07-27 08:16:48.640,2023-07-28 17:34:00,דחוף,ממוסד,שוחרר לביתו,11582.0,1,78609,"4919 , 78609"
4,936226,4,7023187,2023-08-20 19:44:35.410,2023-08-21 15:47:00,דחוף,מביתו,שוחרר לביתו,12359.0,1,78609,"78609 , 81203 , 83100"


In [None]:
# %% 

display(HTML('<h2>Translated Hospitalization1 Dataset</h2>'))

# translate_hospitalization1_dataset renames the columns of the frame it is
# given in place and returns that same frame, so `hptl1` no longer carries the
# Hebrew column names after this line.
hptl1 = translate_hospitalization1_dataset(hptl1)

# Preview the first five translated rows.
display(hptl1.head(5))

Unnamed: 0,Patient,unitName1,Admission_Medical_Record,Admission_Entry_Date,Release_Date,Receipt_Type,Patient_Origin,Release_Type,Release_Doctor_Code,Hospitalization_Days,Admission_Diagnoses,Release_Diagnoses
0,62051,1,5207766,2021-09-04 22:21:04.440,2021-09-08 14:43:00,Urgent,Home,Home_Released,6878.0,4,42731,42731
1,951769,2,5605505,2022-02-15 14:39:02.220,2022-02-16 13:37:00,Urgent,Other,Home_Released,12615.0,1,Z3601,Z3601
2,863707,2,6755794,2023-05-15 04:21:09.137,2023-05-15 15:59:00,Urgent,Home,Home_Released,12599.0,0,7895,7895
3,884200,3,6964052,2023-07-27 08:16:48.640,2023-07-28 17:34:00,Urgent,Institution,Home_Released,11582.0,1,78609,"4919 , 78609"
4,936226,4,7023187,2023-08-20 19:44:35.410,2023-08-21 15:47:00,Urgent,Home,Home_Released,12359.0,1,78609,"78609 , 81203 , 83100"


In [None]:
# %% 

# Apply the empty_cells function to the hospitalization dataset:
# `filled_hptl1` is the frame it returns and `summary` is the per-column
# report of what was missing and how it was filled.
filled_hptl1, summary = empty_cells(hptl1, dataset_name="Hospitalization1")
display(HTML('<h3>Missing Values Completion Summary </h3>'))
display(summary)

Unnamed: 0,Column,Number Of Missing Cells,Percentage Of Null Values,Missing Indices,Fill Summary
0,Receipt_Type,64,0.91%,"[79, 257, 260, 510, 595, 619, 625, 1214, 1239,...",filled with mode: Urgent
1,Release_Doctor_Code,114,1.62%,"[130, 171, 296, 317, 432, 451, 510, 595, 630, ...",filled with mode: 6888.0
2,Admission_Diagnoses,461,6.55%,"[22, 44, 55, 139, 156, 171, 174, 198, 203, 208...",filled with mode: 78609
3,Release_Diagnoses,29,0.41%,"[171, 510, 595, 786, 846, 952, 1218, 1239, 133...",filled with mode: 7865


In [None]:
# %% 

# Sanity check: run empty_cells again on the already-filled frame; the heading
# it displays reports whether any empty cells remain. `temp` receives the
# (frame, summary) pair that empty_cells returns.
temp = empty_cells(filled_hptl1, dataset_name=" Filled Hospitalization1")