# Data Science 2 - final project results - team 11: Barack Samuni & Barak Yaakov
***

In [50]:
import os
import pandas as pd
import numpy as np
from src.team_11.task7.task7 import extract_hebrew_content
from task7.task7 import translate_dataframe

## Task 7: Data cleaning and completion for table: hospitalization2
***

In [51]:

# ========== Path Config ==========
# The input workbook lives in the "data" folder one directory above the current working directory
data_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "data"))
load_path = os.path.join(data_dir, "rehospitalization.xlsx")
sheet = 'hospitalization2'
# Build the export path with os.path.join so the separator matches the host OS
# (a hard-coded backslash only works as a directory separator on Windows)
export_path = os.path.join('task7', f'{sheet}_Team_11.csv')


# =========== Helper Functions ===========

def fill_na_with_other_col(df, col_to_fill, col_reference, fill_val=-1, reference_val=-2):
    """
    Completes a column, in place, from a second column.

    A row is updated when col_to_fill holds the placeholder fill_val while
    col_reference holds anything other than reference_val; such rows receive
    the col_reference value. All other rows are left untouched.
    """
    target_is_placeholder = df[col_to_fill] == fill_val
    reference_is_usable = df[col_reference] != reference_val
    rows_to_fill = target_is_placeholder & reference_is_usable
    df.loc[rows_to_fill, col_to_fill] = df.loc[rows_to_fill, col_reference]

def fill_missing_doctors(df, diagnosis_col, doctor_map):
    """
    Completes missing releasing doctors, in place, from a diagnosis column.

    Rows whose 'Releasing_Doctor' is -1 and whose diagnosis_col is not -1 get the
    doctor that doctor_map associates with the first diagnosis of that row
    (as extracted by get_diag); when the diagnosis is not in doctor_map the
    doctor stays -1.
    """
    def lookup_doctor(diag):
        # Fall back to the "missing" sentinel when the diagnosis is unknown to the map
        return doctor_map.get(get_diag(diag), -1)

    doctor_missing = df['Releasing_Doctor'] == -1
    diagnosis_present = df[diagnosis_col] != -1
    rows_to_fill = doctor_missing & diagnosis_present
    df.loc[rows_to_fill, 'Releasing_Doctor'] = df.loc[rows_to_fill, diagnosis_col].map(lookup_doctor)

def get_diag_categories(df):
    """
    Collects the distinct first diagnoses (see get_diag) found in the release and
    reception diagnosis columns of rows whose 'Releasing_Doctor' is -1.

    Diagnosis values equal to -1 are skipped. The result is a list with no
    duplicates.
    """
    doctor_missing = df['Releasing_Doctor'] == -1
    release_diags = df.loc[doctor_missing, 'Diagnosis_In_Release']
    reception_diags = df.loc[doctor_missing, 'Diagnosis_In_Reception']

    categories = set()
    for value in pd.concat([release_diags, reception_diags]):
        if value != -1:
            categories.add(get_diag(value))
    return list(categories)

def get_diag(diag):
    """
    Returns the first diagnosis of a comma-separated diagnosis value.

    The value is converted to text and split on commas; the first piece that is
    non-empty after stripping surrounding whitespace is returned. None is
    returned when no such piece exists.
    """
    for piece in str(diag).split(','):
        cleaned = piece.strip()
        if cleaned:
            return cleaned
    return None

def translate_column(df, translations_dict, column_name):
    """
    Translates the contents of a DataFrame column, in place, according to a dictionary.

    The whole column is mapped at once, so the function works for any index
    (not only the default 0..n-1 index, e.g. after rows were filtered out).

    Parameters:
        df: DataFrame to modify in place.
        translations_dict: mapping from each original value to its translation.
        column_name: name of the column to translate.

    Raises:
        KeyError: if the column holds a value that has no entry in
            translations_dict. The DataFrame is left unchanged in that case.
    """
    values = df[column_name]
    # Fail loudly on unmapped values instead of letting .map() turn them into NaN
    unknown = values[~values.isin(list(translations_dict))]
    if not unknown.empty:
        raise KeyError(unknown.iloc[0])
    df[column_name] = values.map(translations_dict)
        
def calculate_optimal_split(df, column_name, number_of_quartiles):
    """
    Computes bin edges that split a column into number_of_quartiles
    percentile-based groups.

    The edges are the column minimum, the interior percentile cut points taken
    at evenly spaced percentages between 0 and 100, and the column maximum.
    Returns them as a list of number_of_quartiles + 1 values.
    """
    values = df[column_name]
    percent_points = np.linspace(0, 100, number_of_quartiles + 1)
    cut_points = np.percentile(values, percent_points)
    # Outer edges come straight from the data; only the interior cut points are kept
    return [values.min(), *cut_points[1:-1], values.max()]

# =========== Dataset loading & Initial Fix ===========
# Read the raw sheet and remember its size so data loss can be reported after cleaning
df = pd.read_excel(io=load_path, sheet_name=sheet)
initial_row_count = len(df)

# Report the per-column null counts before any cleaning is applied
print(f'Total samples before cleaning: {initial_row_count}\n')
print("Null values before cleaning:\n")
df.isna().sum()

Total samples before cleaning: 8917

Null values before cleaning:



Patient                         0
unitName1                       0
Admission_Medical_Record        0
Admission_Entry_Date            0
Release_Date                    0
unitName2                       0
Admission_Medical_Record2       0
Admission_Entry_Date2           0
Release_Date2                   0
סוג קבלה                       68
מהיכן המטופל הגיע               0
Release_Type                    0
רופא משחרר                     88
ימי אשפוז                       0
אבחנות בקבלה                  802
אבחנות בשחרור                 233
מחלקות מייעצות               4176
ct                              0
dtype: int64

In [52]:
# Translate the Hebrew column names to English
df.rename(columns={'סוג קבלה': 'Entry_Type', 'מהיכן המטופל הגיע': 'Patient_Origin', 
                   'רופא משחרר': 'Releasing_Doctor', 'ימי אשפוז': 'Admission_Days2', 
                   'אבחנות בקבלה': 'Diagnosis_In_Reception', 'אבחנות בשחרור': 'Diagnosis_In_Release', 
                   'מחלקות מייעצות': 'Advisory_Departments'}, inplace=True)

# Clean 'Diagnosis_In_Reception' & 'Diagnosis_In_Release' columns.
# Both columns use the same sentinel (-1) for "missing": every later step (diagnosis
# categories, doctor completion and the final row filter) tests against -1, so a
# different sentinel in one column would make missing diagnoses look like real ones.
df.fillna({'Diagnosis_In_Reception': -1, 'Diagnosis_In_Release': -1}, inplace=True)

# Complete each diagnosis column from the other one wherever only one of them is missing
fill_na_with_other_col(df, 'Diagnosis_In_Reception', 'Diagnosis_In_Release', fill_val=-1, reference_val=-1)
fill_na_with_other_col(df, 'Diagnosis_In_Release', 'Diagnosis_In_Reception', fill_val=-1, reference_val=-1)

# Clean 'Releasing_Doctor' column
df['Releasing_Doctor'] = df['Releasing_Doctor'].fillna(-1)

# Identify diagnosis categories and fill missing doctors.
# For every category, take the most frequent *known* doctor among rows whose release
# diagnosis contains that category; -1 means no doctor could be associated.
diag_categories = get_diag_categories(df)
known_doctor = df['Releasing_Doctor'] != -1  # the "missing" sentinel must not win the vote
release_text = df['Diagnosis_In_Release'].astype(str)
doctor_map = {}
for cat in diag_categories:
    if cat is None:
        # get_diag yields None for blank diagnoses; there is nothing to search for
        continue
    # regex=False: categories are literal diagnosis codes, not regular expressions
    candidates = df.loc[known_doctor & release_text.str.contains(cat, regex=False), 'Releasing_Doctor'].mode()
    doctor_map[cat] = candidates.iloc[0] if not candidates.empty else -1
fill_missing_doctors(df, 'Diagnosis_In_Release', doctor_map)
fill_missing_doctors(df, 'Diagnosis_In_Reception', doctor_map)

# Clean 'Entry_Type' column and translate values
# Missing entry types are filled with 'דחוף' (translated to 'urgent' below)
df['Entry_Type'] = df['Entry_Type'].fillna('דחוף')
translate_column(df, {'דחוף': 'urgent', 'מוזמן': 'scheduled', 'אשפוז יום': 'day hospitalization'}, 'Entry_Type')
# NOTE(review): 'institude' looks like a misspelling of 'institute'; it is kept unchanged
# because the exported values may already be relied upon elsewhere - confirm before renaming.
translate_column(df, {'ממרפאה': 'medical clinic', 'מבית חולים אחר': 'different hospital', 
                      'ממוסד': 'institude', 'מביתו': 'home', 'אחר': 'other'}, 'Patient_Origin')
translate_column(df, {'שוחרר לביתו': 'home', 'שוחרר למוסד': 'institude'}, 'Release_Type')

# Remove irrelevant columns and clean-up
# The advisory departments column is not used in the rest of this cleaning flow, so it is dropped
df.drop(columns=['Advisory_Departments'], inplace=True)
# Drop rows that carry no usable information: no doctor and no diagnosis at all.
# .copy() makes the result an independent frame so later column assignments are safe.
df = df[~((df['Releasing_Doctor'] == -1) & (df['Diagnosis_In_Reception'] == -1) & (df['Diagnosis_In_Release'] == -1))].copy()

# Check for null values after cleaning
print("Null values after cleaning:")
df.isnull().sum()

Null values after cleaning:


Patient                      0
unitName1                    0
Admission_Medical_Record     0
Admission_Entry_Date         0
Release_Date                 0
unitName2                    0
Admission_Medical_Record2    0
Admission_Entry_Date2        0
Release_Date2                0
Entry_Type                   0
Patient_Origin               0
Release_Type                 0
Releasing_Doctor             0
Admission_Days2              0
Diagnosis_In_Reception       0
Diagnosis_In_Release         0
ct                           0
dtype: int64

In [53]:
# Calculate percentage of data loss
final_row_count = df.shape[0]
# Guard against an empty input sheet so the report cannot fail with a division by zero
data_loss_percentage = ((initial_row_count - final_row_count) / initial_row_count) * 100 if initial_row_count else 0.0
print(f'Total samples after cleaning: {final_row_count}\n')
print(f'Percentage of data lost during cleaning: {data_loss_percentage:.2f}%\n')

# Date formatting: parse the date columns into datetimes
# NOTE(review): 'Release_Date2' is not parsed here - confirm whether that is intentional
df['Admission_Entry_Date'] = pd.to_datetime(df['Admission_Entry_Date'], format='%d/%m/%Y %H:%M')
df['Release_Date'] = pd.to_datetime(df['Release_Date'], format='%d/%m/%Y %H:%M:%S')
df['Admission_Entry_Date2'] = pd.to_datetime(df['Admission_Entry_Date2'], format='%d/%m/%Y %H:%M')

# Derived features below are currently disabled (kept for reference):
#df['Admission_Days'] = (df['Release_Date'] - df['Admission_Entry_Date']).dt.days.abs()
# Categorize 'Days_Between_Admissions'
# df['Days_Between_Admissions'] = (df['Admission_Entry_Date2'] - df['Release_Date']).dt.days.abs()
# bin_edges = calculate_optimal_split(df, 'Days_Between_Admissions', 3)
# df['Period_Between_Admissions'] = pd.cut(df['Days_Between_Admissions'], bins=bin_edges, labels=['short', 'mid', 'long'])

# Export cleaned data
#df.drop(columns=['Admission_Medical_Record', 'Admission_Medical_Record2', 'Days_Between_Admissions'], inplace=True)
# Create the target folder first so the export also works on a fresh checkout
export_dir = os.path.dirname(export_path)
if export_dir:
    os.makedirs(export_dir, exist_ok=True)
df.to_csv(export_path, index=False)


Total samples after cleaning: 8917

Percentage of data lost during cleaning: 0.00%



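We can see that the data now contains no null values (entries that could not be completed are encoded with negative sentinel values rather than left empty) and that no rows were lost.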