## EXERCISE 25 - FINDING THE RELATIONSHIP BETWEEN DOCTOR'S WORKLOAD AND REHOSPITALIZATION

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import statsmodels.api as sm

In [None]:
# Open the workbook once; the sheets used below are parsed from this handle.
file_path = 'rehospitalization.xlsx'
excel_data = pd.ExcelFile(file_path)

# Read the 'hospitalization2' sheet into a DataFrame
hospitalization_data = excel_data.parse('hospitalization2')

# Parse both date columns; values that cannot be parsed become NaT
for date_column in ('Admission_Entry_Date2', 'Release_Date'):
    hospitalization_data[date_column] = pd.to_datetime(
        hospitalization_data[date_column], errors='coerce'
    )

# Whole days between Release_Date and Admission_Entry_Date2
# (presumably the release from the first stay and the next admission -- confirm against the sheet)
readmission_gap = hospitalization_data['Admission_Entry_Date2'] - hospitalization_data['Release_Date']
hospitalization_data['Days_To_Rehospitalization'] = readmission_gap.dt.days

# Keep rows whose gap is at most 30 days. Rows with a missing gap fail the
# comparison and are dropped; negative gaps still pass at this stage.
within_30_days = hospitalization_data['Days_To_Rehospitalization'] <= 30
rehospitalized_patients = hospitalization_data[within_30_days]

# Narrow down to the columns needed for the merge with doctor workload
# ('רופא משחרר' = discharging doctor)
columns_for_merge = [
    'Patient',
    'Release_Date',
    'Admission_Entry_Date2',
    'רופא משחרר',
    'Days_To_Rehospitalization',
]
rehospitalized_filtered = rehospitalized_patients[columns_for_merge]

# Build the per-doctor workload table
# ('קוד רופא' = doctor code, 'כמות מטופלים' = number of patients)
def _patients_per_doctor(sheet_name):
    """Return one row per doctor code with the summed patient count of *sheet_name*."""
    sheet = excel_data.parse(sheet_name)
    return sheet.groupby('קוד רופא')['כמות מטופלים'].sum().reset_index()


er_doctor_workload = _patients_per_doctor('erDoctor')
h_doctor_workload = _patients_per_doctor('hDoctor')

# Outer join keeps doctors that appear in only one of the two sheets
doctor_workload = er_doctor_workload.merge(
    h_doctor_workload,
    on='קוד רופא',
    how='outer',
    suffixes=('_ER', '_Hospital'),
)

# A doctor missing from one sheet contributes 0 patients from that sheet
er_patients = doctor_workload['כמות מטופלים_ER'].fillna(0)
hospital_patients = doctor_workload['כמות מטופלים_Hospital'].fillna(0)
doctor_workload['Total_Patients'] = er_patients + hospital_patients

# Attach the workload of the discharging doctor ('רופא משחרר') to every
# rehospitalization row; the left join keeps rows with no workload match.
rehospitalization_doctor_analysis = rehospitalized_filtered.merge(
    doctor_workload,
    left_on='רופא משחרר',
    right_on='קוד רופא',
    how='left',
)

# Filter out records where the days to rehospitalization are negative
# (the second admission is dated before the release).
rehospitalization_doctor_analysis_cleaned = rehospitalization_doctor_analysis[
    rehospitalization_doctor_analysis['Days_To_Rehospitalization'] >= 0
]

# Group by discharging doctor ('רופא משחרר') and calculate the average days to
# rehospitalization. Total_Patients repeats on every row of a doctor, so
# 'first' simply carries it through.
doctor_rehospitalization_analysis_cleaned = rehospitalization_doctor_analysis_cleaned.groupby('רופא משחרר').agg(
    Total_Patients=('Total_Patients', 'first'),
    Avg_Days_To_Rehospitalization=('Days_To_Rehospitalization', 'mean')
).reset_index()

# Workload was attached with a left join, so a discharging doctor without a
# workload record has NaN Total_Patients. A single NaN turns the Pearson
# coefficient into NaN and makes the OLS fit fail on a non-finite design
# matrix, so both statistics are computed on complete cases only. The
# per-doctor table above is left untouched so those doctors remain visible.
avg_days_complete_cases = doctor_rehospitalization_analysis_cleaned.dropna(
    subset=['Total_Patients', 'Avg_Days_To_Rehospitalization']
)

# Pearson correlation test for average days to rehospitalization
correlation_coefficient, p_value = pearsonr(
    avg_days_complete_cases['Total_Patients'],
    avg_days_complete_cases['Avg_Days_To_Rehospitalization']
)

# Linear regression for average days to rehospitalization
# (Avg_Days_To_Rehospitalization ~ const + Total_Patients)
X = avg_days_complete_cases['Total_Patients']
y = avg_days_complete_cases['Avg_Days_To_Rehospitalization']
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()

# Calculate the count of rehospitalizations per discharging doctor ('רופא משחרר').
# 'count' counts the non-null Patient values in each group.
rehospitalization_count_per_doctor = rehospitalization_doctor_analysis_cleaned.groupby('רופא משחרר').agg(
    Total_Patients=('Total_Patients', 'first'),
    Rehospitalization_Count=('Patient', 'count')
).reset_index()

# Merge rehospitalization count with the per-doctor average-days table
doctor_analysis_with_rehospitalizations = pd.merge(
    doctor_rehospitalization_analysis_cleaned,
    rehospitalization_count_per_doctor[['רופא משחרר', 'Rehospitalization_Count']],
    on='רופא משחרר',
    how='left'
)

# Workload was attached with a left join, so a discharging doctor without a
# workload record has NaN Total_Patients. A single NaN turns the Pearson
# coefficient into NaN and makes the OLS fit fail on a non-finite design
# matrix, so both statistics are computed on complete cases only. The merged
# frame itself is left untouched for the plots that read it.
count_complete_cases = doctor_analysis_with_rehospitalizations.dropna(
    subset=['Total_Patients', 'Rehospitalization_Count']
)

# Pearson correlation test for rehospitalization count
correlation_rehospitalizations_final, p_value_rehospitalizations_final = pearsonr(
    count_complete_cases['Total_Patients'],
    count_complete_cases['Rehospitalization_Count']
)

# Linear regression for rehospitalization count
# (Rehospitalization_Count ~ const + Total_Patients)
X_rehospitalizations_final = count_complete_cases[['Total_Patients']]
y_rehospitalizations_final = count_complete_cases['Rehospitalization_Count']
X_rehospitalizations_final = sm.add_constant(X_rehospitalizations_final)
rehospitalization_model_final = sm.OLS(y_rehospitalizations_final, X_rehospitalizations_final).fit()

# Per-doctor table: discharging doctor, total workload, mean days to rehospitalization
avg_days_columns = ['רופא משחרר', 'Total_Patients', 'Avg_Days_To_Rehospitalization']
print("Average Days to Rehospitalization by Doctor:")
print(doctor_rehospitalization_analysis_cleaned[avg_days_columns])

# Report the Pearson test and the OLS summary of the count-based analysis
print("\nCorrelation between Total Patients and Rehospitalization Count:")
print(f"Correlation Coefficient: {correlation_rehospitalizations_final}, P-value: {p_value_rehospitalizations_final}")
final_summary = rehospitalization_model_final.summary()
print(final_summary)

In [None]:
# Scatter plot: one point per discharging doctor, workload on x and
# number of rehospitalizations on y
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=doctor_analysis_with_rehospitalizations,
    x='Total_Patients',
    y='Rehospitalization_Count',
    ax=ax,
)
ax.set_title('Scatter Plot of Total Patients vs. Rehospitalization Count')
ax.set_xlabel('Total Patients Managed by Doctor')
ax.set_ylabel('Rehospitalization Count')
ax.grid(True)
plt.show()

# Same relationship with seaborn's fitted regression line drawn in red
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(
    data=doctor_analysis_with_rehospitalizations,
    x='Total_Patients',
    y='Rehospitalization_Count',
    line_kws=dict(color="red"),
    ax=ax,
)
ax.set_title('Regression Line Plot of Total Patients vs. Rehospitalization Count')
ax.set_xlabel('Total Patients Managed by Doctor')
ax.set_ylabel('Rehospitalization Count')
ax.grid(True)
plt.show()

# Histogram (30 bins, with KDE overlay) of days to rehospitalization.
# NOTE: this reads the full hospitalization_data column, not the rows kept
# by the 30-day / non-negative filters used for the per-doctor analysis.
days_to_rehospitalization = hospitalization_data['Days_To_Rehospitalization']
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(days_to_rehospitalization, bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Days to Rehospitalization')
ax.set_xlabel('Days to Rehospitalization')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()

# Pairwise correlation between workload and rehospitalization count
heatmap_columns = ['Total_Patients', 'Rehospitalization_Count']
correlation_matrix = doctor_analysis_with_rehospitalizations[heatmap_columns].corr()

# Annotated heatmap of that correlation matrix, two decimals per cell
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()