#Cleaning the dataset

In [31]:
import pandas as pd
df=pd.read_csv("/content/healthcare_dataset_with_new_rules.csv")
df.duplicated().sum()

np.int64(534)

Dropping duplicates

In [32]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

np.int64(0)

Converting dates to datetime objects

In [33]:
df['Date of Admission'] = pd.to_datetime(df['Date of Admission'], format='%d/%m/%Y', errors='coerce')
df['Discharge Date'] = pd.to_datetime(df['Discharge Date'], format='%d/%m/%Y', errors='coerce')

Removing records where date of discharge is before date of admission

In [34]:
df['Length of Stay'] = (df['Discharge Date'] - df['Date of Admission']).dt.days
invalid_dates = df['Length of Stay'].isnull().sum()
negative_stay = (df['Length of Stay'] < 0).sum()

if invalid_dates > 0:
    df.dropna(subset=['Length of Stay'], inplace=True)
    print(f"Removed {invalid_dates} rows with invalid date formats.")
if negative_stay > 0:
    df = df[df['Length of Stay'] >= 0]
    print(f"Removed {negative_stay} rows where discharge date was before admission date.")

Handling missing values

In [35]:
df.isnull().sum()

Unnamed: 0,0
Name,0
Age,0
Gender,0
Blood Type,0
Medical Condition,0
Date of Admission,0
Doctor,0
Hospital,0
Insurance Provider,0
Billing Amount,0


Removing records with impossible age values

In [36]:
illogical_age = df[(df['Age'] < 0) | (df['Age'] > 120)].shape[0]
if illogical_age > 0:
    df = df[(df['Age'] >= 0) & (df['Age'] <= 120)]
    print(f"Removed {illogical_age} rows with illogical age values.")

#Training the model

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

#Importing the dataset
try:
    df = pd.read_csv("/content/healthcare_dataset_with_new_rules.csv")
except FileNotFoundError:
    print("Error: Dataset not found.")
    print("Please make sure the dataset file is in the same directory as the script.")
    exit()

#Defining the features x and y
features=['Age', 'Medical Condition', 'Admission Type', 'Test Results']
target='Readmission'

X=df[features]
y=df[target]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

categorical_features = ['Medical Condition', 'Admission Type', 'Test Results']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

model = RandomForestClassifier(random_state=42)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', model)
])

print("Training the Random Forest model...")
pipeline.fit(X_train, y_train)
print("Training complete.")

y_pred = pipeline.predict(X_test)

print("\n--- Model Evaluation Results ---")

# Print the classification report (precision, recall, f1-score)
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("\nConfusion Matrix:")
print("         Predicted NO | Predicted YES")
conf_matrix = confusion_matrix(y_test, y_pred, labels=['No', 'Yes'])
print(f"Actual NO | {conf_matrix[0][0]:<12} | {conf_matrix[0][1]}")
print(f"Actual YES| {conf_matrix[1][0]:<12} | {conf_matrix[1][1]}")

Training the Random Forest model...
Training complete.

--- Model Evaluation Results ---

Classification Report:
              precision    recall  f1-score   support

          No       1.00      1.00      1.00     10159
         Yes       1.00      1.00      1.00       941

    accuracy                           1.00     11100
   macro avg       1.00      1.00      1.00     11100
weighted avg       1.00      1.00      1.00     11100


Confusion Matrix:
         Predicted NO | Predicted YES
Actual NO | 10159        | 0
Actual YES| 0            | 941


#Data visualization

####Distribution of Numerical Columns

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set_style("whitegrid")
palette = "viridis"


print("Generating distribution plots for numerical data...")
plt.figure(figsize=(12, 5))

# Histogram for Age
plt.subplot(1, 2, 1)
sns.histplot(df['Age'], kde=True, color=sns.color_palette(palette)[0])
plt.title('Distribution of Patient Age')
plt.xlabel('Age')
plt.ylabel('Frequency')

# Histogram for Billing Amount
plt.subplot(1, 2, 2)
sns.histplot(df['Billing Amount'], kde=True, color=sns.color_palette(palette)[2])
plt.title('Distribution of Billing Amount')
plt.xlabel('Billing Amount')
plt.ylabel('Frequency')

plt.tight_layout()
plt.savefig('numerical_distributions.png')

####Relationship between Age and Billing Amount

In [None]:
print("Generating scatter plot for Age vs. Billing Amount...")
plt.figure(figsize=(8, 6))
sns.scatterplot(x='Age', y='Billing Amount', data=df, alpha=0.5, color=sns.color_palette(palette)[4])
plt.title('Age vs. Billing Amount')
plt.xlabel('Age')
plt.ylabel('Billing Amount')
plt.tight_layout()
plt.savefig('age_vs_billing_scatter.png')

####Box Plot Distribution by Medical Condition

In [None]:
print("Generating box plots for Billing Amount by Medical Condition...")
plt.figure(figsize=(12, 7))
sns.boxplot(y='Medical Condition', x='Billing Amount', data=df, palette=palette)
plt.title('Billing Amount Across Medical Conditions')
plt.xlabel('Billing Amount')
plt.ylabel('Medical Condition')
plt.tight_layout()
plt.savefig('billing_by_condition_boxplot.png')

####Correlation matrix of numerical values

In [None]:
print("Generating a correlation heatmap...")
plt.figure(figsize=(8, 6))

numeric_cols = df.select_dtypes(include=['number'])
corr_matrix = numeric_cols.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.savefig('correlation_heatmap.png')

#Saving as a pickle file

In [39]:
import pickle
filename = 'readmission_model.pkl'
with open(filename, 'wb') as file:
    pickle.dump(pipeline, file)