In [6]:
# Install necessary libraries
!pip install pandas scikit-learn tensorflow matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (replace with your dataset path)
# Hypothetical dataset: 'medical_data.csv'
dataset_path = '/content/healthcare_dataset.csv.zip'  # Replace with actual path
data = pd.read_csv(dataset_path)

# Display the first few rows of the dataset
print(data.head())

# Data preprocessing
# Assuming 'Disease' is the target column and the rest are features
target_column = 'Disease'
X = data.drop(columns=[target_column])
y = data[target_column]

# Encode categorical target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Build the neural network model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(len(np.unique(y)), activation='softmax')  # Output layer for multi-class classification
])

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
test_loss, test_accuracy = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_accuracy:.2f}")

# Classification report
y_pred = np.argmax(model.predict(X_test), axis=1)
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt


            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

KeyError: "['Disease'] not found in axis"

In [7]:
# Install necessary libraries
!pip install pandas scikit-learn tensorflow matplotlib seaborn

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
dataset_path = '/content/healthcare_dataset.csv.zip'  # Replace with actual path
data = pd.read_csv(dataset_path)

# Display the first few rows and column names
print("Dataset preview:")
print(data.head())
print("\nColumns in the dataset:")
print(data.columns)

# Check if 'Disease' exists as the target column
target_column = 'Disease'  # Target column name
if target_column not in data.columns:
    print(f"\n'{target_column}' column not found in the dataset. Please confirm the correct target column name.")
else:
    # Data preprocessing
    X = data.drop(columns=[target_column])  # Features
    y = data[target_column]  # Target variable

    # Encode categorical target variable
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)

    # Split the dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Standardize numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Build the neural network model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(len(np.unique(y)), activation='softmax')  # Output layer for multi-class classification
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

    # Evaluate the model
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\nTest Accuracy: {test_accuracy:.2f}")

    # Classification report
    y_pred = np.argmax(model.predict(X_test), axis=1)
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

    # Confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()


Dataset preview:
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0   

In [8]:
# Install necessary libraries
!pip install pandas scikit-learn tensorflow matplotlib seaborn

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns


def load_dataset(path):
    """Load dataset from the given path."""
    try:
        data = pd.read_csv(path)
        print("\nDataset loaded successfully!")
        print(f"Shape of the dataset: {data.shape}")
        print("\nFirst 5 rows:")
        print(data.head())
        return data
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None


def preprocess_data(data, target_column):
    """Preprocess data by separating features and target, encoding target, and scaling features."""
    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset. Available columns: {list(data.columns)}")

    print(f"\nTarget column: '{target_column}' found in the dataset.")

    # Separate features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Encode categorical target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    print(f"Classes in target column: {label_encoder.classes_}")

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Standardize numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("Data preprocessing completed!")
    return X_train, X_test, y_train, y_test, label_encoder


def build_model(input_shape, output_classes):
    """Build and compile a neural network model."""
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(output_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("\nModel built successfully!")
    return model


def plot_metrics(history):
    """Plot training and validation metrics over epochs."""
    plt.figure(figsize=(12, 5))

    # Accuracy plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Loss plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()


def plot_confusion_matrix(y_test, y_pred, label_encoder):
    """Plot a confusion matrix."""
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()


# Main script
if __name__ == "__main__":
    # Dataset path
    dataset_path = '/content/healthcare_dataset.csv.zip'

    # Load the dataset
    data = load_dataset(dataset_path)

    if data is not None:
        # Target column name
        target_column = 'Disease'

        try:
            # Preprocess the data
            X_train, X_test, y_train, y_test, label_encoder = preprocess_data(data, target_column)

            # Build the model
            model = build_model(input_shape=X_train.shape[1], output_classes=len(np.unique(y_train)))

            # Train the model
            print("\nTraining the model...")
            history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

            # Evaluate the model
            test_loss, test_accuracy = model.evaluate(X_test, y_test)
            print(f"\nTest Accuracy: {test_accuracy:.2f}")

            # Plot training metrics
            plot_metrics(history)

            # Predict on test data
            y_pred = np.argmax(model.predict(X_test), axis=1)

            # Classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

            # Confusion matrix
            plot_confusion_matrix(y_test, y_pred, label_encoder)

        except Exception as e:
            print(f"Error during preprocessing or model training: {e}")



Dataset loaded successfully!
Shape of the dataset: (55500, 15)

First 5 rows:
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  

another way

In [10]:
import zipfile

# Path to the uploaded zip file
zip_file_path = '/content/healthcare_dataset.csv.zip'

# Extract the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')


In [11]:
dataset_path = '/content/healthcare_dataset.csv'  # Use the correct path after extraction


In [12]:
# Install necessary libraries
!pip install pandas scikit-learn tensorflow matplotlib seaborn

# Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import files
import zipfile

# Step 1: Upload the dataset
uploaded = files.upload()

# Step 2: Extract the zip file (if applicable)
zip_file_path = '/content/healthcare_dataset.csv.zip'
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# Step 3: Define the functions from your script
def load_dataset(path):
    """Load dataset from the given path."""
    try:
        data = pd.read_csv(path)
        print("\nDataset loaded successfully!")
        print(f"Shape of the dataset: {data.shape}")
        print("\nFirst 5 rows:")
        print(data.head())
        return data
    except Exception as e:
        print(f"Error loading dataset: {e}")
        return None

def preprocess_data(data, target_column):
    """Preprocess data by separating features and target, encoding target, and scaling features."""
    if target_column not in data.columns:
        raise ValueError(f"Target column '{target_column}' not found in the dataset. Available columns: {list(data.columns)}")

    print(f"\nTarget column: '{target_column}' found in the dataset.")

    # Separate features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Encode categorical target variable
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    print(f"Classes in target column: {label_encoder.classes_}")

    # Split dataset into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

    # Standardize numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    print("Data preprocessing completed!")
    return X_train, X_test, y_train, y_test, label_encoder

def build_model(input_shape, output_classes):
    """Build and compile a neural network model."""
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dropout(0.3),
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(output_classes, activation='softmax')
    ])
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    print("\nModel built successfully!")
    return model

def plot_metrics(history):
    """Plot training and validation metrics over epochs."""
    plt.figure(figsize=(12, 5))

    # Accuracy plot
    plt.subplot(1, 2, 1)
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epochs')
    plt.ylabel('Accuracy')
    plt.legend()

    # Loss plot
    plt.subplot(1, 2, 2)
    plt.plot(history.history['loss'], label='Train Loss')
    plt.plot(history.history['val_loss'], label='Validation Loss')
    plt.title('Model Loss')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()

    plt.tight_layout()
    plt.show()

def plot_confusion_matrix(y_test, y_pred, label_encoder):
    """Plot a confusion matrix."""
    conf_matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

# Main script
if __name__ == "__main__":
    # Dataset path after extraction
    dataset_path = '/content/healthcare_dataset.csv'  # Update this path if necessary

    # Load the dataset
    data = load_dataset(dataset_path)

    if data is not None:
        # Target column name
        target_column = 'Medical Condition'  # Changed from 'Disease' to 'Medical Condition'

        try:
            # Preprocess the data
            X_train, X_test, y_train, y_test, label_encoder = preprocess_data(data, target_column)

            # Build the model
            model = build_model(input_shape=X_train.shape[1], output_classes=len(np.unique(y_train)))

            # Train the model
            print("\nTraining the model...")
            history = model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_test, y_test))

            # Evaluate the model
            test_loss, test_accuracy = model.evaluate(X_test, y_test)
            print(f"\nTest Accuracy: {test_accuracy:.2f}")

            # Plot training metrics
            plot_metrics(history)

            # Predict on test data
            y_pred = np.argmax(model.predict(X_test), axis=1)

            # Classification report
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

            # Confusion matrix
            plot_confusion_matrix(y_test, y_pred, label_encoder)

        except Exception as e:
            print(f"Error during preprocessing or model training: {e}")




Saving healthcare_dataset.csv.zip to healthcare_dataset.csv (1).zip

Dataset loaded successfully!
Shape of the dataset: (55500, 15)

First 5 rows:
            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna   