<a href="https://colab.research.google.com/github/SUDHARSSHINI/Heart_Disease_Prediction/blob/main/Heart_disease_Responsible_ai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [26]:
import pandas as pd
import numpy as np

# Seed NumPy's global RNG so the generated dataset is identical on every run
np.random.seed(42)

# Number of synthetic rows to generate
n_rows = 50

# (column name, inclusive lower bound, exclusive upper bound) for each column.
# The columns are drawn in this exact order; with a seeded global RNG the draw
# order determines the values, so it must not be rearranged.
column_ranges = [
    ('Age', 20, 80),
    ('Sex', 0, 2),
    ('Cholesterol', 150, 300),
    ('Blood Pressure', 80, 180),
    ('Heart Disease', 0, 2),
]
data = {
    column: np.random.randint(low, high, size=n_rows)
    for column, low, high in column_ranges
}

# Assemble the generated columns into a DataFrame
df = pd.DataFrame(data)

# Persist the dataset as CSV (without the index column) so later cells can read it
csv_file_path = '/content/heart_disease_prediction.csv'
df.to_csv(csv_file_path, index=False)

print(f"Dataset created and saved to {csv_file_path}")


Dataset created and saved to /content/heart_disease_prediction.csv


In [27]:
pip install responsibleai




In [28]:
!pip install responsibleai




In [29]:
pip install responsibleai responsibleai_dashboard


[31mERROR: Could not find a version that satisfies the requirement responsibleai_dashboard (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for responsibleai_dashboard[0m[31m
[0m

In [30]:
pip install aif360




In [31]:
from responsibleai import RAIInsights, FeatureMetadata
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from aif360.datasets import BinaryLabelDataset
from aif360.metrics import ClassificationMetric

# Reseed the global NumPy RNG so this cell regenerates exactly the same dataset
np.random.seed(42)


def _random_ints(low, high, count=50):
    """Return `count` integers drawn from [low, high) using the global NumPy RNG."""
    return np.random.randint(low, high, size=count)


# The columns are drawn top to bottom; with a seeded RNG that order fixes the values
data = {
    'Age': _random_ints(20, 80),
    'Sex': _random_ints(0, 2),             # 0 for female, 1 for male
    'Cholesterol': _random_ints(150, 300),
    'Blood Pressure': _random_ints(80, 180),
    'Heart Disease': _random_ints(0, 2),   # 0 for no, 1 for yes
}

# Build the DataFrame from the generated columns
df = pd.DataFrame(data)

# Write the dataset to CSV without the index column
csv_file_path = '/content/heart_disease_prediction.csv'
df.to_csv(csv_file_path, index=False)

print(f"Dataset created and saved to {csv_file_path}")

# Load the synthetic dataset
data = pd.read_csv(csv_file_path)

# Drop incomplete rows BEFORE any dtype conversion. Converting first hides or
# breaks the missing-value handling:
#   * astype(str) turns NaN into the literal string 'nan', which isnull() no
#     longer detects;
#   * astype(int) raises on NaN before any cleaning code is reached;
#   * dropping rows based on X alone keeps rows whose only missing value is
#     the target.
# Dropping on the full frame keeps features and target aligned by construction.
data_clean = data.dropna()

# Prepare the dataset: features and target come from the same cleaned frame
X = data_clean.drop(columns=['Heart Disease'])

# Ensure 'Heart Disease' column is numeric
y = data_clean['Heart Disease'].astype(int)

# Identify categorical columns
categorical_features = ['Sex']

# Convert categorical columns to string so they are treated as categories
# NOTE(review): if 'Sex' ever contains NaN in the CSV, pandas parses the column
# as float and these labels become '0.0'/'1.0' -- confirm whether an int cast
# is wanted before the string conversion.
X[categorical_features] = X[categorical_features].astype(str)

# Verify that there are no NA values after cleaning
if X.isnull().any().any() or y.isnull().any():
    raise ValueError("Data contains NA values after handling. Please check the data cleaning steps.")

# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# One-hot encode the categorical columns and pass every other column through unchanged.
# handle_unknown='ignore' means categories not seen during fit do not raise at transform time.
categorical_encoder = OneHotEncoder(handle_unknown='ignore')
preprocessor = ColumnTransformer(
    [('cat', categorical_encoder, categorical_features)],
    remainder='passthrough',
)

# Chain the preprocessing step with a seeded random forest classifier
classifier = RandomForestClassifier(random_state=42)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', classifier),
])

# Train the whole pipeline on the training split
pipeline.fit(X_train, y_train)

# Report missing-value counts for every split
for split_name, split in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Missing values in {split_name}:\n", split.isnull().sum())


def _join_features_and_target(features, target):
    """Place features and target side by side in one frame.

    Both inputs get a fresh 0..n-1 index first so that concat pairs the rows by
    position rather than by the shuffled index labels left by the split.
    """
    return pd.concat(
        [features.reset_index(drop=True), target.reset_index(drop=True)],
        axis=1,
    )


# Rebuild full train/test frames (feature columns followed by the target column)
train_df = _join_features_and_target(X_train, y_train)
test_df = _join_features_and_target(X_test, y_test)

# Report missing-value counts again after the concatenation
for frame_name, frame in (("train_df", train_df), ("test_df", test_df)):
    print(f"Missing values in {frame_name} after concatenation:\n", frame.isnull().sum())

# Remove any row that still has a missing value
train_df = train_df.dropna()
test_df = test_df.dropna()

# Fail loudly if a missing value survived the cleanup
frames_have_na = any(frame.isnull().any().any() for frame in (train_df, test_df))
if frames_have_na:
    raise ValueError("NA values found even after dropping missing values.")

def _to_aif360(frame):
    """Wrap a frame as an AIF360 BinaryLabelDataset.

    'Heart Disease' is used as the label column and 'Sex' as the protected attribute.
    """
    return BinaryLabelDataset(
        df=frame,
        label_names=['Heart Disease'],
        protected_attribute_names=['Sex'],
    )


# Ground-truth datasets in AIF360 format
train_data_aif360 = _to_aif360(train_df)
test_data_aif360 = _to_aif360(test_df)

# Score the held-out features with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Store the predictions in a one-column frame named like the target
y_pred_df = pd.DataFrame({'Heart Disease': y_pred})

# Predicted dataset: the test features (index reset so rows align by position)
# next to the predicted labels
predicted_df = pd.concat([X_test.reset_index(drop=True), y_pred_df], axis=1)
predicted_data_aif360 = _to_aif360(predicted_df)

# Compare true and predicted labels, with Sex == 1 as the privileged group
# and Sex == 0 as the unprivileged group
metric = ClassificationMetric(
    test_data_aif360,
    predicted_data_aif360,
    unprivileged_groups=[{'Sex': 0}],
    privileged_groups=[{'Sex': 1}],
)

# Print each fairness metric under its heading, in a fixed order
fairness_reports = (
    ("Disparate Impact:", metric.disparate_impact),
    ("Statistical Parity Difference:", metric.statistical_parity_difference),
    ("Equal Opportunity Difference:", metric.equal_opportunity_difference),
    ("Average Odds Difference:", metric.average_odds_difference),
)
for heading, compute_metric in fairness_reports:
    print(heading)
    print(compute_metric())

# Declare which feature columns are categorical
feature_metadata = FeatureMetadata(categorical_features=categorical_features)

# Bundle the fitted pipeline with the cleaned train/test frames
rai_insights = RAIInsights(
    task_type='classification',
    target_column='Heart Disease',
    model=pipeline,
    train=train_df,
    test=test_df,
    feature_metadata=feature_metadata,
)

# Register the analyses to run: explanations and error analysis ...
rai_insights.explainer.add()
rai_insights.error_analysis.add()

# ... 10 counterfactuals per sample targeting the opposite class ...
rai_insights.counterfactual.add(desired_class='opposite', total_CFs=10)

# ... and causal analysis using these columns as treatment features
treatment_features = ['Cholesterol', 'Blood Pressure']  # Example treatment features
rai_insights.causal.add(treatment_features=treatment_features)

# Run every registered analysis
rai_insights.compute()

# Retrieve the result of each analysis
explainer = rai_insights.explainer.get()
error_analysis = rai_insights.error_analysis.get()
counterfactual = rai_insights.counterfactual.get()
causal = rai_insights.causal.get()

# Print each result under its heading
insight_sections = (
    ("Explainer Insights:", explainer),
    ("Error Analysis Insights:", error_analysis),
    ("Counterfactual Insights:", counterfactual),
    ("Causal Insights:", causal),
)
for section_title, section_result in insight_sections:
    print(section_title)
    print(section_result)


Dataset created and saved to /content/heart_disease_prediction.csv
Missing values in X_train:
 Age               0
Sex               0
Cholesterol       0
Blood Pressure    0
dtype: int64
Missing values in y_train:
 0
Missing values in X_test:
 Age               0
Sex               0
Cholesterol       0
Blood Pressure    0
dtype: int64
Missing values in y_test:
 0
Missing values in train_df after concatenation:
 Age               0
Sex               0
Cholesterol       0
Blood Pressure    0
Heart Disease     0
dtype: int64
Missing values in test_df after concatenation:
 Age               0
Sex               0
Cholesterol       0
Blood Pressure    0
Heart Disease     0
dtype: int64
Disparate Impact:
1.5
Statistical Parity Difference:
0.25
Equal Opportunity Difference:
0.09999999999999998
Average Odds Difference:
0.04999999999999999
Causal Effects
Current Status: Generating Causal Effects.


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().
A column-vector y was passed when a 1d array was expected. Plea

Current Status: Finished generating causal effects.
Time taken: 0.0 min 20.291548734000116 sec
Counterfactual
Current Status: Generating 10 counterfactuals for 10 samples


100%|██████████| 10/10 [00:12<00:00,  1.29s/it]


Current Status: Generated 10 counterfactuals for 10 samples.
Time taken: 0.0 min 13.031865825000295 sec
Error Analysis
Current Status: Generating error analysis reports.
Current Status: Finished generating error analysis reports.
Time taken: 0.0 min 0.13971640199997637 sec
Explanations
Current Status: Explaining 4 features
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000022 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 45
[LightGBM] [Info] Number of data points in the train set: 40, number of used features: 4
[LightGBM] [Info] Start training from score -0.078861


categorical_feature keyword has been found in `params` and will be ignored.
Please use categorical_feature argument of the Dataset constructor to pass this parameter.


Current Status: Explained 4 features.
Time taken: 0.0 min 0.47536145200001556 sec
Explainer Insights:
[<abc.DynamicGlobalExplanation object at 0x7a9704f2fb80>]
Error Analysis Insights:
[<erroranalysis.report.error_report.ErrorReport object at 0x7a97040fc340>]
Counterfactual Insights:
[<dice_ml.counterfactual_explanations.CounterfactualExplanations object at 0x7a9704f6fc40>]
Causal Insights:
[<responsibleai._tools.causal.causal_result.CausalResult object at 0x7a97045c6ec0>]


In [32]:
# Compare the raw input columns with the feature names the fitted preprocessor emits.
# `new_data` is only created in a later cell, so on a fresh kernel (Restart & Run All)
# referencing it here raised NameError. Fall back to X_test, which has the same
# feature columns, when `new_data` has not been defined yet.
try:
    columns_to_check = new_data.columns
except NameError:
    columns_to_check = X_test.columns

print("Columns in new data:", columns_to_check)
print("Columns expected by the model:", preprocessor.get_feature_names_out())



Columns in new data: Index(['Age', 'Sex', 'Cholesterol', 'Blood Pressure'], dtype='object')
Columns expected by the model: ['cat__Sex_0' 'cat__Sex_1' 'remainder__Age' 'remainder__Cholesterol'
 'remainder__Blood Pressure']


In [33]:
# Refit the pipeline on the training split; the returned estimator is the cell's output
pipeline.fit(X=X_train, y=y_train)


In [34]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load and prepare the dataset
data = pd.read_csv('/content/heart_disease_prediction.csv')
X = data.drop(columns=['Heart Disease'])

# Ensure 'Heart Disease' column is numeric
y = data['Heart Disease'].astype(int)

# Define categorical features
categorical_features = ['Sex']

# Cast the categorical columns to string so the encoder learns the categories
# '0' and '1'. Data passed to predict() later in this notebook has 'Sex' cast
# to str; when the encoder was fitted on the integers 0/1 instead, those strings
# were unknown categories, and handle_unknown='ignore' silently encoded them as
# all zeros, so 'Sex' had no influence on the predictions.
X[categorical_features] = X[categorical_features].astype(str)

# Define the preprocessor: one-hot encode the categorical columns, pass the rest through
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ],
    remainder='passthrough'
)

# Define the pipeline: preprocessing followed by a seeded random forest
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=42))
])

# Split the data into train and test sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline with training data
pipeline.fit(X_train, y_train)


In [35]:
# Two example rows to score. 'Sex' (0 for female, 1 for male) is cast to string
# right after construction; the other columns keep their numeric dtype.
new_data = pd.DataFrame(
    {
        'Age': [30, 55],
        'Sex': [0, 1],
        'Cholesterol': [180, 240],
        'Blood Pressure': [120, 140],
    }
).astype({'Sex': str})

# Score the example rows with the fitted pipeline
predictions = pipeline.predict(new_data)

# Present the predictions in a one-column frame named like the target
predictions_df = pd.DataFrame({'Heart Disease': predictions})

# Print the predictions
print(predictions_df)


   Heart Disease
0              0
1              0
