In [None]:
# Mount the user's Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
import os

# List the entries at the top level of the mounted Drive folder.
drive_root = '/content/drive/MyDrive'
files = os.listdir(drive_root)
print(files)


['Untitled Diagram (1).drawio.png', 'Untitled Diagram.drawio.png', 'DS2 Notes.gdoc', 'Victor Elkins Resume.pdf', 'Colab Notebooks', 'Untitled project.gscript', 'CIA 1 Readthrough (1).mp4', 'CamelsAndHorses', 'archive', 'Untitled document (1).gdoc', 'CatDog', 'catvsdog.zip', 'Flowers Recognition Dataset.zip', 'Flowers Recognition Dataset', 'R2 Interview Presentation Guidelines  Agenda.pdf', 'CIA 1 Readthrough.mp4', 'catimage.jpg', 'yolov3.h5', 'yolov3.weights.1', 'yolov3.weights', 'gecko.jpg', 'tree.jpg', 'bird.jpg', 'yolov3.h5.1', 'yolov3.weights.2', 'yolov3.cfg', 'yolov3.h5.2', 'yolov3.weights.3', 'yolov3.h5.3', 'yolov3.weights.4', 'cifar-10-python.tar.gz', 'reddit.csv', 'Untitled presentation.gslides', '2024-10-18 15-58-30.mkv', '2024-10-18 16-28-08.mkv', 'R2 Interview Presentation Guidelines  Agenda.gdoc', 'images.jpg', 'Low quality dancing Cat meme.mp4', 'breastcancer', 'Untitled document.gdoc', 'monthly-milk-production.csv', 'archive(2)', 'photos', 'imagedata', 'CBR Learnings.gdoc

In [None]:
import pandas as pd

# Path of the CSV file on the mounted Drive.
file_path = '/content/drive/MyDrive/extracted_data.csv'

# Read the CSV into a DataFrame, then preview its first five rows.
df = pd.read_csv(file_path)
print(df.head(n=5))


   CoronaryAngina  Age_Group  Sex  Income_Category  Heavy_Alcohol_Consumption  \
0               1         12    0                6                          0   
1               1         11    0                5                          0   
2               1          8    0                5                          0   
3               1         12    1                4                          0   
4               1         12    0                4                          0   

   BMI_Category  Exercise_Participation  Smoking_Status  Insurance_Type  Race  
0             0                       1               0               0     2  
1             2                       1               0               0     7  
2             1                       1               0               4     2  
3             0                       0               0               0     7  
4             1                       0               0               0     2  


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import precision_recall_fscore_support, classification_report

# Percentage weight of every category code, per column.  The position in each
# list is the integer code drawn by the sampling code below, which divides by
# the list's sum, so the values need not add up to exactly 100.
weighted_percentages = dict(
    CoronaryAngina=[21.23, 78.77],  # [Yes, No]
    Age_Group=[
        12.37, 7.38, 9.11, 7.62, 8.47, 6.45, 7.83,
        7.11, 8.60, 6.94, 6.48, 4.55, 5.24, 1.84,
    ],
    Sex=[48.50, 51.50],  # [Male, Female]
    Income_Category=[4.88, 7.53, 8.88, 10.42, 22.29, 17.49, 6.78, 21.73],
    Heavy_Alcohol_Consumption=[85.67, 5.19, 9.14],
    BMI_Category=[1.96, 30.50, 34.73, 32.80],  # [Underweight, Normal, Overweight, Obese]
    Exercise_Participation=[75.03, 24.71, 0.19, 0.07],
    _SmokingStatus=[83.08, 10.68, 6.25],  # [No, Yes, Unknown]
    Insurance_Type=[
        37.92, 8.99, 21.23, 0.08, 8.18, 0.04, 3.30,
        0.15, 3.15, 3.27, 8.11, 3.82, 1.76,
    ],
)

def create_synthetic_data(n_samples=10000, weighted=False):
    """Generate a synthetic tabular dataset with a binary ``CoronaryAngina`` label.

    Every key of the module-level ``weighted_percentages`` except
    ``'CoronaryAngina'`` becomes an integer-coded categorical column whose
    codes run from 0 to ``len(weights) - 1``.

    Args:
        n_samples: Number of rows to generate.
        weighted: If True, codes are drawn with probabilities proportional to
            the entries of ``weighted_percentages``; otherwise every code of a
            feature is equally likely.

    Returns:
        pandas.DataFrame with ``n_samples`` rows holding the feature columns,
        the 0/1 ``CoronaryAngina`` label, and ``_SmokingStatus`` recoded to
        the strings ``"No"``, ``"Yes"`` or ``"Unknown"``.
    """
    data = {}
    for feature, weights in weighted_percentages.items():
        if feature == 'CoronaryAngina':
            # The label is derived from the features below, not sampled here.
            continue
        # p=None makes np.random.choice sample the codes uniformly.
        p = np.array(weights) / np.sum(weights) if weighted else None
        data[feature] = np.random.choice(len(weights), size=n_samples, p=p)

    df = pd.DataFrame(data)

    # Per-row label probability: a 0.2 base rate plus additive bumps for
    # Age_Group codes above 7, BMI_Category code 3 and _SmokingStatus code 1.
    probabilities = 0.2
    probabilities += 0.1 * (df['Age_Group'] > 7)
    probabilities += 0.1 * (df['BMI_Category'] == 3)
    probabilities += 0.05 * (df['_SmokingStatus'] == 1)

    df['CoronaryAngina'] = np.random.binomial(1, probabilities)

    # The sampled codes are 0-based and follow the [No, Yes, Unknown] order of
    # weighted_percentages['_SmokingStatus'].  The previous mapping used keys
    # 1-4, which turned code 0 into NaN (those rows were then dropped) and
    # labelled code 2 as "Yes".
    smoking_mapping = {0: "No", 1: "Yes", 2: "Unknown"}
    df['_SmokingStatus'] = df['_SmokingStatus'].map(smoking_mapping)

    return df

def create_model(input_dim):
    """Build and compile the binary classifier used in this cell.

    Architecture: Dense(64, relu) -> BatchNormalization -> Dropout(0.4) ->
    Dense(32, relu) -> BatchNormalization -> Dropout(0.3) -> Dense(16, relu)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        # tf.keras.Input replaces InputLayer(input_shape=...); the
        # ``input_shape`` argument of InputLayer is deprecated in Keras 3.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        # Single sigmoid unit: the output is thresholded at 0.5 by callers.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model

def preprocess_data(df):
    """One-hot encode the categorical features and extract the protected attribute.

    Args:
        df: DataFrame containing the eight categorical feature columns listed
            below; any other column is passed through unchanged.

    Returns:
        Tuple ``(encoded, protected)`` where ``encoded`` is the one-hot
        encoded copy of ``df`` and ``protected`` is the original
        ``Income_Category`` column cast to float.  ``df`` itself is not
        modified.
    """
    one_hot_columns = [
        'Age_Group',
        'Sex',
        'Income_Category',
        'Heavy_Alcohol_Consumption',
        'BMI_Category',
        'Exercise_Participation',
        '_SmokingStatus',
        'Insurance_Type',
    ]

    working = df.copy()
    # Grab the raw income codes before encoding replaces the column.
    income_codes = working['Income_Category'].astype(float)
    encoded = pd.get_dummies(working, columns=one_hot_columns)
    return encoded, income_codes

def train_and_evaluate(weighted=False):
    """Generate synthetic data, train the classifier and report per-class metrics.

    Args:
        weighted: Forwarded to ``create_synthetic_data``.

    Returns:
        The tuple ``(precision, recall, f1_score, support)`` produced by
        ``precision_recall_fscore_support`` on the held-out 20% test split.
    """
    df = create_synthetic_data(weighted=weighted)
    df_encoded, protected_attribute = preprocess_data(df)

    X = df_encoded.drop('CoronaryAngina', axis=1)
    y = df_encoded['CoronaryAngina'].astype(float)

    # The protected attribute is split together with X and y so its rows stay
    # aligned with the partitions; it is not used further in this function.
    X_train, X_test, y_train, y_test, protected_train, protected_test = train_test_split(
        X, y, protected_attribute, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then reuse it on the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_tensor = tf.cast(X_train_scaled, tf.float32)
    y_train_tensor = tf.cast(y_train, tf.float32)

    model = create_model(X_train.shape[1])

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    )

    # The callback must be handed to fit(); previously it was created but
    # never used, so training always ran for the full 100 epochs.
    model.fit(
        X_train_tensor,
        y_train_tensor,
        epochs=100,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0
    )

    # Threshold the sigmoid output at 0.5 and flatten (n, 1) -> (n,).
    y_pred = (model.predict(X_test_scaled) > 0.5).astype(int).ravel()

    precision, recall, f1_score, support = precision_recall_fscore_support(y_test, y_pred)

    print("\nClassification Report:")
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1-score:", f1_score)
    print("Support:", support)

    return precision, recall, f1_score, support

def compare_results():
    """Train on weighted and on uniform synthetic data and print both metric sets.

    ``train_and_evaluate`` runs first with ``weighted=True`` and then with
    ``weighted=False``.  The two returned metric tuples are placed side by
    side in a DataFrame, which is printed.
    """
    metric_names = ['Precision', 'Recall', 'F1-score', 'Support']
    weighted_metrics = train_and_evaluate(weighted=True)
    uniform_metrics = train_and_evaluate(weighted=False)

    # One row per metric, one column per sampling scheme.
    columns = {
        'Metric': metric_names,
        'Weighted': [*weighted_metrics],
        'Non-Weighted': [*uniform_metrics],
    }
    print(pd.DataFrame(columns))

# Entry point: run the weighted vs. non-weighted comparison.
if __name__ == "__main__":
    compare_results()




[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step

Classification Report:
Precision: [0.75655431 0.40540541]
Recall: [0.82113821 0.31578947]
F1-score: [0.78752437 0.35502959]
Support: [246  95]




[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step

Classification Report:
Precision: [0.70150502 0.39310345]
Recall: [0.90507012 0.13768116]
F1-score: [0.79039096 0.2039356 ]
Support: [927 414]
      Metric                                   Weighted  \
0  Precision  [0.7565543071161048, 0.40540540540540543]   
1     Recall   [0.8211382113821138, 0.3157894736842105]   
2   F1-score  [0.7875243664717348, 0.35502958579881655]   
3    Support                                  [246, 95]   

                                Non-Weighted  
0   [0.7015050167224081, 0.3931034482758621]  
1  [0.9050701186623517, 0.13768115942028986]  
2    [0.790390956194065, 0.2039355992844365]  
3                                 [927, 414]  


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


# Percentage weight of every category code, per column.  The position in each
# list is the integer code drawn by the sampling code below, which divides by
# the list's sum, so the values need not add up to exactly 100.
weighted_percentages = dict(
    CoronaryAngina=[21.23, 78.77],  # [Yes, No]
    Age_Group=[
        12.37, 7.38, 9.11, 7.62, 8.47, 6.45, 7.83,
        7.11, 8.60, 6.94, 6.48, 4.55, 5.24, 1.84,
    ],
    Sex=[48.50, 51.50],  # [Male, Female]
    Income_Category=[4.88, 7.53, 8.88, 10.42, 22.29, 17.49, 6.78, 21.73],
    Heavy_Alcohol_Consumption=[85.67, 5.19, 9.14],
    BMI_Category=[1.96, 30.50, 34.73, 32.80],  # [Underweight, Normal, Overweight, Obese]
    Exercise_Participation=[75.03, 24.71, 0.19, 0.07],
    _SmokingStatus=[83.08, 10.68, 6.25],  # [No, Yes, Unknown]
    Insurance_Type=[
        37.92, 8.99, 21.23, 0.08, 8.18, 0.04, 3.30,
        0.15, 3.15, 3.27, 8.11, 3.82, 1.76,
    ],
)


def create_synthetic_data(n_samples=10000):
    """Generate synthetic data with protected-group indicators and a binary label.

    The three protected indicators are independent fair coin flips.  ``Age``,
    ``BMI`` and ``Smoking`` are integer codes drawn with probabilities
    proportional to the module-level ``weighted_percentages``.  The label is
    drawn independently of every feature, so a model trained on this data can
    at best learn the base rate.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        pandas.DataFrame with columns ``Women``, ``African Americans``,
        ``Low Income``, ``Age``, ``BMI``, ``Smoking`` and ``Coronary Angina``
        (1 = angina present, 0 = absent).
    """
    def normalised(feature):
        # Turn the percentage list of a feature into a probability vector.
        weights = np.array(weighted_percentages[feature])
        return weights / np.sum(weights)

    def sample_codes(feature):
        # Draw 0-based category codes according to the feature's weights.
        return np.random.choice(
            len(weighted_percentages[feature]), size=n_samples, p=normalised(feature)
        )

    # Generate data for protected groups
    women = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])
    african_americans = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])
    low_income = np.random.choice([0, 1], size=n_samples, p=[0.5, 0.5])

    # Generate data for other features
    age = sample_codes('Age_Group')
    bmi = sample_codes('BMI_Category')
    smoking = sample_codes('_SmokingStatus')

    # Generate target variable.  The weights are ordered [Yes, No], so the
    # values [1, 0] are sampled to make 1 the positive ("Yes") class.  Drawing
    # the list index instead made 1 mean "No" and inverted the label.
    coronary_angina = np.random.choice(
        [1, 0], size=n_samples, p=normalised('CoronaryAngina')
    )

    # Create dataframe
    df = pd.DataFrame({
        'Women': women,
        'African Americans': african_americans,
        'Low Income': low_income,
        'Age': age,
        'BMI': bmi,
        'Smoking': smoking,
        'Coronary Angina': coronary_angina
    })

    return df


def create_model(input_dim):
    """Build and compile the binary classifier used in this cell.

    Architecture: Dense(64, relu) -> BatchNormalization -> Dropout(0.4) ->
    Dense(32, relu) -> BatchNormalization -> Dropout(0.3) -> Dense(16, relu)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        # tf.keras.Input replaces InputLayer(input_shape=...); the
        # ``input_shape`` argument of InputLayer is deprecated in Keras 3.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        # Single sigmoid unit: the output is thresholded at 0.5 by callers.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


def train_and_evaluate(df):
    """Train the classifier on ``df`` and print a report plus group-wise rates.

    Args:
        df: DataFrame with a binary ``'Coronary Angina'`` label column and the
            0/1 indicator columns ``'Women'``, ``'African Americans'`` and
            ``'Low Income'``.  All non-label columns are used as features.

    Returns:
        None.  The classification report and, for each protected group and
        its complement, the fraction of test rows predicted positive are
        printed.
    """
    X = df.drop('Coronary Angina', axis=1)
    y = df['Coronary Angina']

    # Split data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Create model
    model = create_model(X_train.shape[1])

    # Train model.  The test split is no longer passed as validation_data:
    # nothing consumed the per-epoch validation metrics, and leaving it out
    # keeps the held-out rows out of the training loop entirely.
    model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=0)

    # Make predictions; flatten (n, 1) -> (n,) so the boolean masks below
    # select plain 1-D slices.
    y_pred = (model.predict(X_test) > 0.5).astype(int).ravel()

    # Print classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Calculate bias metrics: (printed label, indicator column, indicator value)
    groups = [
        ("Women", 'Women', 1),
        ("African Americans", 'African Americans', 1),
        ("Low Income", 'Low Income', 1),
        ("Men", 'Women', 0),
        ("Non-African Americans", 'African Americans', 0),
        ("High Income", 'Low Income', 0),
    ]

    print("\nBias Metrics:")
    for label, column, value in groups:
        members = (X_test[column] == value).to_numpy()
        # An empty group has no defined rate; report NaN without a warning.
        rate = y_pred[members].mean() if members.any() else float('nan')
        print(f"{label}:", rate)


# Entry point: generate the synthetic dataset, then train and print the reports.
if __name__ == "__main__":
    df = create_synthetic_data()
    train_and_evaluate(df)




[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       452
           1       0.77      1.00      0.87      1548

    accuracy                           0.77      2000
   macro avg       0.39      0.50      0.44      2000
weighted avg       0.60      0.77      0.68      2000


Bias Metrics:
Women: 1.0
African Americans: 1.0
Low Income: 1.0
Men: 1.0
Non-African Americans: 1.0
High Income: 1.0


In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report


def create_synthetic_data(n_samples=10000):
    """Generate a weighted synthetic dataset with a binary ``CoronaryAngina`` label.

    Every key of ``weighted_percentages`` except ``'CoronaryAngina'`` becomes
    an integer-coded categorical column, with codes drawn in proportion to
    the listed weights.

    NOTE(review): ``weighted_percentages`` is not defined in this cell; it is
    presumably left over from an earlier cell - confirm before running on a
    fresh kernel.

    Args:
        n_samples: Number of rows to generate.

    Returns:
        pandas.DataFrame with ``n_samples`` rows holding the feature columns,
        the 0/1 ``CoronaryAngina`` label, and ``_SmokingStatus`` recoded to
        the strings ``"No"``, ``"Yes"`` or ``"Unknown"``.
    """
    data = {
        feature: np.random.choice(
            len(weights),
            size=n_samples,
            p=np.array(weights) / np.sum(weights)
        )
        for feature, weights in weighted_percentages.items()
        if feature != 'CoronaryAngina'
    }

    df = pd.DataFrame(data)

    # Per-row label probability: a 0.2 base rate plus additive bumps for
    # Age_Group codes above 7, BMI_Category code 3 and _SmokingStatus code 1.
    probabilities = 0.2
    probabilities += 0.1 * (df['Age_Group'] > 7)
    probabilities += 0.1 * (df['BMI_Category'] == 3)
    probabilities += 0.05 * (df['_SmokingStatus'] == 1)

    df['CoronaryAngina'] = np.random.binomial(1, probabilities)

    # The sampled codes are 0-based and follow the [No, Yes, Unknown] order of
    # weighted_percentages['_SmokingStatus'].  The previous mapping used keys
    # 1-4, which turned code 0 into NaN (those rows were then dropped) and
    # labelled code 2 as "Yes".
    smoking_mapping = {0: "No", 1: "Yes", 2: "Unknown"}
    df['_SmokingStatus'] = df['_SmokingStatus'].map(smoking_mapping)

    return df


def create_model(input_dim):
    """Build and compile the binary classifier used in this cell.

    Architecture: Dense(64, relu) -> BatchNormalization -> Dropout(0.4) ->
    Dense(32, relu) -> BatchNormalization -> Dropout(0.3) -> Dense(16, relu)
    -> Dense(1, sigmoid).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A ``tf.keras.Sequential`` model compiled with the Adam optimizer,
        binary cross-entropy loss and the accuracy metric.
    """
    model = tf.keras.Sequential([
        # tf.keras.Input replaces InputLayer(input_shape=...); the
        # ``input_shape`` argument of InputLayer is deprecated in Keras 3.
        tf.keras.Input(shape=(input_dim,)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.4),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.3),
        tf.keras.layers.Dense(16, activation='relu'),
        # Single sigmoid unit: the output is thresholded at 0.5 by callers.
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy']
    )

    return model


def preprocess_data(df):
    """One-hot encode the categorical features and extract the protected attribute.

    Args:
        df: DataFrame containing the eight categorical feature columns listed
            below; any other column is passed through unchanged.

    Returns:
        Tuple ``(encoded, protected)`` where ``encoded`` is the one-hot
        encoded copy of ``df`` and ``protected`` is the original
        ``Income_Category`` column cast to float.  ``df`` itself is not
        modified.
    """
    one_hot_columns = [
        'Age_Group',
        'Sex',
        'Income_Category',
        'Heavy_Alcohol_Consumption',
        'BMI_Category',
        'Exercise_Participation',
        '_SmokingStatus',
        'Insurance_Type',
    ]

    working = df.copy()
    # 'Income_Category' serves as the protected attribute, so its raw codes
    # are captured before encoding replaces the column.
    income_codes = working['Income_Category'].astype(float)
    encoded = pd.get_dummies(working, columns=one_hot_columns)
    return encoded, income_codes


def train_and_evaluate():
    """Generate synthetic data, train the classifier and report its results.

    NOTE(review): ``print_bias_metrics`` and ``plot_results`` are called here
    but are not defined in the visible part of this notebook - confirm they
    exist in another cell before running on a fresh kernel.

    Returns:
        Tuple ``(model, history, scaler)``: the trained Keras model, the
        ``History`` object returned by ``fit`` and the ``StandardScaler``
        fitted on the training features.
    """
    df = create_synthetic_data()
    df_encoded, protected_attribute = preprocess_data(df)

    X = df_encoded.drop('CoronaryAngina', axis=1)
    y = df_encoded['CoronaryAngina'].astype(float)

    # The protected attribute (the original 'Income_Category') is split
    # together with X and y so its rows stay aligned with the test partition.
    X_train, X_test, y_train, y_test, protected_train, protected_test = train_test_split(
        X, y, protected_attribute, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then reuse it on the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    X_train_tensor = tf.cast(X_train_scaled, tf.float32)
    y_train_tensor = tf.cast(y_train, tf.float32)

    model = create_model(X_train.shape[1])

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=10, restore_best_weights=True
    )

    # The callback must be handed to fit(); previously it was created but
    # never used, so training always ran for the full 1000 epochs.
    history = model.fit(
        X_train_tensor,
        y_train_tensor,
        epochs=1000,
        batch_size=32,
        validation_split=0.2,
        callbacks=[early_stopping],
        verbose=0,  # Set to 0 to suppress training progress
    )

    # Threshold the sigmoid output at 0.5 to obtain 0/1 predictions.
    y_pred = (model.predict(X_test_scaled) > 0.5).astype(int)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Print bias metrics
    print_bias_metrics(y_test, y_pred, protected_test)

    # Plot results
    plot_results(history)

    return model, history, scaler


# Entry point: train and evaluate, keeping the model, history and scaler.
if __name__ == "__main__":
    model, history, scaler = train_and_evaluate()
