Data & Package Load

In [4]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

# Raw File Dataset Path (Full) : https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Full%20Dataset.csv?token=GHSAT0AAAAAACTZRV2CDOCYANJTD5X7E2OQZTSEHXQ
# Raw File Dataset Path (50-50): https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/50-50%20Balanced%20Dataset.csv?token=GHSAT0AAAAAACTZRV2DZMBCXNKTGOPFEL7YZTSEGGQ

# Load the full dataset directly from its raw GitHub URL into `df`.
# NOTE(review): the `?token=...` query parameter looks like a short-lived
# GitHub access token -- confirm it is still valid before relying on this URL.
full_dataset_url = "https://raw.githubusercontent.com/NumanESchulich/SchulichDataScience/main/Data%20Science%20I%20(MBAN%206110T)/Group%20Assignment/Full%20Dataset.csv?token=GHSAT0AAAAAACTZRV2CDOCYANJTD5X7E2OQZTSEHXQ"
df = pd.read_csv(full_dataset_url)

# Flip selected binary columns: each source column is replaced by a new
# column (prefixed "No") holding `1 - value`.
# NOTE(review): assumes the listed columns only contain 0/1 -- confirm.
columns_to_reverse = dict(
    DiffWalk='NoDiffWalk',
    HighBP='NoHighBP',
    HighChol='NoHighChol',
    HeartDiseaseorAttack='NoHeartDiseaseorAttack',
    Stroke='NoStroke',
    Smoker='NoSmoker',
    HvyAlcoholConsump='NoHvyAlcoholConsump',
)

for source_name, flipped_name in columns_to_reverse.items():
    # pop() removes the source column and hands back its values, so the
    # flipped column is appended at the end exactly as before.
    df[flipped_name] = 1 - df.pop(source_name)

# Clustering adjustments

# GenHlth: rescale codes 1..5 onto [0, 1] in steps of 0.25, with code 1
# scoring 1 and code 5 scoring 0. Codes missing from the table become NaN.
gen_hlth_scale = {
    1: 1,
    2: 0.75,
    3: 0.5,
    4: 0.25,
    5: 0,
}
scaled_gen_hlth = df['GenHlth'].map(gen_hlth_scale)
df['GenHlth'] = scaled_gen_hlth

# PhysHlth / MentHlth scaling (the same function is applied to both columns)
def phys_ment_hlth_scale(days):
    """Convert a day count in [0, 30] into a score between 0 and 1.

    Fewer days give a higher score, using five contiguous bands:
    0-6 -> 1, (6, 12] -> 0.75, (12, 18] -> 0.5, (18, 24] -> 0.25,
    (24, 30] -> 0.

    Args:
        days: Number of days; expected to lie in [0, 30].

    Returns:
        The band score for in-range values. Values outside [0, 30],
        including NaN, are returned unchanged.
    """
    # NaN fails every comparison, so it also leaves through this guard.
    if not 0 <= days <= 30:
        return days
    # 0 days belongs in the best band. It previously fell through every
    # branch and came back as 0, i.e. the same score as 25-30 days.
    if days <= 6:
        return 1
    if days <= 12:
        return 0.75
    if days <= 18:
        return 0.5
    if days <= 24:
        return 0.25
    return 0

# PhysHlth and MentHlth scaling: both day-count columns go through the
# same element-wise scoring function.
for day_count_column in ('PhysHlth', 'MentHlth'):
    df[day_count_column] = df[day_count_column].apply(phys_ment_hlth_scale)

# BMI scaling
def bmi_scale(bmi):
    """Convert a BMI value into a score between 0 and 1.

    Bands are contiguous and half-open so no value slips between them:
    below 25 -> 1, [25, 30) -> 0.5, [30, 40) -> 0.25, 40 and above -> 0.
    Values under 18.5 score 1, the same as the 18.5-25 band.

    Args:
        bmi: Body-mass-index value.

    Returns:
        The band score, or the input unchanged when it is NaN.
    """
    if bmi < 25:
        return 1
    if bmi < 30:
        return 0.5
    if bmi < 40:
        return 0.25
    if bmi >= 40:
        return 0
    # Only NaN fails all four comparisons; pass it through untouched.
    return bmi

# Score every BMI value with the band function defined above.
bmi_scores = df['BMI'].apply(bmi_scale)
df['BMI'] = bmi_scores

# Lookup tables turning the coded Income / Education / Age columns into
# scores in [0, 1]. Codes missing from a table become NaN under Series.map.
income_scale = {
    1: 0, 2: 0,
    3: 0.25, 4: 0.25,
    5: 0.5,
    6: 0.75, 7: 0.75,
    8: 1,
}
education_scale = {
    1: 0, 2: 0,
    3: 0.25,
    4: 0.75,
    5: 1, 6: 1,
}
age_scale = {
    1: 1, 2: 1, 3: 1,
    4: 0.75, 5: 0.75, 6: 0.75,
    7: 0.5, 8: 0.5, 9: 0.5,
    10: 0.25, 11: 0.25, 12: 0.25,
    13: 0,
}

# Income, Education and Age scaling
for coded_column, lookup in (
    ('Income', income_scale),
    ('Education', education_scale),
    ('Age', age_scale),
):
    df[coded_column] = df[coded_column].map(lookup)

# Feature Engineered Columns: each composite is the plain average of its
# four component columns (a NaN in any component makes the composite NaN).
composite_features = {
    'PhysicalCondition': ('GenHlth', 'NoDiffWalk', 'PhysHlth', 'PhysActivity'),
    'LackOfDisease': ('NoHighBP', 'NoHighChol', 'NoHeartDiseaseorAttack', 'NoStroke'),
    'Lifestyle': ('NoSmoker', 'NoHvyAlcoholConsump', 'Veggies', 'Fruits'),
}

for feature_name, components in composite_features.items():
    # Built-in sum adds the columns left to right, matching a + b + c + d.
    df[feature_name] = sum(df[part] for part in components) / len(components)

# Write the transformed frame to CSV, leaving out the index column.
# NOTE(review): the destination is an absolute path on one user's Windows
# desktop -- presumably it must be edited on any other machine; confirm.
adjusted_file_path = r"C:\Users\Numan\Desktop\Adjusted_Full_Dataset.csv"
df.to_csv(path_or_buf=adjusted_file_path, index=False)