In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import seaborn as sns
import scipy.stats as stats
from scipy.stats import chi2_contingency
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Notebook-wide display configuration and shared plotting constants.
pd.set_option('display.max_columns', None)

# Two-colour palette used by the count plots throughout the notebook.
# It must exist before the first plotting cell; otherwise a fresh
# "Restart & Run All" stops with a NameError on `custom_palette`.
custom_palette = ["red", "black"]

# Load the raw survey file.
da = pd.read_csv("brfss2020.csv")

In [None]:
# Preview the first rows of the raw data frame.
da.head()

## Preprocessing

In [None]:
# Columns to be retained
columns_to_keep = [
    "_RFHLTH","ADDEPEV3" ,"DECIDE", "POORHLTH", "HLTHPLN1", "PERSDOC2", 
    "MEDCOST", "CHECKUP1", "_TOTINDA" , "SLEPTIM1", "CVDINFR4", "CVDCRHD4", "CVDSTRK3", "_AGE80",
    "_SMOKER3", "DIABETE4", "_DRDXAR2","DIFFWALK","_RFBMI5", "_SEX", "_CHLDCNT", "_EDUCAG", "_INCOMG", "DRNKANY5", "_MENT14D","_PHYS14D", "_AGEG5YR"
]

# Keep only the rows where _STATE equals 53 and only the columns listed above.
# A single .loc selection followed by .copy() gives an independent frame, so the
# in-place drops and column assignments in later cells act on `washington`
# itself rather than on a temporary slice of `da`.
washington = da.loc[da['_STATE'] == 53, columns_to_keep].copy()


In [None]:
# Give the survey columns readable names.
# `rename` ignores keys that are not existing columns, so a wrong key fails silently.
# The physical-activity column in the selection is `_TOTINDA`; mapping from it is what
# creates the `Physical_Activity` column that the later cells read.
# NOTE(review): "MARIJAN1" is not among the selected columns, so that entry has no effect.
washington = washington.rename(columns={
    "_SEX": "Gender",
    "_RFBMI5": "Obesity",
    "ADDEPEV3": "Stress",
    "SLEPTIM1": "Sleep",
    "_DRDXAR2": "Arthritis",
    "_TOTINDA": "Physical_Activity",
    "DIABETE4": "Diabetes",
    "_SMOKER3": "Smoking",
    "_AGE80": "Age",
    "_MENT14D": "Mental_Health",
    "_PHYS14D": "Physical_Health",
    "MARIJAN1": "Drugs",
    "DRNKANY5": "Alcohol_Consumption",
    "CVDCRHD4": "CHD",
    "CVDSTRK3": "Stroke",
    "CVDINFR4": "Heart_Attack",
})

In [None]:
# Display the filtered and renamed frame (pandas truncates long output).
washington



In [None]:
# Pairwise correlation table of all retained columns.
washington.corr()

In [None]:
# Persist the sliced frame to disk; the row index is written as the first CSV column.
washington.to_csv("Sliced_Data.csv")

## Exploratory Data Analysis

In [None]:
# Column dtypes and non-null counts.
washington.info()

In [None]:
# Summary statistics for the numeric columns.
washington.describe()

In [None]:
# Correlation matrix of every column, drawn as an annotated heatmap.
correlation_matrix = washington.corr()

fig, ax = plt.subplots(figsize=(15, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix of Risk Factors")
plt.show()

**Gender: 1: Male & 2: Female**

In [None]:
# Frequency of each Gender code. A bare expression that is not the cell's last
# line is never displayed, so the counts are printed explicitly.
print(washington['Gender'].value_counts())
sns.countplot(x='Gender', data=washington,palette=custom_palette)

**Variable : Age**

In [None]:
# Frequency of each age value. Printed explicitly, because a bare expression
# that is not the cell's last line is never displayed.
print(washington['Age'].value_counts())
plt.figure(figsize=(12, 6))

# Create the countplot
sns.countplot(x='Age', data=washington, width=.9, palette=custom_palette)

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)

# Show the plot
plt.show()



In [None]:
# NOTE(review): the value_counts() result below is not displayed (it is not the
# last expression of the cell) and repeats the previous cell.
washington['Age'].value_counts()
plt.figure(figsize=(12, 6))


# Create the boxplot of the Age distribution
sns.boxplot(x='Age', data=washington, width=.9)

# Rotate x-axis labels for better readability
plt.xticks(rotation=90)


# Show the plot
plt.show()



In [None]:
# Mean of the Age column.
washington['Age'].mean()

In [None]:
# Blank out the Sleep codes 77 and 99 (they become NaN); all other values are kept.
# NOTE(review): presumably these are the survey's "don't know"/"refused" codes - confirm against the codebook.
sleep_is_special_code = washington['Sleep'].isin([77, 99])
washington['Sleep'] = washington['Sleep'].where(~sleep_is_special_code)

In [None]:
# Distinct Sleep values remaining after the clean-up.
washington['Sleep'].unique()

**Variable : Sleep**

In [None]:
# Sleep value frequencies, then the Sleep distribution per age group.
sleep_counts = washington['Sleep'].value_counts()
print(sleep_counts)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=washington, x='_AGEG5YR', y='Sleep', ax=ax)

In [None]:
# Sleep value frequencies, then the Sleep distribution per Physical_Activity value.
sleep_counts = washington['Sleep'].value_counts()
print(sleep_counts)

fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=washington, x='Physical_Activity', y='Sleep', ax=ax)

In [None]:
# Row counts per age group, one bar per Diabetes value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Diabetes',
    palette=custom_palette,
)

**Variable : Heart_Attack**

In [None]:
# Frequency of each Heart_Attack code. Printed explicitly, because a bare
# expression that is not the cell's last line is never displayed.
print(washington['Heart_Attack'].value_counts())
sns.countplot(x='Heart_Attack', data=washington, palette=custom_palette)

In [None]:
# Heart_Attack answers split by Gender.
# The discarded value_counts() statement was removed: it produced no output here
# and the same counts are already computed in the preceding cell.
sns.countplot(x='Heart_Attack', hue='Gender', data=washington, palette=custom_palette)

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim; the value_counts()
# result is not displayed because it is not the last expression.
washington['Heart_Attack'].value_counts()
sns.countplot(x='Heart_Attack', hue='Gender', data=washington, palette=custom_palette)

In [None]:
# Row counts per age group, one bar per Heart_Attack value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Heart_Attack',
    palette=custom_palette,
)

In [None]:
# Frequency table and bar chart of the Stress column.
stress_counts = washington['Stress'].value_counts()
print(stress_counts)
sns.countplot(data=washington, x='Stress', palette=custom_palette)

In [None]:
# Remove, in place, every row whose Stress value is 7 or 9.
stress_rows_to_drop = washington.loc[washington['Stress'].isin([7, 9])].index
washington.drop(index=stress_rows_to_drop, inplace=True)


In [None]:
# Stress frequencies after the clean-up, then Stress split by Gender.
stress_counts = washington['Stress'].value_counts()
print(stress_counts)
sns.countplot(data=washington, x='Stress', hue='Gender', palette=custom_palette)

In [None]:
# Row counts per age group, one bar per Stress value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Stress',
    palette=custom_palette,
)

In [None]:
# Frequency table and bar chart of the Smoking column.
smoking_counts = washington['Smoking'].value_counts()
print(smoking_counts)
sns.countplot(data=washington, x='Smoking')

In [None]:
# NOTE(review): this cell repeats the previous cell verbatim.
print(washington['Smoking'].value_counts())
sns.countplot(x='Smoking', data=washington)

In [None]:
# Remove, in place, every row whose Smoking value is 9.
smoking_rows_to_drop = washington.loc[washington['Smoking'] == 9].index
washington.drop(index=smoking_rows_to_drop, inplace=True)


In [None]:
# Row counts per age group, one bar per Smoking value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Smoking',
    palette=custom_palette,
)

In [None]:
# Share of each Gender value, as a percentage of all rows in the frame.
gender_counts = washington['Gender'].value_counts()
gender_percentages = gender_counts.div(len(washington)).mul(100)

# Display the percentages
print("Gender Percentages:")
print(gender_percentages)


In [None]:
# Row counts per Gender, one bar per Smoking value.
sns.countplot(
    data=washington,
    x='Gender',
    hue='Smoking',
    palette=custom_palette,
)

In [None]:
# Gender column grouped by Smoking value; reused for both the counts and the group sizes.
gender_by_smoking = washington.groupby('Smoking')['Gender']
grouped_gender_smoking_counts = gender_by_smoking.value_counts()

# Percentage of each Gender value within its Smoking group
# (the division aligns on the shared 'Smoking' index level).
grouped_gender_smoking_percentages = (grouped_gender_smoking_counts / gender_by_smoking.count()) * 100

print("Gender Percentages within each Smoking category:")
print(grouped_gender_smoking_percentages)


In [None]:
# Row counts per Obesity value, one bar per Smoking value.
sns.countplot(
    data=washington,
    x='Obesity',
    hue='Smoking',
    palette=custom_palette,
)

In [None]:
# Obesity column grouped by Smoking value; reused for both the counts and the group sizes.
obesity_by_smoking = washington.groupby('Smoking')['Obesity']
grouped_obesity_smoking_counts = obesity_by_smoking.value_counts()

# Percentage of each Obesity value within its Smoking group
# (the division aligns on the shared 'Smoking' index level).
grouped_obesity_smoking_percentages = (grouped_obesity_smoking_counts / obesity_by_smoking.count()) * 100

print("Obesity Percentages within each Smoking category:")
print(grouped_obesity_smoking_percentages)


# Frequency of each CHD code. Printed explicitly, because a bare expression
# that is not the cell's last line is never displayed.
print(washington['CHD'].value_counts())
sns.countplot(x='CHD', data=washington)

In [None]:
# Frequency table and bar chart of the Obesity column.
obesity_counts = washington['Obesity'].value_counts()
print(obesity_counts)

sns.countplot(data=washington, x='Obesity', palette=custom_palette)

In [None]:
# Row counts per age group, one bar per Obesity value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Obesity',
    palette=custom_palette,
)

In [None]:
# Obesity frequencies, then Obesity split by Gender.
obesity_counts = washington['Obesity'].value_counts()
print(obesity_counts)

sns.countplot(data=washington, x='Obesity', hue='Gender', palette=custom_palette)

In [None]:
# Remove, in place, every row whose Obesity value is 9.
obesity_rows_to_drop = washington.loc[washington['Obesity'] == 9].index
washington.drop(index=obesity_rows_to_drop, inplace=True)


In [None]:
# Frequency table and bar chart of the Physical_Activity column.
activity_counts = washington['Physical_Activity'].value_counts()
print(activity_counts)
sns.countplot(data=washington, x='Physical_Activity', palette=custom_palette)

In [None]:
# Gender column grouped by Physical_Activity value; reused for the counts and the group sizes.
gender_by_activity = washington.groupby('Physical_Activity')['Gender']
grouped_gender_phy_counts = gender_by_activity.value_counts()

# Percentage of each Gender value within its Physical_Activity group
# (the division aligns on the shared 'Physical_Activity' index level).
grouped_gender_phy_percentages = (grouped_gender_phy_counts / gender_by_activity.count()) * 100

print("Gender Percentages within each Physical Activity category:")
print(grouped_gender_phy_percentages)


In [None]:
# Row counts per age group, one bar per Physical_Activity value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Physical_Activity',
    palette=custom_palette,
)

In [None]:
# Remove, in place, every row whose Physical_Activity value is 9.
activity_rows_to_drop = washington.loc[washington['Physical_Activity'] == 9].index
washington.drop(index=activity_rows_to_drop, inplace=True)

**Variable : Arthritis**

In [None]:
# Frequency table and bar chart of the Arthritis column.
arthritis_counts = washington['Arthritis'].value_counts()
print(arthritis_counts)
sns.countplot(data=washington, x='Arthritis', palette=custom_palette)

In [None]:
# Frequency table and bar chart of the age-group column.
age_group_counts = washington['_AGEG5YR'].value_counts()
print(age_group_counts)
sns.countplot(data=washington, x='_AGEG5YR')

In [None]:
# Frequency table and bar chart of the Diabetes column.
diabetes_counts = washington['Diabetes'].value_counts()
print(diabetes_counts)
sns.countplot(data=washington, x='Diabetes', palette=custom_palette)

In [None]:
# Row counts per age group, one bar per Diabetes value.
# NOTE(review): the same plot call appears in an earlier cell, before several row drops.
sns.countplot(x='_AGEG5YR', hue='Diabetes', data=washington, palette=custom_palette)

In [None]:
# Row counts per Obesity value, one bar per Diabetes value.
sns.countplot(
    data=washington,
    x='Obesity',
    hue='Diabetes',
    palette=custom_palette,
)

In [None]:
# Row counts per Physical_Activity value, one bar per Diabetes value.
sns.countplot(
    data=washington,
    x='Physical_Activity',
    hue='Diabetes',
    palette=custom_palette,
)

In [None]:

# Row counts per Obesity value, one bar per Smoking value.
# NOTE(review): the same plot call appears in an earlier cell, before several row drops.
sns.countplot(x='Obesity', hue='Smoking', data=washington, palette=custom_palette)

In [None]:
# Remove, in place, every row whose Alcohol_Consumption value is 7 or 9.
alcohol_rows_to_drop = washington.loc[washington['Alcohol_Consumption'].isin([7, 9])].index
washington.drop(index=alcohol_rows_to_drop, inplace=True)


In [None]:
# Bar chart of the Alcohol_Consumption values.
sns.countplot(
    data=washington,
    x='Alcohol_Consumption',
    palette=custom_palette,
)

In [None]:
# Row counts per Alcohol_Consumption value, one bar per Smoking value.
sns.countplot(
    data=washington,
    x='Alcohol_Consumption',
    hue='Smoking',
    palette=custom_palette,
)

In [None]:
# Distinct values currently present in the Diabetes column.
washington['Diabetes'].unique()


In [None]:
# Remove, in place, every row whose Diabetes value is 7 or 9.
diabetes_rows_to_drop = washington.loc[washington['Diabetes'].isin([7, 9])].index
washington.drop(index=diabetes_rows_to_drop, inplace=True)


In [None]:
# Collapse the four Diabetes codes into a Yes/No label: 1 and 2 -> "Yes", 3 and 4 -> "No".
diabetes_labels = {1: "Yes", 2: "Yes", 3: "No", 4: "No"}
washington["Diabetes"] = washington["Diabetes"].replace(diabetes_labels)

In [None]:
# Correlations
#Physical_Activity and Obesity

# Hypothesis


In [None]:
# Column names after renaming.
washington.columns

In [None]:
# Frequency of each _RFHLTH value.
rfhlth_counts = washington["_RFHLTH"].value_counts()
print(rfhlth_counts)

In [None]:
# Remove, in place, every row whose _RFHLTH value is 9.
rfhlth_rows_to_drop = washington.loc[washington['_RFHLTH'] == 9].index
washington.drop(index=rfhlth_rows_to_drop, inplace=True)

**Hypothesis 1: Relationship Between Stress and General Health**

Null Hypothesis ($H_0$): There is no correlation between stress levels and general health.

Alternative Hypothesis ($H_1$): Higher stress levels are associated with poorer general health.


In [None]:
# Selecting the relevant columns
Stress_fact = washington['Stress']
Health_stat = washington['_RFHLTH']

# Creating a contingency table
contingency_table = pd.crosstab(Stress_fact, Health_stat)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table)

chi2, p, dof, expected

In [None]:
# Observed counts used by the Stress / general-health chi-square test.
contingency_table


**Hypothesis 2: Impact of Physical Inactivity on Obesity**
    
Null Hypothesis ($H_0$): Physical activity does not significantly affect obesity status.

Alternative Hypothesis ($H_1$): Individuals who are physically inactive are more likely to be obese.

Test Type: Chi-Square Test for Independence

In [None]:
# Selecting the relevant columns
Physical_Stat = washington['Physical_InActivity']
Obesity_Stat = washington['Obesity']

# Creating a contingency table
contingency_table1 = pd.crosstab(Physical_Stat, Obesity_Stat)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table1)

chi2, p, dof, expected

In [None]:
# Observed counts used by the physical-activity / obesity chi-square test.
contingency_table1

In [None]:
# Obesity frequencies, then Obesity split by physical activity.
# The hue column is 'Physical_Activity', the name used by the rest of the
# notebook; the frame has no 'Physical_InActivity' column.
print(washington['Obesity'].value_counts())
sns.countplot(x='Obesity',hue='Physical_Activity', data=washington)

In [None]:
# Create a stacked bar graph
sns.countplot(data=washington, x='Physical_Activity', hue='Obesity', palette='Set1')

# Add labels and a legend
plt.xlabel('Physical_Activity')
plt.ylabel('Count')
plt.title('Bar Graph of Obesity vs. Physical activity')
plt.legend(title='Obesity', loc='upper right')

# Show the plot
plt.show()


**Hypothesis 3: Smoking and Heart Attack Incidence**

Null Hypothesis ($H_0$): Smoking status does not influence the incidence of heart attacks.

Alternative Hypothesis ($H_1$): Smokers have a higher incidence of heart attacks compared to non-smokers.

Test Type: Chi-Square Test for Independence or Logistic Regression.

In [None]:
# Selecting the relevant columns
Smoking_Stat = washington['Smoking']
Heart_Attack = washington['Heart_Attack']

# Creating a contingency table
contingency_table2 = pd.crosstab(Smoking_Stat, Heart_Attack)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table2)

chi2, p, dof, expected

In [None]:
# Observed counts used by the smoking / heart-attack chi-square test.
contingency_table2

In [None]:
# seaborn (sns) and pyplot (plt) are already imported in the notebook's first
# cell, so they are not imported again here.

# Define a custom color palette with red and black
custom_palette = ["red", "black"]

# Row counts per Smoking value, one bar per Heart_Attack value.
sns.countplot(x='Smoking', hue='Heart_Attack', data=washington, palette=custom_palette)

# Show the plot
plt.show()



**Hypothesis 4: Influence of Age on Arthritis Prevalence**

Null Hypothesis ($H_0$): Age does not have a significant effect on the prevalence of arthritis.

Alternative Hypothesis ($H_1$): Older individuals are more likely to suffer from arthritis.


In [None]:
# Selecting the relevant columns
Arthritis_Stat = washington['Arthritis']
Age_group = washington['_AGEG5YR']

# Creating a contingency table
contingency_table5 = pd.crosstab(Arthritis_Stat, Age_group)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table5)

chi2, p, dof, expected

In [None]:
# Red/black palette for the two hue levels.
custom_palette = ["red", "black"]

# Optional: switch seaborn to the white-grid style before drawing.
sns.set(style="whitegrid")

# Row counts per age group, one bar per Arthritis value.
sns.countplot(
    data=washington,
    x='_AGEG5YR',
    hue='Arthritis',
    palette=custom_palette,
)

plt.show()


In [None]:
# Draw a small demo line plot and save it to disk as 'myfig'.
# The figure is saved with the backend already in use. Calling
# matplotlib.use('Agg') here would switch the whole session to a non-interactive
# backend, and the plots in the following cells would no longer be displayed.
# pyplot is already imported as `plt` in the notebook's first cell.
fig, ax = plt.subplots()
ax.plot([1, 2, 3])
fig.savefig('myfig')

In [None]:
# Arthritis frequencies, then Arthritis split by Gender.
arthritis_counts = washington['Arthritis'].value_counts()
print(arthritis_counts)
sns.countplot(data=washington, x='Arthritis', hue='Gender', palette=custom_palette)

**Hypothesis 5: Obesity has no impact on Diabetes**
    
Null Hypothesis ($H_0$): Obesity does not have any effect on Diabetes.
    
Alternative Hypothesis ($H_1$): Obesity is associated with Diabetes (the chi-square test below detects association, not causation).
    


In [None]:
# Selecting the relevant columns
Dia_Stat = washington['Diabetes']
Obs_group = washington['Obesity']

# Creating a contingency table
contingency_table5 = pd.crosstab(Dia_Stat, Obs_group)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table5)

chi2, p, dof, expected

**Hypothesis 6: Smoking has no impact on Obesity**
    
Null Hypothesis ($H_0$): Smoking has no impact on Obesity.

Alternative Hypothesis ($H_1$): Smoking impacts Obesity.
    


In [None]:
# Selecting the relevant columns
smk_Stat = washington['Smoking']
Obs_group = washington['Obesity']

# Creating a contingency table
contingency_table6= pd.crosstab(Obs_group,smk_Stat)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table6)

chi2, p, dof, expected

**Hypothesis 7: Smoking has no impact on Diabetes**
    
Null Hypothesis ($H_0$): Smoking does not have any effect on Diabetes.
    
Alternative Hypothesis ($H_1$): Smoking is associated with Diabetes (the chi-square test below detects association, not causation).
    


In [None]:
# Selecting the relevant columns
smk_Stat = washington['Smoking']
Dia_group = washington['Diabetes']

# Creating a contingency table
contingency_table7= pd.crosstab(Dia_group,smk_Stat)

# Applying Chi-Square test
chi2, p, dof, expected = chi2_contingency(contingency_table7)

chi2, p, dof, expected

# Non-Modifiable Risk Factors:

**Age:** The risk of heart attack increases with age. Men over 45 and women over 55 are at higher risk.

**Gender:** Men are generally at higher risk for heart attacks than premenopausal women. However, the risk for women increases after menopause, and heart disease is the leading cause of death for both genders.

**Family History:** If you have a family history of heart disease or heart attacks, your risk is elevated.

**Genetics:** Certain genetic factors can increase susceptibility to heart disease and heart attacks.

# Modifiable Risk Factors:

**High Blood Pressure (Hypertension):** Elevated blood pressure puts extra strain on the heart and arteries, increasing the risk of heart disease.

**High Cholesterol:** High levels of LDL (low-density lipoprotein) cholesterol, often referred to as "bad" cholesterol, can lead to the buildup of plaque in the arteries, increasing the risk of blockages.

**Smoking:** Smoking is a major risk factor for heart disease. The chemicals in tobacco can damage the heart and blood vessels.

**Diabetes:** Uncontrolled diabetes can increase the risk of heart disease due to high blood sugar levels damaging blood vessels and nerves.

**Obesity:** Excess body weight, particularly if concentrated around the abdomen, can increase the risk of heart disease.

**Physical Inactivity:** A sedentary lifestyle can contribute to obesity and increase the risk of heart disease.

**Unhealthy Diet:** A diet high in saturated and trans fats, cholesterol, sodium, and low in fruits, vegetables, and whole grains can raise the risk of heart disease.

**Stress:** Chronic stress can contribute to heart disease, possibly through its impact on behaviors like overeating, smoking, or inactivity.

**Excessive Alcohol Consumption:** Drinking too much alcohol can raise blood pressure and contribute to heart disease.

**Sleep Apnea:** Sleep apnea, a condition characterized by interrupted breathing during sleep, can increase the risk of heart disease.

**Drug Abuse:** Illicit drug use, such as cocaine or amphetamines, can trigger heart attacks.

**Inflammation:** Conditions associated with chronic inflammation, such as rheumatoid arthritis or lupus, can increase heart attack risk.

In [None]:
# Frequency of each Heart_Attack code before recoding.
washington['Heart_Attack'].value_counts()

In [None]:
# Remove, in place, every row whose Heart_Attack value is 7.
heart_attack_code7_rows = washington.loc[washington['Heart_Attack'] == 7].index
washington.drop(index=heart_attack_code7_rows, inplace=True)

In [None]:
# Remove, in place, every row whose Heart_Attack value is 9.
heart_attack_code9_rows = washington.loc[washington['Heart_Attack'] == 9].index
washington.drop(index=heart_attack_code9_rows, inplace=True)

In [None]:
# Recode Heart_Attack value 2 to 0, giving the 0/1 outcome the binomial models fit on.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
# NOTE(review): despite its name, `chg_stroke` recodes the Heart_Attack column.
chg_stroke = {2: 0}
washington['Heart_Attack'] = washington['Heart_Attack'].replace(to_replace=chg_stroke)

In [None]:
# Replace the Gender codes with labels: 1 -> "Male", 2 -> "Female".
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
chg_gender = {1: "Male", 2: "Female"}
washington['Gender'] = washington['Gender'].replace(to_replace=chg_gender)

In [None]:
# Frequency of each Obesity value before relabelling.
washington['Obesity'].value_counts()

In [None]:
# Replace the Obesity codes with labels: 2 -> "OverWeight", 1 -> "Normal".
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
chg_bmi = {2: "OverWeight", 1: "Normal"}
washington['Obesity'] = washington['Obesity'].replace(to_replace=chg_bmi)

In [None]:
# Map textual smoking labels to numeric levels.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
# NOTE(review): Smoking is compared with the number 9 in an earlier cell, so it
# appears to hold numeric codes at this point; if so, none of these string keys
# match and this step changes nothing - confirm.
chg_smoking = {"Occasional_Smoker":2,"Chain_Smoker":3, "Former_Smoker":1, "Non_Smoker":0}
washington['Smoking'] = washington['Smoking'].replace(to_replace=chg_smoking)

In [None]:
# Distinct values currently present in the Physical_Activity column.
washington['Physical_Activity'].unique()

In [None]:

# Replace the Physical_Activity codes with labels: 1 -> "Yes", 2 -> "No".
activity_labels = {1: "Yes", 2: "No"}
washington["Physical_Activity"] = washington["Physical_Activity"].replace(activity_labels)

In [None]:
# Frequency of each Stress value before recoding.
washington["Stress"].value_counts()

In [None]:
# Recode Stress value 2 to 0.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
chg_stress = {2:0}
washington['Stress'] = washington['Stress'].replace(to_replace=chg_stress)

In [None]:
# Frequency of each Obesity value after relabelling.
washington["Obesity"].value_counts()

In [None]:
# Frequency of each Arthritis value before recoding.
washington['Arthritis'].value_counts()

In [None]:
# Recode Arthritis value 2 to 0.
# The result is assigned back to the column: `df[col].replace(..., inplace=True)`
# works on an intermediate Series and leaves the frame unchanged under pandas
# Copy-on-Write.
chg_art = {2:0}
washington['Arthritis'] = washington['Arthritis'].replace(to_replace=chg_art)

In [None]:
# Frequency of each Diabetes label.
washington['Diabetes'].value_counts()

In [None]:
# Distinct values currently present in the Smoking column.
washington['Smoking'].unique()


In [None]:
# Collapse smoking status into a Yes/No label for current smokers.
# Smoking still holds the raw survey codes here: the earlier label-to-number
# step matches only string labels, which this column never contains. Per the
# BRFSS codebook for _SMOKER3 the codes are 1 = smokes every day,
# 2 = smokes some days, 3 = former smoker, 4 = never smoked.
# The previous mapping ({1: "No", 0: "No", 3: "Yes", 2: "Yes"}) assumed a 0-3
# recode and therefore labelled daily smokers "No", former smokers "Yes", and
# left never-smokers as the number 4.
washington["Smoking"] = washington['Smoking'].replace({1: "Yes", 2: "Yes", 3: "No", 4: "No"})

In [None]:
# Map textual smoking labels to Yes/No; values that are not one of these labels are left as they are.
smoking_label_to_flag = {
    'Former_Smoker': "No",
    'Non_Smoker': "No",
    'Chain_Smoker': "Yes",
    'Occasional_Smoker': "Yes",
}
washington["Smoking"] = washington["Smoking"].replace(smoking_label_to_flag)

In [None]:
# Binomial GLM of Heart_Attack on Age and Gender.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Stress.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Physical_Activity.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress + Physical_Activity",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Obesity.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress + Physical_Activity + Obesity",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Smoking.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress + Physical_Activity + Obesity + Smoking",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Alcohol_Consumption.
# NOTE(review): no cell relabels Alcohol_Consumption, so it enters the model with
# whatever numeric codes remain after the 7/9 rows were dropped - check the
# coding when reading the sign of its coefficient.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress + Physical_Activity + Obesity + Smoking + Alcohol_Consumption",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Binomial GLM: previous predictors plus Diabetes.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Stress + Physical_Activity + Diabetes + Obesity + Smoking + Alcohol_Consumption",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

In [None]:
# Full binomial GLM: all predictors, including Arthritis.
model = sm.GLM.from_formula(
    "Heart_Attack ~ Age + Gender + Physical_Activity + Obesity + Arthritis + Smoking + Alcohol_Consumption + Diabetes + Stress",
    data=washington,
    family=sm.families.Binomial(),
)
result = model.fit()
result.summary()

 **Gender:** the male will have 0.83 units greater log odds for heart attack than the female, while holding the other variables in the model fixed. **OR** the males have 2.3 times higher odds of heart attack than females.
 
 **Physical_Activity:** The physically active people will have 0.4847 units lower log odds for heart attack than the physically inactive ones **OR** the physically active people have 0.61 times the odds of heart attack (about 39% lower odds).
 
 **Obesity:** the overweight people have 0.1953 units greater log odds for heart attack than normal people **OR** the overweight people have 1.21 times higher odds of heart attack.
 
 **Diabetes:** The diabetic people have 0.7535 greater log odds for heart attack **OR** the diabetic people have 2.12 times higher odds of heart attack.
 
 **Age:** With every one-year increase in age, the log odds for heart attack increase by 0.0695 units.
 
 **Arthritis:** People suffering from arthritis have 0.2216 units greater log odds for heart attack than people without arthritis.
 
 **Smoking:** The Smokers have 0.3344 units greater log odds for heart attack than the non smokers.
 
 **Alcohol_Consumption:** The Drinkers have 0.0818 higher log odds than non-drinkers.
 
 **Stress:** People with stress have 0.4442 units greater log odds for heart attack than those without stress.