In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd 
import seaborn as sns 
import statistics
import numpy as np
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

In [3]:
model_build_df = pd.read_csv('mydata/MMSA2021_train.csv')

In [4]:
#pd.set_option('display.max_columns', None)

In [5]:
pd.set_option('display.max_info_columns', 1000)
pd.set_option('display.max_info_rows', 1000000)

In [6]:
import io
buffer = io.StringIO()
model_build_df.info(buf=buffer)
info_str = buffer.getvalue()
#print(info_str)

In [7]:
model_build_df.sample(5)

Unnamed: 0,DISPCODE,STATERE1,CELPHON1,LADULT1,COLGSEX,LANDSEX,RESPSLCT,SAFETIME,CADULT1,CELLSEX,...,_FRTLT1A,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_MMSA,_MMSAWT,SEQNO,MMSANAME
115667,1100.0,,,,,,,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,43580.0,35.130776,2021000000.0,"b'Sioux City, IA-NE-SD, Metropolitan Statistic..."
44538,1100.0,,,,,,,1.0,1.0,1.0,...,2.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79,39100.0,70.735306,2021001000.0,"b'Poughkeepsie-Newburgh-Middletown, NY, Metrop..."
28680,1200.0,,,,,,,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,35380.0,202.603027,2021000000.0,"b'New Orleans-Metairie, LA, Metropolitan Stati..."
104406,1100.0,,,,,,,1.0,1.0,1.0,...,2.0,1.0,1.0,1.0,5.397605e-79,5.397605e-79,42644.0,281.508836,2021003000.0,"b'Seattle-Bellevue-Kent, WA, Metropolitan Divi..."
130742,1100.0,,,,,,,1.0,1.0,1.0,...,2.0,2.0,1.0,1.0,5.397605e-79,5.397605e-79,22020.0,250.942593,2021001000.0,"b'Fargo, ND-MN, Metropolitan Statistical Area'"


In [8]:
len(model_build_df)

162710

### Data Dictionary

In [9]:
from Extraction_module import description_dict

In [10]:
# Any column with fewer than 15 distinct non-null values is treated as nominal
# and converted to the pandas 'category' dtype.
low_cardinality_cols = [
    name for name in model_build_df.columns
    if model_build_df[name].nunique() < 15
]
for col in low_cardinality_cols:
    model_build_df[col] = model_build_df[col].astype('category')


In [11]:
##Extract state from MMSANAME
def get_state(col):
    """
    Extract the state portion from a single MMSANAME value.

    The value is split on commas and the second field is returned with
    surrounding whitespace removed, e.g.
    "b'Fargo, ND-MN, Metropolitan Statistical Area'" -> "ND-MN".

    Parameters:
        col: One MMSANAME value (expected to be a string).

    Returns:
        str or None: The trimmed second comma-separated field, or None when
        the value is not a string (e.g. NaN) or contains no comma.
    """
    # Missing values arrive as float NaN, which has no .split()
    if not isinstance(col, str):
        return None
    parts = col.split(',')
    if len(parts) < 2:
        return None
    # Strip the blank that follows the comma so values read 'ND-MN', not ' ND-MN'
    return parts[1].strip()

model_build_df['STATE'] = model_build_df['MMSANAME'].apply(get_state)

In [12]:
model_build_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162710 entries, 0 to 162709
Data columns (total 153 columns):
 #    Column    Non-Null Count   Dtype   
---   ------    --------------   -----   
 0    DISPCODE  162710 non-null  category
 1    STATERE1  39668 non-null   category
 2    CELPHON1  39668 non-null   category
 3    LADULT1   39668 non-null   category
 4    COLGSEX   14 non-null      category
 5    LANDSEX   16664 non-null   category
 6    RESPSLCT  22031 non-null   category
 7    SAFETIME  123042 non-null  category
 8    CADULT1   123042 non-null  category
 9    CELLSEX   123041 non-null  category
 10   HHADULT   123041 non-null  float64 
 11   SEXVAR    162710 non-null  category
 12   GENHLTH   162709 non-null  category
 13   PHYSHLTH  162708 non-null  float64 
 14   MENTHLTH  162710 non-null  float64 
 15   POORHLTH  87874 non-null   float64 
 16   PRIMINSR  162709 non-null  category
 17   PERSDOC3  162710 non-null  category
 18   MEDCOST1  162709 non-null  category
 19   

### Handling Duplicate rows

In [13]:
duplicate_rows = model_build_df[model_build_df.duplicated()]
duplicate_rows

Unnamed: 0,DISPCODE,STATERE1,CELPHON1,LADULT1,COLGSEX,LANDSEX,RESPSLCT,SAFETIME,CADULT1,CELLSEX,...,_VEGLT1A,_FRT16A,_VEG23A,_FRUITE1,_VEGETE1,_MMSA,_MMSAWT,SEQNO,MMSANAME,STATE


### Generate EDA Report with Original Dataset

In [None]:
#profile = ProfileReport(model_build_df)
#profile.to_file(output_file="EDA_Report.html")

### Handling Missing Data

In [14]:
#Step1 Remove columns that have more than 40% missing values (the threshold passed in the call below)

def columns_with_high_null_percentage(df, threshold=0.9):
    """
    List the columns of a DataFrame whose share of missing values is above a cut-off.

    Parameters:
        df (pd.DataFrame): Frame to inspect.
        threshold (float): Fraction of nulls a column must strictly exceed to be
            reported (default 0.9, i.e. 90%).

    Returns:
        List[str]: Matching column names, ordered from the highest null fraction
        to the lowest.
    """
    row_count = len(df)
    null_fraction = df.isnull().sum() / row_count
    # Rank first, then filter, so the result keeps the descending order
    ranked = null_fraction.sort_values(ascending=False)
    exceeds = ranked > threshold
    return ranked[exceeds].index.tolist()

high_null_columns = columns_with_high_null_percentage(model_build_df, threshold=0.4)
print(high_null_columns)


['COLGSEX', 'NUMPHON3', 'LANDSEX', 'CAGEG', 'DIABAGE3', 'RESPSLCT', 'ASTHNOW', 'PREGNANT', 'CELPHON1', 'LADULT1', 'NUMHHOL3', 'STATERE1', 'ARTHEXER', 'LMTJOIN3', 'ARTHEDU', 'ARTHDIS2', 'JOINPAI2', 'HIVTSTD3', '_FLSHOT7', '_PNEUMO3', 'SMOKDAY2', 'BPMEDS', 'IMFVPLA2', 'FLSHTMY3', 'MAXDRNKS', 'DRNK3GE5', 'AVEDRNK3', 'POORHLTH']


In [15]:
model_build_df_1 = model_build_df.drop(columns=high_null_columns)
model_build_df_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162710 entries, 0 to 162709
Data columns (total 125 columns):
 #    Column    Non-Null Count   Dtype   
---   ------    --------------   -----   
 0    DISPCODE  162710 non-null  category
 1    SAFETIME  123042 non-null  category
 2    CADULT1   123042 non-null  category
 3    CELLSEX   123041 non-null  category
 4    HHADULT   123041 non-null  float64 
 5    SEXVAR    162710 non-null  category
 6    GENHLTH   162709 non-null  category
 7    PHYSHLTH  162708 non-null  float64 
 8    MENTHLTH  162710 non-null  float64 
 9    PRIMINSR  162709 non-null  category
 10   PERSDOC3  162710 non-null  category
 11   MEDCOST1  162709 non-null  category
 12   CHECKUP1  162710 non-null  category
 13   EXERANY2  162710 non-null  category
 14   BPHIGH6   162710 non-null  category
 15   CHOLCHK3  162710 non-null  category
 16   TOLDHI3   141413 non-null  category
 17   CHOLMED3  141218 non-null  category
 18   CVDINFR4  162710 non-null  category
 19   

In [18]:
#Step2 Identify numerical (float) columns with low variance using feature selection
# This cell only builds df_high_variance for inspection; model_build_df_1 itself is not modified here.

from sklearn.feature_selection import VarianceThreshold

# Select only the numerical columns of type float
# (columns already converted to 'category' are therefore not examined)
numerical_df = model_build_df_1.select_dtypes(include='float')

# Specify the threshold for variance (e.g., 0.01, 0.05, etc.)
threshold = 0.01  # Adjust this threshold as needed

# Initialize the VarianceThreshold object
variance_selector = VarianceThreshold(threshold)

# Fit the selector to the numerical data
variance_selector.fit(numerical_df)

# Get the positional indices of the columns the selector keeps
high_variance_indices = variance_selector.get_support(indices=True)

# Filter the DataFrame to keep only the high-variance columns
df_high_variance = numerical_df.iloc[:, high_variance_indices]

df_high_variance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162710 entries, 0 to 162709
Data columns (total 29 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   HHADULT   123041 non-null  float64
 1   PHYSHLTH  162708 non-null  float64
 2   MENTHLTH  162710 non-null  float64
 3   CHILDREN  160191 non-null  float64
 4   WEIGHT2   157932 non-null  float64
 5   HEIGHT3   157458 non-null  float64
 6   ALCDAY5   152805 non-null  float64
 7   FRUIT2    148953 non-null  float64
 8   FRUITJU2  148303 non-null  float64
 9   FVGREEN1  147829 non-null  float64
 10  FRENCHF1  147385 non-null  float64
 11  POTATOE1  146937 non-null  float64
 12  VEGETAB2  146449 non-null  float64
 13  _STSTR    162710 non-null  float64
 14  _AGE80    162710 non-null  float64
 15  WTKG3     146513 non-null  float64
 16  _BMI5     143962 non-null  float64
 17  _DRNKWK1  162710 non-null  float64
 18  FTJUDA2_  145120 non-null  float64
 19  FRUTDA2_  145552 non-null  float64
 20  GREN

In [21]:
# Float columns that the variance filter did not keep, in their original order
was_dropped = ~numerical_df.columns.isin(df_high_variance.columns)
low_variance_columns = numerical_df.columns[was_dropped].tolist()
low_variance_columns

[]

In [None]:
#Step3 Remove Highly correlated Columns 

In [None]:
#profile = ProfileReport(model_build_df)
#profile.to_file(output_file="EDA_Report_No_NA.html")

How do dietary habits and nutrition shape an individual's health outcome?
- Studies have shown that high fiber intake is associated with lower mortality,
as seen in this comparative study: https://pubmed.ncbi.nlm.nih.gov/22648726/
- We would like to see how a diet high in vegetables and fruit is associated with a person's perceived health.
- The darker the vegetable, the higher its fiber content.

In [None]:
#Columns of interest would be nutrition and diet related columns:
# 'fruit' and 'vegetable' are substrings of their plurals, so the plural forms need no separate check.
diet_keywords = ('fruit', 'vegetable')
nutrition_cols = [
    column
    for column, description in description_dict.items()
    if any(word in str(description).lower() for word in diet_keywords)
]

In [None]:
nutrition_cols

In [None]:
#drop alcohol related columns
# NOTE(review): this assumes the first two matched columns are the alcohol-related ones — confirm
# against the nutrition_cols listing. The slice rebinds nutrition_cols, so running this
# cell a second time removes two more columns.
nutrition_cols = nutrition_cols[2:]
# Print the data-dictionary entry of every remaining column
for col in nutrition_cols:
    print(description_dict[col])

In [None]:
pd.set_option('display.max_columns', None)

In [None]:
#make a dataframe by combining nutrition columns with health status, health days columns
health_status_cols = ['GENHLTH','PHYSHLTH','POORHLTH','_RFHLTH','_PHYS14D','_MENT14D']
# Append only the columns not already listed, so re-running this cell does not duplicate them
nutrition_cols = nutrition_cols + [c for c in health_status_cols if c not in nutrition_cols]
# .copy() yields an independent frame, so columns assigned to df_nutrition later
# are not written into a slice of model_build_df
df_nutrition = model_build_df[nutrition_cols].copy()

In [None]:
df_nutrition.head()

In [None]:
plt.figure(figsize=(15,10))
ax = sns.heatmap(df_nutrition.corr(),cmap='viridis',annot=True, annot_kws={"size": 8})

### Looking at the data, we could use the calculated total-per-day variables for fruits and vegetables to see their relation to general health

In [None]:
#_FRUTSU1=(FTJUDA2_/100) + (FRUTDA2_/100); 
#_FRUTSU1=round((_FRUTSU1*100),1); 

#WE CAN MAKE A TOTAL FRUITS PER DAY COLUMN 
df_nutrition['Total_fruits_daily'] = df_nutrition['_FRUTSU1'].apply(lambda x: round(x/100,1) )
df_nutrition.head()

In [None]:
df_nutrition['Total_fruits_daily'].value_counts().head(20)

In [None]:
#_VEGESU1=(GRENDA1_/100) + (FRNCHDA_/100) + (POTADA1_/100) + 
#(VEGEDA2_/100); 
#_VEGESU1=round((_VEGESU1*100),1); 

#WE CAN MAKE A TOTAL VEGETABLES PER DAY COLUMN 
df_nutrition['Total_vegetables_daily'] = df_nutrition['_VEGESU1'].apply(lambda x: round(x/100,1))
df_nutrition.head()

In [None]:
df_nutrition['Total_vegetables_daily'].value_counts().head(20)

In [None]:
df_nutrition.describe([0.25,0.5,0.75,0.99]).T

In [None]:
#lets filter data to remove outliers
# Upper cut-offs for daily intake; rows above either limit are dropped.
# NOTE(review): presumably read off the percentile summary of df_nutrition — confirm.
MAX_FRUITS_DAILY = 9.0
MAX_VEGETABLES_DAILY = 26.6396

# Rows where either total is NaN compare as False and are dropped as well
within_limits = ((df_nutrition['Total_fruits_daily'] <= MAX_FRUITS_DAILY) &
                 (df_nutrition['Total_vegetables_daily'] <= MAX_VEGETABLES_DAILY))

# .copy() so columns added to filtered_df later go to an independent frame,
# not to a slice of df_nutrition
filtered_df = df_nutrition[within_limits].copy()

In [None]:
len(filtered_df)

In [None]:
#Distribution of Total_vegetables_daily
sns.histplot(x=filtered_df['Total_vegetables_daily'])

In [None]:
#Distribution of Total_fruits_daily
sns.histplot(x=filtered_df['Total_fruits_daily'])

In [None]:
#create a BOX PLOT FOR _RFHLTH Vs Total fruits 
plt.figure(figsize=(5,5))
sns.set_style('darkgrid')
sns.boxplot(data=filtered_df, x='Total_fruits_daily', y='_RFHLTH')

In [None]:
#Create a box plot of Total fruits consumed daily for each _PHYS14D group
# NOTE(review): _PHYS14D is presumably the grouped version of PHYSHLTH (days of poor physical health) — confirm in the data dictionary
plt.figure(figsize=(5,5))
sns.set_style('darkgrid')
sns.boxplot(data=filtered_df, y='Total_fruits_daily', x='_PHYS14D')

In [None]:
#create a BOX PLOT FOR _RFHLTH Vs Total_vegetables_daily
plt.figure(figsize=(5,5))
sns.set_style('darkgrid')
sns.boxplot(data=filtered_df, x='Total_vegetables_daily', y='_RFHLTH')

In [None]:
#Lets create a pivot table showing the mean fruits and vegetables consumed per day for each _RFHLTH group 

pivot_result = pd.pivot_table(filtered_df, values=['Total_fruits_daily','Total_vegetables_daily'],index='_RFHLTH', aggfunc='mean')

#Only looking at those who reported Good health vs those that reported poor/bad health
pivot_result[0:2]

- This analysis shows that those who reported good or better health had a higher mean daily intake of fruits and vegetables. Not a significant difference.

### Apply association rules between eating fruits and vegetables and good perceived health

In [None]:
#!pip install mlxtend

In [None]:
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Assuming you already have a DataFrame called filtered_df with 'Total_fruits_daily', 'Total_vegetables_daily' and '_RFHLTH' columns

# Work on an independent copy so the two binned columns below are not written
# into a slice of another DataFrame.
filtered_df = filtered_df.copy()

# Binning Total_fruits_daily
fruit_bins = [0, 2, 4, 10]  # Adjust bins as per your data distribution for fruits
fruit_labels = ['Low_Fruit', 'Medium_Fruit', 'High_Fruit']
filtered_df['Fruit_Intake_Binned'] = pd.cut(filtered_df['Total_fruits_daily'], bins=fruit_bins, labels=fruit_labels, include_lowest=True)

# Binning Total_vegetables_daily
# The outlier filter keeps vegetable totals up to 26.6396, so a closing edge of 26
# would leave values above 26 unbinned (NaN) and silently drop them from the rules.
# An open-ended last bin keeps every retained row in 'High_Veg'.
veg_bins = [0, 8, 16, float('inf')]  # Adjust bins as per your data distribution for vegetables
veg_labels = ['Low_Veg', 'Medium_Veg', 'High_Veg']
filtered_df['Veg_Intake_Binned'] = pd.cut(filtered_df['Total_vegetables_daily'], bins=veg_bins, labels=veg_labels, include_lowest=True)

# Convert to the binary (one-hot) format; boolean columns are the input type apriori expects
filtered_df_binary = pd.get_dummies(filtered_df[['Fruit_Intake_Binned', 'Veg_Intake_Binned', '_RFHLTH']], dtype=bool)

# Apply Apriori to find frequent itemsets
frequent_itemsets = apriori(filtered_df_binary, min_support=0.1, use_colnames=True)

# Generate Association Rules
rules = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)
print(rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])

What is the role of access to healthcare, marital status, education and income in predicting health outcomes?

In [None]:
#Get the columns related to access to healthcare,

In [None]:
#make a dataframe by combining access-to-healthcare columns with the health status and health days columns

In [None]:
#Do some EDA and correlation analysis to see if there are relationships between access to healthcare and perceived
#health

In [None]:
#Apply association rules between access to healthcare and good perceived health

Can lifestyle factors, including smoking and alcohol consumption, 
predict health risks and outcomes?

In [None]:
#Get the columns related to alcohol consumption and smoking

In [None]:
#Get the columns related to exercise

How predictable are chronic diseases, such as diabetes and hypertension, through a combined analysis of 
lifestyle factors and genetic predisposition?

What is the significant contribution of mental health factors, such as stress, anxiety, 
and depression, to health outcomes, and what pathways influence overall well-being?