# Importing the required packages:

In [490]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import chardet

# Data Cleaning and Processing:

## Reading the parquet file into a pandas dataframe:

In [491]:
# Load the heart dataset from a local parquet file into a DataFrame and print
# its column / dtype / memory summary.
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# will not run on another machine without editing it.
# NOTE(review): the recorded output of this cell is an ArrowMemoryError (an
# allocation of about 1.1 GB failed). pd.read_parquet accepts a `columns=`
# argument; reading only the columns that are kept later may avoid the
# failure — TODO confirm.
path = r"C:\Users\Sebma\OneDrive\Documents\Sparta_Global\Academy\Week 7\Cardiovascular-disease-data\heart_converted.parquet"
heart_df = pd.read_parquet(path)
heart_df.info()

ArrowMemoryError: malloc of size 1143099008 failed

In [None]:
# Preview the first rows of the raw dataframe.
heart_df.head()

## List of the desired column names:

In [None]:
# Descriptive column labels. They are assigned to the dataframe positionally,
# so the order of this list must match the order of the selected columns.
# The list is assembled from themed groups to make it easier to scan.
variable_names = (
    # Respondent and general health
    [
        "State",
        "Sex",
        "General Health",
        "Physical Health Days",
        "Mental Health Days",
        "Last Checkup Time",
        "Physical Activities",
        "Sleep Hours",
        "Removed Teeth",
    ]
    # Diagnosed conditions
    + [
        "Had Heart Attack",
        "Had Angina",
        "Had CAD or MI",
        "Had Stroke",
        "Had Asthma",
        "Had Skin Cancer",
        "Had COPD",
        "Had Depressive Disorder",
        "Had Kidney Disease",
        "Had Arthritis",
        "Had Diabetes",
    ]
    # Functional difficulties
    + [
        "Deaf or Hard of Hearing",
        "Blind or Vision Difficulty",
        "Difficulty Concentrating",
        "Difficulty Walking",
        "Difficulty Dressing or Bathing",
        "Difficulty Errands",
    ]
    # Smoking and screening
    + [
        "Smoker Status",
        "Years Smoking",
        "ECigarette Usage",
        "Chest Scan",
    ]
    # Demographics and body measurements
    + [
        "Race Ethnicity Category",
        "Age Category",
        "Height in Meters",
        "Weight in Kilograms",
        "BMI Category",
    ]
    # Lifestyle, vaccination, risk and income
    + [
        "Alcohol Drinker",
        "HIV Testing",
        "Flu Vax Last 12 Months",
        "Pneumo Vax Ever",
        "Tetanus Tdap Last 10 Years",
        "High Risk Last Year",
        "Covid Positive",
        "Income Category",
    ]
)

## Reading the variable CSV into a Pandas dataframe:

#### Explanation:
The CSV containing the desired variable names is imported. Chardet is used to detect the encoding of the CSV file, and the detected encoding is passed to Pandas when reading the file into a dataframe - this avoids a UnicodeDecodeError when the file is not encoded in the default encoding.

In [None]:
var_path = r"C:\Users\Sebma\OneDrive\Documents\Sparta_Global\Academy\Week 7\Cardiovascular-disease-data\Variable_descriptions.csv"

# Read the raw bytes first, then let chardet guess the encoding from them.
with open(var_path, 'rb') as file:
    raw_bytes = file.read()

result = chardet.detect(raw_bytes)
encoding = result['encoding']

# The CSV is read with explicit column names and the detected encoding.
var_df = pd.read_csv(
    var_path,
    names=['Variable', 'Description'],
    encoding=encoding,
)
var_df

## Converting the variable names to a NumPy array:

#### Explanation:
Converting the Pandas Variable series to a NumPy array. A NumPy array is used over a Pandas series as it is more computationally efficient for large datasets.

In [None]:
# Extract the 'Variable' column of var_df as a NumPy array and display it.
var_array = var_df['Variable'].to_numpy()
var_array

## Selecting the desired columns from the heart_df dataframe:

#### Explanation:
Only the desired columns from the heart_df dataframe are selected using the NumPy array of the desired variable names.

In [None]:
# Keep only the columns named in var_array, in that order.
# NOTE: this rebinds heart_df, so the full set of columns is no longer
# available under this name after the cell has run.
heart_df = heart_df[var_array]

## Converting the names of desired columns into informative names:

#### Explanation:
The column labels of the filtered heart_df dataframe are set to the desired column names from the variable_names list.

In [None]:
# Replace the column labels positionally with the descriptive names, so the
# order of variable_names must match the order of the selected columns.
heart_df.columns = variable_names
heart_df

## Creating a copy of the heart_df dataframe:

#### Explanation:
The heart_df is copied to preserve the original data. The copy method is used to create a new dataframe in memory rather than reference the original dataframe.

In [None]:
# Work on an independent copy so heart_df keeps the raw values.
main_heart_df = heart_df.copy()

## Linking the variable values to their descriptive value:

#### Explanation:
The values in the raw dataset are represented by numerical values. Here the numerical values are linked to their descriptive value.

In [None]:
# Lookup from the numeric 'State' code in the raw data to the state or
# territory name.
# NOTE(review): the codes are not contiguous (e.g. 3, 7, 14, 43 and 52 are
# absent) — presumably they follow FIPS numbering; confirm against the codebook.
State = {
    1: "Alabama",
    2: "Alaska",
    4: "Arizona",
    5: "Arkansas",
    6: "California",
    8: "Colorado",
    9: "Connecticut",
    10: "Delaware",
    11: "District of Columbia",
    12: "Florida",
    13: "Georgia",
    15: "Hawaii",
    16: "Idaho",
    17: "Illinois",
    18: "Indiana",
    19: "Iowa",
    20: "Kansas",
    21: "Kentucky",
    22: "Louisiana",
    23: "Maine",
    24: "Maryland",
    25: "Massachusetts",
    26: "Michigan",
    27: "Minnesota",
    28: "Mississippi",
    29: "Missouri",
    30: "Montana",
    31: "Nebraska",
    32: "Nevada",
    33: "New Hampshire",
    34: "New Jersey",
    35: "New Mexico",
    36: "New York",
    37: "North Carolina",
    38: "North Dakota",
    39: "Ohio",
    40: "Oklahoma",
    41: "Oregon",
    42: "Pennsylvania",
    44: "Rhode Island",
    45: "South Carolina",
    46: "South Dakota",
    47: "Tennessee",
    48: "Texas",
    49: "Utah",
    50: "Vermont",
    51: "Virginia",
    53: "Washington",
    54: "West Virginia",
    55: "Wisconsin",
    56: "Wyoming",
    66: "Guam",
    72: "Puerto Rico",
    78: "Virgin Islands"
}

# Lookup for the 'Sex' column.
Sex = {1: 'Male', 2: 'Female'}

# Lookup for the 'General Health' column.
General_health = {
    1: "Excellent",
    2: "Very good",
    3: "Good",
    4: "Fair",
    5: "Poor",
}

# Special codes for the physical / mental health day counts: 88 becomes 0 and
# 77 / 99 become missing.
# NOTE(review): presumably 88 means "none" and 77 / 99 mean "don't know" /
# "refused" — confirm against the codebook.
Phys_mental_health = {
    77: np.nan,
    88: 0,
    99: np.nan
}


# Lookup for the 'Last Checkup Time' column.
Last_checkup = {
    1: "Within past year",
    2: "Between 1 and 2 years",
    3: "Between 2 and 5 years",
    4: "5 or more years"
}


# Shared lookup for every column coded 1 = Yes / 2 = No.
Yes_no_questions = {1: "Yes", 2: "No"}

def Sleep_time(x):
    """Return ``x`` with every value greater than 24 replaced by NaN.

    ``x`` may be a scalar or an array-like of hours. The result is produced
    by ``np.where``, so it is a NumPy array (zero-dimensional for a scalar
    input). Values that are not greater than 24 are passed through.
    """
    exceeds_a_day = x > 24
    return np.where(exceeds_a_day, np.nan, x)


# Lookup for the 'Removed Teeth' column (note that "None" uses code 8).
Teeth_removed = {
    1: "1-5",
    2: "6 or more, but not all",
    3: "All",
    8: "None"
}

# Lookup for the 'Had Diabetes' column; it has four answers rather than the
# plain Yes / No coding.
Diabetes = {
    1: "Yes",
    2: "Yes, only during pregnancy",
    3: "No",
    4: "No, pre-diabetes or borderline diabetes",
}

# Lookup for the 'Smoker Status' column.
Smoker_status = {
    1: "Current (every day)",
    2: "Current (some days)",
    3: "Former",
    4: "Never"
}

# Lookup for the 'ECigarette Usage' column.
# NOTE: the code order differs from Smoker_status (here 1 is "Never").
Ecigarettes = {
    1: "Never",
    2: "Current (every day)",
    3: "Current (some days)",
    4: "Former"
}


# Lookup for the 'Race Ethnicity Category' column.
Race = {
    1: "White only",
    2: "Black only",
    3: "Other race only",
    4: "Multiracial, Non-Hispanic",
    5: "Hispanic"
}

# Lookup for the 'BMI Category' column.
BMI_cat = {
    1: "Underweight",
    2: "Normal weight",
    3: "Overweight",
    4: "Obese"
}

# Lookup for the 'Age Category' column.
Age_cat = {
    1: "18 to 24",
    2: "25 to 29",
    3: "30 to 34",
    4: "35 to 39",
    5: "40 to 44",
    6: "45 to 49",
    7: "50 to 54",
    8: "55 to 59",
    9: "60 to 64",
    10: "65 to 69",
    11: "70 to 74",
    12: "75 to 79",
    13: "80 or older"
}

# Lookup for the 'Tetanus Tdap Last 10 Years' column.
Tetanus = {
    1: "Yes",
    2: "Yes, but not Tdap",
    3: "Yes, but not sure what type",
    4: "No"
}

# Lookup for the 'Covid Positive' column.
COVID = {
    1: "Yes",
    2: "No",
    3: "Tested positive using home test without a health professional"
    
}

# Lookup for the 'Income Category' column; code 8 is treated as missing.
# NOTE(review): labels 2-7 start with a literal backslash before the first
# "$" — presumably so that plot labels containing two "$" are not parsed as
# math text; label 1 has no backslash. Confirm this is intentional.
Income = {
    1: "Less than $15,000",
    2: "\\$15,000 to $24,999",
    3: "\\$25,000 to $34,999",
    4: "\\$35,000 to $49,999",
    5: "\\$50,000 to $99,999",
    6: "\\$100,000 to $199,999",
    7: "\\$200,000 or more",
    8: np.nan
    
}

## Setting descriptive variable values to the dataframe:

#### Explanation:
Each element in the Pandas series is replaced by the associated value within the dictionary (Element-wise transformation).

In [None]:
# Recode the raw numeric values into descriptive values.
#
# Every transformation reads from heart_df (the frame holding the raw values)
# and writes into main_heart_df. Re-running this cell therefore gives the same
# result, instead of mapping already-recoded labels to NaN or dividing the
# height / weight columns a second time.

# Columns that share the 1 = "Yes" / 2 = "No" coding.
yes_no_columns = [
    'Physical Activities',
    'Had Heart Attack',
    'Had Angina',
    'Had CAD or MI',
    'Had Stroke',
    'Had Asthma',
    'Had Skin Cancer',
    'Had COPD',
    'Had Depressive Disorder',
    'Had Kidney Disease',
    'Had Arthritis',
    'Deaf or Hard of Hearing',
    'Blind or Vision Difficulty',
    'Difficulty Concentrating',
    'Difficulty Walking',
    'Difficulty Dressing or Bathing',
    'Difficulty Errands',
    'Chest Scan',
    'Alcohol Drinker',
    'HIV Testing',
    'Flu Vax Last 12 Months',
    'Pneumo Vax Ever',
    'High Risk Last Year',
]

# Column -> lookup dictionary. Series.map returns NaN for every value that is
# not a key of the lookup, so unlisted codes become missing.
categorical_lookups = {
    'State': State,
    'Sex': Sex,
    'General Health': General_health,
    'Last Checkup Time': Last_checkup,
    'Removed Teeth': Teeth_removed,
    'Had Diabetes': Diabetes,
    'Smoker Status': Smoker_status,
    'ECigarette Usage': Ecigarettes,
    'Race Ethnicity Category': Race,
    'Age Category': Age_cat,
    'BMI Category': BMI_cat,
    'Tetanus Tdap Last 10 Years': Tetanus,
    'Covid Positive': COVID,
    'Income Category': Income,
    **{column: Yes_no_questions for column in yes_no_columns},
}

for column, lookup in categorical_lookups.items():
    main_heart_df[column] = heart_df[column].map(lookup)

# Day counts: only the special codes are replaced; every other value is kept
# as it is (Series.replace, unlike Series.map, leaves unmatched values alone).
for column in ['Physical Health Days', 'Mental Health Days']:
    main_heart_df[column] = heart_df[column].replace(Phys_mental_health)

# Sleep hours above 24 become NaN. Sleep_time is built on np.where, so it is
# called once on the whole column rather than once per row through .apply.
sleep_hours = heart_df['Sleep Hours']
main_heart_df['Sleep Hours'] = pd.Series(Sleep_time(sleep_hours), index=sleep_hours.index)

# 'Years Smoking' keeps its raw values. The previous version of this line
# assigned the bound method `Series.map` itself (it was never called), which
# filled the column with method objects instead of data.
# TODO(review): add a recode here if this column uses special codes.
main_heart_df['Years Smoking'] = heart_df['Years Smoking']

# Height and weight are divided by 100.
# NOTE(review): presumably the raw values are centimetres and hundredths of a
# kilogram — confirm against the codebook.
main_heart_df['Height in Meters'] = heart_df['Height in Meters'] / 100
main_heart_df['Weight in Kilograms'] = heart_df['Weight in Kilograms'] / 100

## Dropping any rows that are missing data on cardiovascular disease:

#### Explanation:
Rows missing data in the 'Had CAD or MI' column are dropped from the dataset as this column is a primary indicator of cardiovascular disease.

In [None]:
# Drop rows whose 'Had CAD or MI' value is missing; all other columns may
# still contain missing values.
main_heart_df = main_heart_df.dropna(subset=['Had CAD or MI'])

In [None]:
# Display the cleaned dataframe.
main_heart_df

# Data Visualisation:

In [None]:
# Separate copy for plotting, so changes made for the visualisations do not
# alter main_heart_df.
visual_heart_df = main_heart_df.copy()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
# Encode the outcome as 1 (Yes) / 0 (No) so that its mean per group is the
# proportion of respondents answering "Yes".
# The flag is derived from main_heart_df, which still holds the "Yes" / "No"
# labels, so re-running this cell gives the same result. Mapping the column of
# visual_heart_df onto itself would turn the 1 / 0 values into NaN on a
# second run, because they are not keys of the lookup.
visual_heart_df['Had CAD or MI'] = main_heart_df['Had CAD or MI'].map({'Yes': 1, 'No': 0})

# Proportion of "Yes" answers within each smoker-status group.
proportions = (
    visual_heart_df
    .groupby('Smoker Status')['Had CAD or MI']
    .mean()
    .reset_index()
)
proportions

