#Q1) What is the distribution of age among heart failure patients in the dataset?

#Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the clinical records CSV into a DataFrame
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# Keep only the rows whose DEATH_EVENT flag equals 1
# NOTE(review): DEATH_EVENT presumably marks death during follow-up rather than
# a heart-failure diagnosis -- confirm whether the whole dataset should be plotted.
died_mask = data['DEATH_EVENT'].eq(1)
heart_failure_patients = data.loc[died_mask]

# Draw a 20-bin histogram of the selected ages with a KDE curve on an explicit Axes
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(heart_failure_patients['age'], kde=True, bins=20, color='red', ax=ax)
ax.set_title('Age Distribution among Heart Failure Patients')
ax.set_xlabel('Age')
ax.set_ylabel('Frequency')
ax.grid(True)
plt.show()


##Explanation
#Patients in the 60 to 70 age group are the most likely to be affected by heart failure according to the given dataset.

#Q2) How does the death rate vary with age?

#Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# Per distinct age: 'sum' is the number of rows with DEATH_EVENT == 1 and
# 'count' is the total number of rows at that age
age_groups = data.groupby('age')['DEATH_EVENT'].agg(['sum', 'count']).reset_index()

# Death rate at each distinct age (sum/count)
age_groups['death_rate'] = age_groups['sum'] / age_groups['count']

# Pool the ages into fixed-width bands. A histogram weighted by the per-age
# rate would ADD the rates of every distinct age in a bin, so its bar heights
# would not be rates (and could exceed 1). Instead, deaths and patients are
# summed per band and divided afterwards.
band_width = 5
lowest_edge = int(age_groups['age'].min() // band_width) * band_width
highest_edge = (int(age_groups['age'].max() // band_width) + 1) * band_width
band_edges = list(range(lowest_edge, highest_edge + band_width, band_width))
band_labels = [f'{lower}-{lower + band_width - 1}' for lower in band_edges[:-1]]

# right=False makes each band [lower, lower + band_width), so the maximum age
# always falls inside the last band
age_groups['age_band'] = pd.cut(
    age_groups['age'], bins=band_edges, right=False, labels=band_labels
)

# observed=True drops bands without patients, which also avoids 0/0
band_rates = (
    age_groups.groupby('age_band', observed=True)[['sum', 'count']]
    .sum()
    .reset_index()
)
band_rates['death_rate'] = band_rates['sum'] / band_rates['count']

# Plot the death rate of each age band as a bar chart
plt.figure(figsize=(12, 6))
sns.barplot(data=band_rates, x='age_band', y='death_rate', color='red')
plt.title('Death Rate by Age')
plt.xlabel('Age')
plt.ylabel('Death Rate')
plt.grid(True)
plt.show()

##Explanation
#The death rate typically increases with age due to several factors, including the natural aging process, the cumulative effects of lifestyle choices, and the increased likelihood of chronic diseases

#Q3) What is the percentage of male and female patients in the dataset?

#Code
import pandas as pd

# Read the clinical records CSV
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file_path)

# Number of rows for each distinct value of the 'sex' column
total_patients = len(df)
gender_counts = df['sex'].value_counts()

# Share of all rows, in percent, held by each 'sex' value
gender_percentage = gender_counts.div(total_patients).mul(100)

# Show the heading followed by the percentages; the rows are labelled with the
# raw codes stored in the 'sex' column
print("Percentage of Male and Female Patients:", gender_percentage, sep="\n")

#Q4) How does the platelet count vary among different age groups?

#Code
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

# Load the dataset (same file as the other questions) instead of a hand-typed
# sample, so the analysis covers every patient record
# NOTE(review): assumes the CSV exposes 'age' and 'platelets' columns -- confirm.
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file_path)

# Inclusive (lower, upper) age ranges to compare
age_groups = [(40, 45), (46, 50), (51, 55), (56, 60), (61, 65), (66, 70), (71, 75), (76, 80), (81, 85), (86, 90), (91, 95)]

# pd.cut uses right-closed intervals, so the edges 39, 45, 50, ... give
# (39, 45], (45, 50], ... which match the inclusive ranges above.
# Ages outside 40-95 fall in no interval and are left out of the means.
bin_edges = [age_groups[0][0] - 1] + [upper for _, upper in age_groups]
group_labels = [f'{lower}-{upper}' for lower, upper in age_groups]
age_band = pd.cut(df['age'], bins=bin_edges, labels=group_labels)

# Mean platelet count per age group; observed=False keeps empty groups (as NaN)
# so every group keeps its slot on the x axis
grouped_means = df.groupby(age_band, observed=False)['platelets'].mean()

# Plot the mean platelet count of each age group
plt.figure(figsize=(10, 6))
sns.barplot(x=list(grouped_means.index), y=grouped_means.to_numpy())
plt.title('Mean Platelet Count by Age Group')
plt.xlabel('Age Group')
plt.ylabel('Mean Platelet Count')
plt.show()

##Explanation
#Platelet counts for the different age groups are as follows:
#Ages 40-45: platelet count varies between 115,000 and 582,000
#Ages 49-50: platelet count varies between 136,000 and 310,000
#Ages 51-55: platelet count varies between 279,000 and 748,000
#Ages 56-60: platelet count varies between 3,964 and 305,000
#Ages 61-65: platelet count varies between 224 and 497,000
#Ages 68-70: platelet count varies between 30 and 972,000
#Ages 72-75: platelet count varies between 364 and 675,000
#Ages 77-78: platelet count varies between 418 and 805,000
#Ages 80-82: platelet count varies between 776 and 47,000
#Ages 85-87: platelet count varies between 5,882 and 507,000
#Ages 90-95: platelet count varies between 60 and 196,000

#Q5) Is there a correlation between creatinine and sodium levels in the blood?

#Code
import pandas as pd

# Load the dataset
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# The two columns being compared
creatinine = data['serum_creatinine']
sodium = data['serum_sodium']

# Calculate the Pearson correlation coefficient (method named explicitly so the
# printed label always matches the statistic)
correlation_coefficient = creatinine.corr(sodium, method='pearson')

# f-string so the computed value is interpolated instead of the literal placeholder
print(f"The Pearson correlation coefficient between creatinine and sodium is: {correlation_coefficient}")

#Q6) How does the prevalence of high blood pressure differ between male and female patients?

#Code
# Load the dataset into a pandas DataFrame
import pandas as pd

# Load the dataset (first row holds the column names)
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file_path, header=0)

# This script treats sex == 1 as male and sex == 0 as female, and
# high_blood_pressure == 1 as "has high blood pressure"
is_male = df['sex'] == 1
is_female = df['sex'] == 0
has_high_bp = df['high_blood_pressure'] == 1

# Calculate the number of male and female patients with high blood pressure
num_male_high_bp = df[is_male & has_high_bp].shape[0]
num_female_high_bp = df[is_female & has_high_bp].shape[0]

# Calculate the total number of male and female patients
num_male = df[is_male].shape[0]
num_female = df[is_female].shape[0]

# Calculate the prevalence (in percent) of high blood pressure within each sex
prevalence_male = num_male_high_bp / num_male * 100
prevalence_female = num_female_high_bp / num_female * 100

# Print the results; f-strings so the values are interpolated instead of the
# literal "{prevalence_male:.2f}" placeholders
print(f'Prevalence of high blood pressure in male patients: {prevalence_male:.2f}%')
print(f'Prevalence of high blood pressure in female patients: {prevalence_female:.2f}%')

##Explanation
#The prevalence of high blood pressure is 42% for male patients (42/100) and 15% for female patients (15/100)

#Q7) What is the relationship between smoking habits and the occurrence of heart failure?

#Code
import pandas as pd

# Read the clinical records CSV
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file_path)

# Split the patients by smoking flag
smoking_flag = df["smoking"]
smokers = df.loc[smoking_flag.eq(1)]
non_smokers = df.loc[smoking_flag.eq(0)]

# Rows flagged DEATH_EVENT == 1 (what this script counts as "heart failure")
heart_failure = df.loc[df["DEATH_EVENT"].eq(1)]

# Group sizes
total_smokers = len(smokers)
total_non_smokers = len(non_smokers)

# Flagged rows within each smoking group
smoker_heart_failure = len(heart_failure.loc[heart_failure["smoking"].eq(1)])
non_smoker_heart_failure = len(heart_failure.loc[heart_failure["smoking"].eq(0)])

# Fraction of each smoking group that carries the flag
smoker_heart_failure_rate = smoker_heart_failure / total_smokers
non_smoker_heart_failure_rate = non_smoker_heart_failure / total_non_smokers

# Report both fractions
print("Heart failure rate in smokers:", smoker_heart_failure_rate)
print("Heart failure rate in non-smokers:", non_smoker_heart_failure_rate)

#Q8) Are there any noticeable patterns in the distribution of death events across different age groups?

#Code
import csv
from collections import defaultdict

# Function to calculate the proportion of death events for each age group
def proportion_death_events(age_group, death_events, total_instances):
    """Return the share of death events within one age group.

    Args:
        age_group: Key identifying the group in both mappings.
        death_events: Mapping from age group to its number of death events.
        total_instances: Mapping from age group to its total number of records.

    Returns:
        ``death_events[age_group]`` divided by ``total_instances[age_group]``.
    """
    deaths = death_events[age_group]
    records = total_instances[age_group]
    return deaths / records

# Read the CSV file once, tallying patients and death events per distinct age.
# The tallies are built here rather than relying on a `data` object left over
# from an earlier section.
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
total_instances = defaultdict(int)
death_events = defaultdict(int)
with open(file_path, 'r', newline='') as f:
    reader = csv.DictReader(f)
    for row in reader:
        # Convert to a number so the groups sort numerically, not as text
        age = float(row['age'])
        died = int(float(row['DEATH_EVENT'])) == 1
        total_instances[age] += 1
        # Adding 0 still creates the key, so every age has a death count
        death_events[age] += 1 if died else 0

# Each distinct age value is one "age group", listed in ascending order
age_groups = sorted(total_instances)

# Calculate the proportion of death events for each age group
proportions = {age_group: proportion_death_events(age_group, death_events, total_instances) for age_group in age_groups}

# Print the results (f-string so the values are interpolated, not the placeholders)
print("Age Group\tTotal Instances\tDeath Events\tProportion of Death Events")
for age_group in age_groups:
    print(f"{age_group:g}\t{total_instances[age_group]}\t{death_events[age_group]}\t{proportions[age_group]:.4f}")

##Explanation
##Age 40: 15 instances (3 with DEATH_EVENT = 1)
#Age 42: 3 instances (1 with DEATH_EVENT = 1)
#Age 45: 11 instances (5 with DEATH_EVENT = 1)
#Age 46: 1 instance (1 with DEATH_EVENT = 1)
#Age 49: 3 instances (1 with DEATH_EVENT = 1)
#Age 50: 10 instances (5 with DEATH_EVENT = 1)
#Age 51: 2 instances (1 with DEATH_EVENT = 1)
#Age 52: 2 instances (1 with DEATH_EVENT = 1)
#Age 53: 4 instances (3 with DEATH_EVENT = 1)
#Age 54: 1 instance (1 with DEATH_EVENT = 1)
#Age 55: 5 instances (3 with DEATH_EVENT = 1)
#Age 56: 1 instance (1 with DEATH_EVENT = 1)
#Age 57: 1 instance (0 with DEATH_EVENT = 1)
#Age 58: 4 instances (1 with DEATH_EVENT = 1)
#Age 59: 2 instances (1 with DEATH_EVENT = 1)
#Age 60: 17 instances (8 with DEATH_EVENT = 1)
#Age 61: 2 instances (1 with DEATH_EVENT = 1)
#Age 62: 3 instances (1 with DEATH_EVENT = 1)
#Age 63: 6 instances (4 with DEATH_EVENT = 1)
#Age 64: 2 instances (1 with DEATH_EVENT = 1)
#Age 65: 13 instances (6 with DEATH_EVENT = 1)
#Age 68: 3 instances (2 with DEATH_EVENT = 1)
#Age 69: 2 instances (1 with DEATH_EVENT = 1)
#Age 70: 14 instances (7 with DEATH_EVENT = 1)
#Age 72: 3 instances (2 with DEATH_EVENT = 1)
#Age 73: 3 instances (2 with DEATH_EVENT = 1)
#Age 75: 6 instances (3 with DEATH_EVENT = 1)
#Age 77: 1 instance (0 with DEATH_EVENT = 1)
#Age 78: 2 instances (1 with DEATH_EVENT = 1)
#Age 80: 3 instances (2 with DEATH_EVENT = 1)
#Age 81: 1 instance (0 with DEATH_EVENT = 1)
#Age 82: 3 instances (2 with DEATH_EVENT = 1)
#Age 85: 3 instances (1 with DEATH_EVENT = 1)
#Age 86: 1 instance (0 with DEATH_EVENT = 1)
#Age 87: 2 instances (1 with DEATH_EVENT = 1)
#Age 90: 2 instances (2 with DEATH_EVENT = 1)
#Age 94: 2 instances (1 with DEATH_EVENT = 1)
#Age 95: 2 instances (1 with DEATH_EVENT = 1)

#Q9) Is there any significant difference in ejection fraction between patients with and without diabetes?

#Code
# Load the dataset
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
data = pd.read_csv(file_path)

# Extract the ejection fraction and diabetes columns from the data.
# Iterating a DataFrame yields its column NAMES, so the columns are selected by
# name instead of indexing into each iterated item.
# NOTE(review): assumes the columns are named 'ejection_fraction' and 'diabetes'
# and that diabetes is coded 1 = yes, 0 = no -- confirm against the CSV header.
efraction = data['ejection_fraction']
diabetes = data['diabetes']

# Calculate the average ejection fraction for patients with and without diabetes,
# each over its own group only
avg_efraction_diabetes = efraction[diabetes == 1].mean()
avg_efraction_no_diabetes = efraction[diabetes == 0].mean()

# Print the results
print("Average ejection fraction for patients with diabetes:", avg_efraction_diabetes)
print("Average ejection fraction for patients without diabetes:", avg_efraction_no_diabetes)

# Determine if there is a significant difference between the two averages.
# NOTE: a gap of more than 5 is a rule of thumb, not a statistical test.
if abs(avg_efraction_diabetes - avg_efraction_no_diabetes) > 5:
    print("There is a significant difference in ejection fraction between patients with and without diabetes.")
else:
    print("There is not a significant difference in ejection fraction between patients with and without diabetes.")

#Q10) How does the serum creatinine level vary between patients who survived and those who did not?

#Code
import pandas as pd

# Load the dataset into a DataFrame. file_path is a plain string, so it has to
# be read with read_csv; it has no keys()/values() to build a frame from.
file_path = 'work/EDA Assignment-1/heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(file_path)

# Calculate the average serum creatinine level for patients who survived
# (DEATH_EVENT == 0) and those who did not (DEATH_EVENT == 1)
avg_creatinine_survived = df[df['DEATH_EVENT'] == 0]['serum_creatinine'].mean()
avg_creatinine_deceased = df[df['DEATH_EVENT'] == 1]['serum_creatinine'].mean()

# Print the result
print(f"The average serum creatinine level for patients who survived is {avg_creatinine_survived:.2f}.")
print(f"The average serum creatinine level for patients who did not survive is {avg_creatinine_deceased:.2f}.")