In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Read the patient records from the CSV file into a DataFrame
DATASET_PATH = 'dataset_alejandro_garzon.csv'
dataset = pd.read_csv(DATASET_PATH)

# Show the loaded frame as the cell output
dataset

This dataset includes information about multiple patients, some of whom have a heart disease. It has some important columns that can be useful for the predictions, such as gender, age, current smoker, cigarettes per day, blood pressure medication, and more.

The question is the following: *Does a 40-year-old man with a BMI of 22, who doesn't smoke but has a prevalent stroke (the other values are average), have a risk of heart disease in the next 10 years?*

Certain elements, such as the patient's education, don't seem to have a big impact on whether the patient has a heart disease.

In [None]:
# Print a per-column summary (dtype and non-null count) so the number of
# missing values in each column can be read off
dataset.info()

In [None]:
# Drop the unnecessary 'education' column and every row that has a null value.
# The result is re-assigned instead of using inplace=True, and errors='ignore'
# is passed, so that re-running this cell does not raise KeyError once
# 'education' has already been removed.
dataset = dataset.drop(columns=['education'], errors='ignore').dropna()

# Check new dataset
dataset.info()

In [None]:
# Graph smoker patients per age.
# seaborn's barplot aggregates with the mean by default, so each bar is the
# mean of currentSmoker for that age.
fig, ax = plt.subplots(figsize=(20, 7))

bp = sns.barplot(data=dataset, x='age', y='currentSmoker', ax=ax)

# Write each bar's height (three decimals) slightly above the bar
for g in bp.patches:
    bp.annotate(format(g.get_height(), '.3f'), (g.get_x() + g.get_width() / 2., g.get_height() + .05))

# A stale 'Follower count of the first 20 users' title used to be set before
# this one and was immediately overwritten; only the real title is kept.
ax.set_title('Smokers per age')
ax.set_xlabel('Age')
ax.set_ylabel('Current Smoker')

This graph shows the share of patients who smoke at each age. It is interesting to see how the trend goes down until age 65 and then starts going up again at age 66. On the other hand, at most ages fewer than half of the patients smoke.

In [None]:
# Graph patients with a prevalent stroke per age.
# seaborn's barplot aggregates with the mean by default, so each bar is the
# mean of prevalentStroke for that age.
fig, ax = plt.subplots(figsize=(20, 7))

# Pass ax explicitly (as the other plotting cells do) instead of relying on
# matplotlib's implicit "current axes"
bp = sns.barplot(data=dataset, x='age', y='prevalentStroke', ax=ax)

# Write each bar's height (three decimals) slightly above the bar
for g in bp.patches:
    bp.annotate(format(g.get_height(), '.3f'), (g.get_x() + g.get_width() / 2., g.get_height() + .05))

ax.set_title('Number of patients with prevalent strokes by age')
ax.set_ylabel('Prevalent Strokes')
ax.set_xlabel('Age')

This graph shows that only a small number of patients have a prevalent stroke. Most of the patients with prevalent strokes are in the 50 to 67 age range, although strokes also seem to appear in the 35 to 50 range.

In [None]:
# Male patients with diabetes, broken down by age

# Keep only the rows where the 'male' flag is 1
male_diabetes = dataset.loc[dataset['male'] == 1]

# One bar per age on an explicitly created wide axes
fig, ax = plt.subplots(figsize=(20, 7))
bp = sns.barplot(data=male_diabetes, x='age', y='diabetes', ax=ax)
plt.title('Male patients with diabetes by age')
plt.ylabel('Diabetes')
plt.xlabel('Age')

# Label every bar with its height, formatted to three decimals
for bar in bp.patches:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    bp.annotate(f'{height:.3f}', (label_x, height + .05))

The graph shows that male patients start to be diagnosed with diabetes at around 39 years of age, and that it becomes more common as the patients get older. An interesting case is age 52, where there are more patients with diabetes, making it the age with the third-highest number of diabetes patients.

In [None]:
# Female patients' BMI, broken down by age

# Keep only the rows where the 'male' flag is 0
female_bmi = dataset.loc[dataset['male'] == 0]

# One bar per age on an explicitly created wide axes
fig, ax = plt.subplots(figsize=(20, 7))
bp = sns.barplot(data=female_bmi, x='age', y='BMI', ax=ax)

# Label every bar with its height, formatted to three decimals
for bar in bp.patches:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    bp.annotate(f'{height:.3f}', (label_x, height + .05))

plt.title('Female patients BMI by age')
plt.ylabel('BMI')
plt.xlabel('Age')

The graph shows a trend in the average BMI per age: it slowly grows from around 21 to around 28 before the trend changes. It is also very interesting to see how much the BMI for ages 32 and 70 differs from the other ages; these two are the highest BMIs in the dataset.

In [None]:
# Cigarettes per day for female patients, broken down by age

# Keep only the rows where the 'male' flag is 0
female_cigarettes_per_day = dataset.loc[dataset['male'] == 0]

# One bar per age on an explicitly created wide axes
fig, ax = plt.subplots(figsize=(20, 7))
bp = sns.barplot(data=female_cigarettes_per_day, x='age', y='cigsPerDay', ax=ax)
plt.title('Amount of cigarettes per day (Woman)')
plt.ylabel('Amount of cigarettes per day')
plt.xlabel('Age')

# Label every bar with its height, formatted to three decimals
for bar in bp.patches:
    height = bar.get_height()
    label_x = bar.get_x() + bar.get_width() / 2.
    bp.annotate(f'{height:.3f}', (label_x, height + .05))

Overall, this graph shows a downward trend in the number of cigarettes smoked per day. There are some spikes, but each one is usually lower than the previous spike. The pattern is similar to the last graph, in that ages 32 and 68 are among the heaviest smokers: age 32 has the most cigarettes smoked per day, and age 68 has 10 cigarettes per day.

In [None]:
# Separate the target column (TenYearCHD) from the remaining feature columns
y = dataset['TenYearCHD']
X = dataset.drop(columns='TenYearCHD')

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Show the training features as the cell output
X_train

In [None]:
# Build a logistic-regression classifier and fit it on the training split
# (fit() returns the estimator itself, so construction and fitting are chained)
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Mean accuracy of the fitted model on the held-out test split
accuracy = model.score(X_test, y_test)
accuracy

In [None]:
# Build a single-patient input for the question this notebook sets out to answer:
# Does a 40-year-old man, with a BMI of 22, who doesn't smoke, but has a prevalent
# stroke (the other values are average), have a risk of heart disease in the next 10 years?
# NOTE(review): the remaining values are hard-coded here rather than computed from
# the dataset; confirm they are the intended "average" values.
# NOTE(review): the keys are presumably listed in the same order as the training
# columns; verify against X_train.columns.
test = {
    # 1 = male: the plotting cells select men with male == 1 and women with
    # male == 0, and the question is about a man (this was 0 before)
    'male': [1],
    'age': [40],
    'currentSmoker': [0],
    'cigsPerDay': [0.0],
    'BPMeds': [0.0],
    'prevalentStroke': [1],
    'prevalentHyp': [0],
    'diabetes': [0],
    'totChol': [190.0],
    'sysBP': [100.0],
    'diaBP': [70.0],
    'BMI': [22],
    'heartRate': [70],
    'glucose': [95]
}

# One-row frame with the same column names the model was fitted on
test_df = pd.DataFrame(test)
result = model.predict(test_df)
print(f'Result (0 - No heart disease risk, 1 - Heart disease risk): {result}')