In [1]:
# Import the required modules

import pandas as pd
import hvplot.pandas
from pathlib import Path
from sklearn.svm import SVC 
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.decomposition import PCA

In [2]:
# Read CSV file

heart_df = pd.read_csv(("HeartAttack.csv"),
                       index_col = 0
                      
)
heart_df.head(10)

Unnamed: 0,Age,Gender,Chest Pain,Resting BP(mm Hg),Cholestrol(mg/dl),Fasting Blood Sugar,ECG,Max Heart Rate,Angina,Heart Attack
0,63,1,3,145,233,1,0,150,0,1
1,37,1,2,130,250,0,1,187,0,1
2,41,0,1,130,204,0,0,172,0,1
3,56,1,1,120,236,0,1,178,0,1
4,57,0,0,120,354,0,1,163,1,1
5,57,1,0,140,192,0,1,148,0,1
6,56,0,1,140,294,0,0,153,0,1
7,44,1,1,120,263,0,1,173,0,1
8,52,1,2,172,199,1,1,162,0,1
9,57,1,2,150,168,0,1,174,0,1


## Column values explanations
* Chest Pain: 
           1 = typical angina,
           2 = atypical angina,
           3 = non-anginal, 
           4 = asymptomatic

* Fasting Blood Suagr > 120mg/dl
           1 = True,
           2 = False
           
* ECG:
    0 = Normal,
    1 = Abnormal,
    2 = Definite left ventrical hypertrophy
    
* Angina:
       1 = yes,
       2 = No
       
* Heart Attack
      1 = True,
      0 = False

In [3]:
# Generate summary statistics
heart_df.describe()

Unnamed: 0,Age,Gender,Chest Pain,Resting BP(mm Hg),Cholestrol(mg/dl),Fasting Blood Sugar,ECG,Max Heart Rate,Angina,Heart Attack
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,1.0


In [4]:
# Plot your data to see what's in your DataFrame
heart_df.hvplot.line(
    width=800,
    height=400,
    rot=90
)

## Test_train_split 

In [5]:
# Set variables
y = heart_df["Heart Attack"] 
X = heart_df.drop(columns = "Heart Attack")

In [6]:
# Split into testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

## Logistic Regression

In [7]:
# Declare a logistic regression model
logistic_regression_model = LogisticRegression(random_state=42)

# Fit and save the logistic regression model using the training data
lr_model = logistic_regression_model.fit(X_train, y_train)

In [8]:
#Generate training predictions
training_predictions = lr_model.predict(X_train)

#Generate testing predictions
testing_predictions = logistic_regression_model.predict(X_test)


In [9]:
# Create and save the confusion matrix for the training data
training_matrix = confusion_matrix(y_train, training_predictions)

# Print the confusion matrix for the training data
print(training_matrix)

[[ 74  29]
 [ 19 105]]


In [10]:
# Create and save the training classification report
training_report = classification_report(y_train, training_predictions)

# Print the training classification report
print(training_report)

              precision    recall  f1-score   support

           0       0.80      0.72      0.76       103
           1       0.78      0.85      0.81       124

    accuracy                           0.79       227
   macro avg       0.79      0.78      0.78       227
weighted avg       0.79      0.79      0.79       227



In [11]:
# Create and save the confusion matrix for the testing data
test_matrix = confusion_matrix(y_test, testing_predictions)

# Print the confusion matrix for the testing data
print(test_matrix)

[[28  7]
 [ 7 34]]


In [12]:
# Create and save the testing classification report
testing_report = classification_report(y_test, testing_predictions)

# Print the testing classification report
print(testing_report)

              precision    recall  f1-score   support

           0       0.80      0.80      0.80        35
           1       0.83      0.83      0.83        41

    accuracy                           0.82        76
   macro avg       0.81      0.81      0.81        76
weighted avg       0.82      0.82      0.82        76



## Support Vector Machine

In [13]:
# Support vector machine linear classifier

model = SVC(kernel='linear')
model.fit(X_train, y_train)

In [14]:
# Model Accuracy

print('Test Accuracy: %.3f' % model.score(X_test, y_test))

Test Accuracy: 0.855


In [15]:
# Calculate the classification report

predictions = model.predict(X_test)
print(classification_report(y_test, predictions
                            ))

              precision    recall  f1-score   support

           0       0.83      0.86      0.85        35
           1       0.88      0.85      0.86        41

    accuracy                           0.86        76
   macro avg       0.85      0.86      0.85        76
weighted avg       0.86      0.86      0.86        76



## Find the Best K value for the original Data

In [16]:
# Create a list with of k-values
k = list (range(1,11))

In [17]:
# Create a a list to store inertia values
inertia = []

# Create a a list to store the values of k
k = list(range(1, 11))

In [18]:
# Create a for-loop where each value of k is evaluated using the K-means algorithm
# Fit the model using the spread_df DataFrame
# Append the value of the computed inertia from the `inertia_` attribute of the KMeans model instance
for i in k:
    k_model = KMeans(n_clusters=i, random_state=1)
    k_model.fit(heart_df)
    inertia.append(k_model.inertia_)

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [19]:
# Create a Dictionary that holds the list values for k and inertia
elbow_data = {"k": k, "inertia": inertia}

# Create a DataFrame using the elbow_data Dictionary
df_elbow = pd.DataFrame(elbow_data)

# Review the DataFrame
df_elbow.head()

Unnamed: 0,k,inertia
0,1,1088196.0
1,2,604278.6
2,3,470892.7
3,4,390734.4
4,5,331289.8


In [20]:
# Plot the DataFrame
df_elbow.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve", 
    xticks=k
)

## Principle Component Analysis

In [21]:
# Create a PCA model instance and set `n_components=2`.
pca = PCA(n_components = 2)

In [22]:
# Use the PCA model with `fit_transform`
heart_pca = pca.fit_transform(heart_df)

# View the first five rows of the DataFrame. 
heart_pca [:5]

array([[-12.26702452,   2.83905176],
       [  2.68813205, -39.93507522],
       [-42.94947732, -23.64058737],
       [-10.94309721, -28.42865979],
       [106.98116013, -15.85123911]])

In [23]:
# Retrieve the explained variance  
pca.explained_variance_ratio_

array([0.74816235, 0.15042298])

About 90% of the total variance is condensed into the 2 PCA variables.

In [24]:
# Create a new DataFrame with the PCA data.
heart_pca_df = pd.DataFrame(
    heart_pca,
    columns = ["PC1", "PC2"])


# Copy the crypto names from the original data
heart_pca_df["0"] = heart_df.index


# Set the index
heart_pca_df = heart_pca_df.set_index("0")

# Display sample data
heart_pca_df.head()

Unnamed: 0_level_0,PC1,PC2
0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,-12.267025,2.839052
1,2.688132,-39.935075
2,-42.949477,-23.640587
3,-10.943097,-28.42866
4,106.98116,-15.851239


## Find the Best K value for the PCA Data

In [25]:
# Create a list with the number of k-values from 1 to 11
K = list(range(1,11))

In [26]:
# Create an empty list to store the inertia values
inertia = []

# Create a for loop to compute the inertia with each possible value of k
# Inside the loop:
# 1. Create a KMeans model using the loop counter for the n_clusters
# 2. Fit the model to the data using `df_market_data_pca`
# 3. Append the model.inertia_ to the inertia list
for i in k:
    k_model = KMeans(n_clusters= i, random_state=0)
    k_model.fit(heart_pca_df)
    inertia.append(k_model.inertia_)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


In [27]:
# Create a dictionary with the data to plot the Elbow curve
elbow_data = {"k": k, "inertia": inertia}
# Create a DataFrame with the data to plot the Elbow curve
df_elbow2 = pd.DataFrame(elbow_data)
df_elbow2[:5]

Unnamed: 0,k,inertia
0,1,977837.337489
1,2,493982.22511
2,3,361197.822898
3,4,281404.81093
4,5,221821.817657


In [28]:
# Plot a line chart with all the inertia values computed with 
# the different values of k to visually identify the optimal value for k.
df_elbow2.hvplot.line(
    x="k", 
    y="inertia", 
    title="Elbow Curve_PCA", 
    xticks=k
)

In [29]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=2, random_state=0)

# Fit the model
model.fit(heart_pca_df)

# Make predictions
k_2 = model.predict(heart_pca_df)

# Create a copy of the customers_pca_df DataFrame
heart_pca_predictions_df = heart_pca_df.copy()

# Add a class column with the labels
heart_pca_predictions_df["segments"] = k_2

  super()._check_params_vs_input(X, default_n_init=10)


In [30]:
# Plot the clusters
heart_pca_predictions_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="segments"
)

In [31]:
# Create a PCA model instance and set `n_components=2`.
pca = PCA(n_components = 4)

# Use the PCA model with `fit_transform`
heart_pca = pca.fit_transform(heart_df)

# View the first five rows of the DataFrame. 
heart_pca [:5]

array([[-12.26702452,   2.83905176,  14.96055496,   6.91764709],
       [  2.68813205, -39.93507522,   0.83513195, -10.57137803],
       [-42.94947732, -23.64058737,   1.74772051,  -7.77986818],
       [-10.94309721, -28.42865979,  -7.05283928,   8.33768278],
       [106.98116013, -15.85123911, -14.86009469,   3.23749767]])

In [32]:
# Retrieve the explained variance  
pca.explained_variance_ratio_

array([0.74816235, 0.15042298, 0.08465607, 0.01622086])

About 91% of the total variance is condensed into the 4 PCA variables.

In [33]:
# Create a new DataFrame with the PCA data.
heart_pca_df = pd.DataFrame(
    heart_pca,
    columns = ["PC1", "PC2", "PC3", "PC4"])


# Copy the crypto names from the original data
heart_pca_df["0"] = heart_df.index


# Set the index
heart_pca_df = heart_pca_df.set_index("0")

# Display sample data
heart_pca_df.head()

Unnamed: 0_level_0,PC1,PC2,PC3,PC4
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,-12.267025,2.839052,14.960555,6.917647
1,2.688132,-39.935075,0.835132,-10.571378
2,-42.949477,-23.640587,1.747721,-7.779868
3,-10.943097,-28.42866,-7.052839,8.337683
4,106.98116,-15.851239,-14.860095,3.237498


In [34]:
# Define the model Kmeans model using the optimal value of k for the number of clusters.
model = KMeans(n_clusters=4, random_state=0)

# Fit the model
model.fit(heart_pca_df)

# Make predictions
k_4 = model.predict(heart_pca_df)

# Create a copy of the customers_pca_df DataFrame
heart_pca_predictions_df = heart_pca_df.copy()

# Add a class column with the labels
heart_pca_predictions_df["segments"] = k_4

  super()._check_params_vs_input(X, default_n_init=10)


In [37]:
# Plot the clusters
heart_pca_predictions_df.hvplot.scatter(
    x="PC1",
    y="PC2",
    by="segments"
)