In [1]:
import pandas as pd

# Load the dataset from the GitHub link
url = "https://raw.githubusercontent.com/kb22/Heart-Disease-Prediction/dbd27c35db3a128f7f87a2d1b8200f1f14e4affb/dataset.csv"
df = pd.read_csv(url)

# Display the first few rows of the dataset to inspect it
df.head()



Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Check for missing values in the dataset
df.isnull().sum()

# Check for duplicates
df.duplicated().sum()


1

In [5]:
# Drop duplicates
df = df.drop_duplicates()

# Fill missing values for numerical columns with the mean and categorical with the mode
df['age'] = df['age'].fillna(df['age'].mean())
df['trestbps'] = df['trestbps'].fillna(df['trestbps'].mean())
df['chol'] = df['chol'].fillna(df['chol'].mean())
df['fbs'] = df['fbs'].fillna(df['fbs'].mode()[0])

# Check again for missing values
df.isnull().sum()


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [7]:
# Encode 'sex' as a binary variable (Male = 1, Female = 0)
df['sex'] = df['sex'].map({'male': 1, 'female': 0})

# Encode 'cp' (chest pain type) as a categorical variable using one-hot encoding
df = pd.get_dummies(df, columns=['cp'], drop_first=True)

# Check the transformed dataset
df.head()


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_1,cp_2,cp_3
0,63,,145,233,1,0,150,0,2.3,0,0,1,1,False,False,True
1,37,,130,250,0,1,187,0,3.5,0,0,2,1,False,True,False
2,41,,130,204,0,0,172,0,1.4,2,0,2,1,True,False,False
3,56,,120,236,0,1,178,0,0.8,2,0,2,1,True,False,False
4,57,,120,354,0,1,163,1,0.6,2,0,2,1,False,False,False


In [9]:
from sklearn.preprocessing import StandardScaler

# Initialize the scaler
scaler = StandardScaler()

# Normalize numerical columns: age, trestbps, and chol
df[['age', 'trestbps', 'chol']] = scaler.fit_transform(df[['age', 'trestbps', 'chol']])

# Check the transformed dataset
df.head()


Unnamed: 0,age,sex,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target,cp_1,cp_2,cp_3
0,0.949794,,0.764066,-0.261285,1,0,150,0,2.3,0,0,1,1,False,False,True
1,-1.928548,,-0.091401,0.067741,0,1,187,0,3.5,0,0,2,1,False,True,False
2,-1.485726,,-0.091401,-0.822564,0,0,172,0,1.4,2,0,2,1,True,False,False
3,0.174856,,-0.661712,-0.203222,0,1,178,0,0.8,2,0,2,1,True,False,False
4,0.285561,,-0.661712,2.080602,0,1,163,1,0.6,2,0,2,1,False,False,False


In [11]:
from sklearn.model_selection import train_test_split

# Features (X) and target (y)
X = df.drop('fbs', axis=1)  # Dropping 'fbs' as it will be the target
y = df['fbs']  # Target variable is 'fbs'

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Check the dimensions of the training data
X_train.shape, X_test.shape


((211, 15), (91, 15))

In [17]:
from sklearn.impute import SimpleImputer

# Create an imputer to fill missing values with the column mean for numerical features
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to the entire dataset
df_imputed = imputer.fit_transform(df)

# Convert back to DataFrame after imputation
df = pd.DataFrame(df_imputed, columns=df.columns)

# Check for any remaining missing values
print("Missing values after imputation:", df.isnull().sum().sum())
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Features (X) and target (y)
X = df.drop('fbs', axis=1)  # Drop 'fbs' as it will be the target
y = df['fbs']  # Target variable is 'fbs'

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Set max_iter to handle convergence issues if necessary

# Train the model
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Evaluation metrics (confusion matrix, accuracy, precision, recall, F1-score)
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

conf_matrix = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [19]:
# Check for missing values in the training and test sets
print("Missing values in X_train:", X_train.isnull().sum().sum())
print("Missing values in X_test:", X_test.isnull().sum().sum())
from sklearn.impute import SimpleImputer

# Create an imputer to fill missing values with the column mean for numerical features
imputer = SimpleImputer(strategy='mean')

# Apply the imputer to both training and test sets
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)  # Notice we're only using transform() on the test set

# Check if there are any missing values left
print("Missing values in X_train after imputation:", pd.isnull(X_train).sum().sum())
print("Missing values in X_test after imputation:", pd.isnull(X_test).sum().sum())
# Initialize and train the Logistic Regression model
model = LogisticRegression(max_iter=1000)  # Set max_iter to ensure convergence if necessary
model.fit(X_train, y_train)

# Predict on the test set
y_pred = model.predict(X_test)

# Confusion Matrix and Evaluation Metrics
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred)

# Accuracy, Precision, Recall, F1-Score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

# Print the evaluation metrics
print(f'Confusion Matrix:\n{conf_matrix}')
print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-Score: {f1}')


Missing values in X_train: 211
Missing values in X_test: 91
Missing values in X_train after imputation: 0
Missing values in X_test after imputation: 0
Confusion Matrix:
[[68  5]
 [17  1]]
Accuracy: 0.7582417582417582
Precision: 0.16666666666666666
Recall: 0.05555555555555555
F1-Score: 0.08333333333333333


