In [10]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

Data Collection and Analysis

In [5]:
# Loading the diabetes dataset to a pandas DataFrame
heartdisease_dataset = pd.read_csv("/content/heart_disease.csv")

In [6]:
# Printing the first 5 rows of the dataset
heartdisease_dataset.head()

Unnamed: 0,Age,Gender,Blood Pressure,Cholesterol Level,Exercise Habits,Smoking,Family Heart Disease,Diabetes,BMI,High Blood Pressure,...,High LDL Cholesterol,Alcohol Consumption,Stress Level,Sleep Hours,Sugar Consumption,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level,Heart Disease Status
0,56.0,Male,153.0,155.0,High,Yes,Yes,No,24.991591,Yes,...,No,High,Medium,7.633228,Medium,342.0,,12.969246,12.38725,No
1,69.0,Female,146.0,286.0,High,No,Yes,Yes,25.221799,No,...,No,Medium,High,8.744034,Medium,133.0,157.0,9.355389,19.298875,No
2,46.0,Male,126.0,216.0,Low,No,No,No,29.855447,No,...,Yes,Low,Low,4.44044,Low,393.0,92.0,12.709873,11.230926,No
3,32.0,Female,122.0,293.0,High,Yes,Yes,No,24.130477,Yes,...,Yes,Low,High,5.249405,High,293.0,94.0,12.509046,5.961958,No
4,60.0,Male,166.0,242.0,Low,Yes,Yes,Yes,20.486289,Yes,...,No,Low,High,7.030971,High,263.0,154.0,10.381259,8.153887,No


In [7]:
# number of rows and columns in this dataset
heartdisease_dataset.shape

(10000, 21)

In [8]:
# Getting the statistical measures of the data
heartdisease_dataset.describe()

Unnamed: 0,Age,Blood Pressure,Cholesterol Level,BMI,Sleep Hours,Triglyceride Level,Fasting Blood Sugar,CRP Level,Homocysteine Level
count,9971.0,9981.0,9970.0,9978.0,9975.0,9974.0,9978.0,9974.0,9980.0
mean,49.296259,149.75774,225.425577,29.077269,6.991329,250.734409,120.142213,7.472201,12.456271
std,18.19397,17.572969,43.575809,6.307098,1.753195,87.067226,23.584011,4.340248,4.323426
min,18.0,120.0,150.0,18.002837,4.000605,100.0,80.0,0.003647,5.000236
25%,34.0,134.0,187.0,23.658075,5.449866,176.0,99.0,3.674126,8.723334
50%,49.0,150.0,226.0,29.079492,7.003252,250.0,120.0,7.472164,12.409395
75%,65.0,165.0,263.0,34.520015,8.531577,326.0,141.0,11.255592,16.140564
max,80.0,180.0,300.0,39.996954,9.999952,400.0,160.0,14.997087,19.999037


In [11]:
heartdisease_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Age                   9971 non-null   float64
 1   Gender                9981 non-null   object 
 2   Blood Pressure        9981 non-null   float64
 3   Cholesterol Level     9970 non-null   float64
 4   Exercise Habits       9975 non-null   object 
 5   Smoking               9975 non-null   object 
 6   Family Heart Disease  9979 non-null   object 
 7   Diabetes              9970 non-null   object 
 8   BMI                   9978 non-null   float64
 9   High Blood Pressure   9974 non-null   object 
 10  Low HDL Cholesterol   9975 non-null   object 
 11  High LDL Cholesterol  9974 non-null   object 
 12  Alcohol Consumption   7414 non-null   object 
 13  Stress Level          9978 non-null   object 
 14  Sleep Hours           9975 non-null   float64
 15  Sugar Consumption   

0 ---> Non-Diabetic
1 ---> Diabetic

In [28]:
# Seperating the data and labels
X = heartdisease_dataset.drop(columns= "CRP Level", axis=1)
y = heartdisease_dataset['CRP Level']

# Remove rows with NaN values in the target variable
nan_indices = y.isnull()
X = X[~nan_indices]
y = y[~nan_indices]

In [15]:
print(X)

       Age  Gender  Blood Pressure  Cholesterol Level Exercise Habits Smoking  \
0     56.0    Male           153.0              155.0            High     Yes   
1     69.0  Female           146.0              286.0            High      No   
2     46.0    Male           126.0              216.0             Low      No   
3     32.0  Female           122.0              293.0            High     Yes   
4     60.0    Male           166.0              242.0             Low     Yes   
...    ...     ...             ...                ...             ...     ...   
9995  25.0  Female           136.0              243.0          Medium     Yes   
9996  38.0    Male           172.0              154.0          Medium      No   
9997  73.0    Male           152.0              201.0            High     Yes   
9998  23.0    Male           142.0              299.0             Low     Yes   
9999  38.0  Female           128.0              193.0          Medium     Yes   

     Family Heart Disease D

In [16]:
print(y)

0       12.969246
1        9.355389
2       12.709873
3       12.509046
4       10.381259
          ...    
9995     3.588814
9996     2.658267
9997     4.408867
9998     7.215634
9999    14.387810
Name: CRP Level, Length: 10000, dtype: float64


In [25]:
print(y.isnull().sum())     # check NaN count in target

26


In [26]:
print(X.isnull().sum())     # check NaN count in features

Age                       29
Gender                    19
Blood Pressure            19
Cholesterol Level         30
Exercise Habits           25
Smoking                   25
Family Heart Disease      21
Diabetes                  30
BMI                       22
High Blood Pressure       26
Low HDL Cholesterol       25
High LDL Cholesterol      26
Alcohol Consumption     2586
Stress Level              22
Sleep Hours               25
Sugar Consumption         30
Triglyceride Level        26
Fasting Blood Sugar       22
Homocysteine Level        20
Heart Disease Status       0
dtype: int64


In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [32]:
scalar = StandardScaler()

In [35]:
scalar.fit(X_train)

In [37]:
standardized_data = scalar.transform(X_train) # This line is not needed
X_train_standardized = scalar.transform(X_train)
X_test_standardized = scalar.transform(X_test)

X = standardized_data # Keep X as standardized_data for consistency with later cells # This line is also not needed

In [38]:
print("X_train_standardized shape:", X_train_standardized.shape)
print("X_test_standardized shape:", X_test_standardized.shape)

X_train_standardized shape: (7979, 36)
X_test_standardized shape: (1995, 36)


In [39]:
X_train_standardized = scalar.transform(X_train)
X_test_standardized = scalar.transform(X_test)

X = standardized_data # Keep X as standardized_data for consistency with later cells

In [40]:
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_train shape:", X_train_standardized.shape)
print("X_test shape:", X_test_standardized.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X shape: (7979, 36)
y shape: (9974,)
X_train shape: (7979, 36)
X_test shape: (1995, 36)
y_train shape: (7979,)
y_test shape: (1995,)


In [42]:
print(X.shape, X_train.shape, X_test.shape)

(7979, 36) (7979, 36) (1995, 36)


Training the model

In [43]:
classifier = LogisticRegression(max_iter=1000)

In [49]:
classifier.fit(X_train, y_train)

ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

Model Evaluation

Accuracy Score

In [None]:
# accuracy score on the test data
X_test_prediction = Classifier.predict(X_test_standardized)
test_data_accuracy = accuracy_score(X_test_prediction, y_test)

In [None]:
print("Accuracy score of the test data :", test_data_accuracy)

Accuracy score of the test data : 0.7727272727272727


Making a Predictive system

In [None]:
input_data = (10,101,76,48,180,32.9,0.171,63)

# Changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# Reshape the array as we are predicting for one instance
input_data_reshape = input_data_as_numpy_array.reshape(1, -1)

# Standardize the input_data using the fitted scalar
std_data = scalar.transform(input_data_reshape)
print(std_data)

prediction = Classifier.predict(std_data)
print(prediction)
if (prediction[0] == 0):
  print("The person is not diabetic")
else:
  print("The person is diabetic")

[[9.98415598e+00 1.01682529e+02 7.51871626e+01 4.76075140e+01
  1.74959081e+02 3.23951653e+01 1.73568432e-01 6.30656221e+01]]
[1]
The person is diabetic


In [34]:
# Select categorical columns
categorical_cols = X_train.select_dtypes(include='object').columns

# Apply one-hot encoding to training and testing sets
X_train = pd.get_dummies(X_train, columns=categorical_cols, dummy_na=False)
X_test = pd.get_dummies(X_test, columns=categorical_cols, dummy_na=False)

# Ensure that the columns are the same in both training and testing sets
X_train, X_test = X_train.align(X_test, join='inner', axis=1, fill_value=0)

print("X_train shape after one-hot encoding:", X_train.shape)
print("X_test shape after one-hot encoding:", X_test.shape)

X_train shape after one-hot encoding: (7979, 36)
X_test shape after one-hot encoding: (1995, 36)


Imputing Missing Values in Features

In [46]:
from sklearn.impute import SimpleImputer

# Impute missing values in X
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X = imputer.fit_transform(X)

print("Shape of X after imputation:", X.shape)

Shape of X after imputation: (7979, 36)
