In [4]:
import pandas as pd
import numpy as np

In [5]:
df = pd.read_csv('heart.csv')

In [6]:
df.head()

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [7]:
#DATA CLEANING
df.isna().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

#if there is null then
df = df['caa'].fillna(df.median)
df.isnull().sum()

In [8]:
#Data Transformation
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)

Number of duplicate rows ::  (1, 14)


In [9]:
df = df.drop_duplicates()
duplicate_rows = df[df.duplicated()]
print("Number of duplicate rows :: ", duplicate_rows.shape)

Number of duplicate rows ::  (0, 14)


In [10]:
#data integration
df1 = df[['age','sex','output']]
df2 = df[['age', 'oldpeak']]

In [11]:
df3 = pd.merge(df1,df2,on='age')
df3.head()

Unnamed: 0,age,sex,output,oldpeak
0,63,1,1,2.3
1,63,1,1,0.0
2,63,1,1,0.0
3,63,1,1,1.4
4,63,1,1,1.8


In [35]:
#Data Transformation
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Read the dataset into a pandas DataFrame
df = pd.read_csv("heart.csv")

# Scale numeric features using Min-Max scaling
scaler = MinMaxScaler()
numeric_columns = ["age", "chol", "trtbps"]
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Print the transformed dataset
print(df)


          age  sex  cp    trtbps      chol  fbs  restecg  thalachh  exng  \
0    0.708333    1   3  0.481132  0.244292    1        0       150     0   
1    0.166667    1   2  0.339623  0.283105    0        1       187     0   
2    0.250000    0   1  0.339623  0.178082    0        0       172     0   
3    0.562500    1   1  0.245283  0.251142    0        1       178     0   
4    0.583333    0   0  0.245283  0.520548    0        1       163     1   
..        ...  ...  ..       ...       ...  ...      ...       ...   ...   
298  0.583333    0   0  0.433962  0.262557    0        1       123     1   
299  0.333333    1   3  0.150943  0.315068    0        1       132     0   
300  0.812500    1   0  0.471698  0.152968    1        1       141     0   
301  0.583333    1   0  0.339623  0.011416    0        1       115     1   
302  0.583333    0   1  0.339623  0.251142    0        0       174     0   

     oldpeak  slp  caa  thall  output  
0        2.3    0    0      1       1  
1      

In [36]:
#Error Correcting
import pandas as pd

# Read the dataset into a pandas DataFrame
df = pd.read_csv("heart.csv")

# Manually correct errors in the "sex" column
df.loc[df["sex"] == 0, "sex"] = "female"
df.loc[df["sex"] == 1, "sex"] = "male"

# Print the corrected dataset
print(df)


     age     sex  cp  trtbps  chol  fbs  restecg  thalachh  exng  oldpeak  \
0     63    male   3     145   233    1        0       150     0      2.3   
1     37    male   2     130   250    0        1       187     0      3.5   
2     41  female   1     130   204    0        0       172     0      1.4   
3     56    male   1     120   236    0        1       178     0      0.8   
4     57  female   0     120   354    0        1       163     1      0.6   
..   ...     ...  ..     ...   ...  ...      ...       ...   ...      ...   
298   57  female   0     140   241    0        1       123     1      0.2   
299   45    male   3     110   264    0        1       132     0      1.2   
300   68    male   0     144   193    1        1       141     0      3.4   
301   57    male   0     130   131    0        1       115     1      1.2   
302   57  female   1     130   236    0        0       174     0      0.0   

     slp  caa  thall  output  
0      0    0      1       1  
1      0    0

In [12]:
#Error Correction
df.nunique()


age          41
sex           2
cp            4
trtbps       49
chol        152
fbs           2
restecg       3
thalachh     91
exng          2
oldpeak      40
slp           3
caa           5
thall         4
output        2
dtype: int64

In [13]:
df['caa'].unique() # ‘ca’ ranges from 0–3, however, df.nunique() listed 0–4. So lets find the ‘4’ and change them to NaN.

array([0, 2, 1, 3, 4])

In [14]:
df.caa.value_counts()

0    175
1     65
2     38
3     20
4      4
Name: caa, dtype: int64

In [15]:
df[df['caa']==4]

Unnamed: 0,age,sex,cp,trtbps,chol,fbs,restecg,thalachh,exng,oldpeak,slp,caa,thall,output
92,52,1,2,138,223,0,1,169,0,0.0,2,4,2,1
158,58,1,1,125,220,0,1,144,0,0.4,1,4,3,1
163,38,1,2,138,175,0,1,173,0,0.0,2,4,2,1
251,43,1,0,132,247,1,0,143,1,0.1,1,4,3,0


In [16]:
df.loc[df['caa']==4,'caa']=np.NaN

In [17]:
df['caa'].unique()

array([ 0.,  2.,  1.,  3., nan])

In [18]:
df['thall'].unique() #‘thal’ ranges from 1–3, however, df.nunique() listed 0–3. There are two values of ‘0’. So lets change them to NaN

array([1, 2, 3, 0])

In [19]:
df['thall'].value_counts() # after adding nun

2    165
3    117
1     18
0      2
Name: thall, dtype: int64

In [20]:
df.loc[df['thall']==0,'thall']=np.NaN

In [21]:
df['thall'].unique()

array([ 1.,  2.,  3., nan])

In [22]:
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         4
thall       2
output      0
dtype: int64

In [23]:
df=df.fillna(df.median())
df.isnull().sum()

age         0
sex         0
cp          0
trtbps      0
chol        0
fbs         0
restecg     0
thalachh    0
exng        0
oldpeak     0
slp         0
caa         0
thall       0
output      0
dtype: int64

In [24]:
#Data Modelling
from sklearn.model_selection import train_test_split

In [25]:
X= df.drop('output',axis=1)
y=df['output']

In [26]:
 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [27]:
from sklearn.linear_model import LogisticRegression

In [28]:
lr = LogisticRegression()

In [29]:
lr.fit(X_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
predictions = lr.predict(X_test)

In [31]:
from sklearn import metrics

print('MAE:', metrics.mean_absolute_error(y_test, predictions))
print('MSE:', metrics.mean_squared_error(y_test, predictions))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

MAE: 0.15
MSE: 0.15
RMSE: 0.3872983346207417


In [32]:
from sklearn.metrics import accuracy_score,classification_report
print(accuracy_score(y_test,predictions)*100)
print(classification_report(y_test,predictions))

85.0
              precision    recall  f1-score   support

           0       0.83      0.81      0.82        43
           1       0.86      0.88      0.87        57

    accuracy                           0.85       100
   macro avg       0.85      0.85      0.85       100
weighted avg       0.85      0.85      0.85       100



In [45]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Read the dataset into a pandas DataFrame
df = pd.read_csv("heart.csv")

# Preprocessing
scaler = MinMaxScaler()
numeric_columns = ["age", "chol", "trtbps"]
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# Split the dataset into features (X) and target variable (y)
X = df.drop("output", axis=1)
y = df["output"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.8852459016393442


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
