In [1]:
from path import Path
import pandas as pd

In [2]:
data = Path('heart.csv')
heart_df = pd.read_csv(data)
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [3]:
#List data types
heart_df.dtypes

age           int64
sex           int64
cp            int64
trestbps      int64
chol          int64
fbs           int64
restecg       int64
thalach       int64
exang         int64
oldpeak     float64
slope         int64
ca            int64
thal          int64
target        int64
dtype: object

In [4]:
# Find null values
for column in heart_df.columns:
    print(f"Column {column} has {heart_df[column].isnull().sum()} null values")

Column age has 0 null values
Column sex has 0 null values
Column cp has 0 null values
Column trestbps has 0 null values
Column chol has 0 null values
Column fbs has 0 null values
Column restecg has 0 null values
Column thalach has 0 null values
Column exang has 0 null values
Column oldpeak has 0 null values
Column slope has 0 null values
Column ca has 0 null values
Column thal has 0 null values
Column target has 0 null values


In [16]:
# Create Gender DF
gender_df = heart_df[["sex", "target"]]
gender_df

Unnamed: 0,sex,target
0,1,0
1,1,0
2,1,0
3,1,0
4,0,0
...,...,...
1020,1,1
1021,1,0
1022,1,0
1023,0,1


In [18]:
# Create age_df
age_df = heart_df[["age", "target"]]
age_df

Unnamed: 0,age,target
0,52,0
1,53,0
2,70,0
3,61,0
4,62,0
...,...,...
1020,59,1
1021,60,0
1022,47,0
1023,50,1


In [5]:
# Feature and Target Columns
y = heart_df["target"]
X = heart_df.drop(columns="target")

In [6]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(768, 13)

In [7]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=500,
                                random_state=1)

In [8]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=500, random_state=1)

In [9]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,1
2,1,1
3,0,0
4,1,1
5,1,0
6,1,1
7,1,1
8,1,1
9,1,1


In [10]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8326848249027238


# EXPORT TO DATABASE

In [11]:
from sqlalchemy import create_engine
from config import db_password

In [12]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/heart_data"

In [13]:
engine = create_engine(db_string)

In [14]:
heart_df.to_sql(name='risk_factors', con=engine)

In [15]:
pip install psycopg2-binary 

Note: you may need to restart the kernel to use updated packages.


In [17]:
gender_df.to_sql(name='gender', con=engine)

In [19]:
age_df.to_sql(name='age', con=engine)