In [8]:
from path import Path
import pandas as pd

In [9]:
data = Path('heart.csv')
heart_df = pd.read_csv(data)
heart_df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0


In [10]:
#Change names of columns
heart_df.columns=['age', 'sex','chest_pain_type','resting_blood_pressure','serum_cholestoral(mg/dl)', 'fasting_blood_sugar', 'resting_ecg_results','max_heart_rate_achieved', 'exercise_induced_angina', 'st_depression', 'st_slope_elevation', 'major_vessels', 'thal', 'heart_disease']
heart_df

Unnamed: 0,age,sex,chest_pain_type,resting_blood_pressure,serum_cholestoral(mg/dl),fasting_blood_sugar,resting_ecg_results,max_heart_rate_achieved,exercise_induced_angina,st_depression,st_slope_elevation,major_vessels,thal,heart_disease
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1020,59,1,1,140,221,0,1,164,1,0.0,2,0,2,1
1021,60,1,0,125,258,0,0,141,1,2.8,1,1,3,0
1022,47,1,0,110,275,0,0,118,1,1.0,1,1,2,0
1023,50,0,0,110,254,0,0,159,0,0.0,2,0,2,1


In [11]:
#List data types
heart_df.dtypes

age                           int64
sex                           int64
chest_pain_type               int64
resting_blood_pressure        int64
serum_cholestoral(mg/dl)      int64
fasting_blood_sugar           int64
resting_ecg_results           int64
max_heart_rate_achieved       int64
exercise_induced_angina       int64
st_depression               float64
st_slope_elevation            int64
major_vessels                 int64
thal                          int64
heart_disease                 int64
dtype: object

In [12]:
# Find null values
for column in heart_df.columns:
    print(f"Column {column} has {heart_df[column].isnull().sum()} null values")

Column age has 0 null values
Column sex has 0 null values
Column chest_pain_type has 0 null values
Column resting_blood_pressure has 0 null values
Column serum_cholestoral(mg/dl) has 0 null values
Column fasting_blood_sugar has 0 null values
Column resting_ecg_results has 0 null values
Column max_heart_rate_achieved has 0 null values
Column exercise_induced_angina has 0 null values
Column st_depression has 0 null values
Column st_slope_elevation has 0 null values
Column major_vessels has 0 null values
Column thal has 0 null values
Column heart_disease has 0 null values


In [13]:
# Create Gender DF
gender_df = heart_df[["sex", "heart_disease"]]
gender_df

Unnamed: 0,sex,heart_disease
0,1,0
1,1,0
2,1,0
3,1,0
4,0,0
...,...,...
1020,1,1
1021,1,0
1022,1,0
1023,0,1


In [14]:
# Create age_df
age_df = heart_df[["age", "heart_disease"]]
age_df

Unnamed: 0,age,heart_disease
0,52,0
1,53,0
2,70,0
3,61,0
4,62,0
...,...,...
1020,59,1
1021,60,0
1022,47,0
1023,50,1


In [15]:
# Feature and Target Columns
y = heart_df["heart_disease"]
X = heart_df.drop(columns="heart_disease")

In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    random_state=1, 
                                                    stratify=y)
X_train.shape

(768, 13)

In [17]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(solver='lbfgs',
                                max_iter=500,
                                random_state=1)

In [18]:
classifier.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(max_iter=500, random_state=1)

In [19]:
y_pred = classifier.predict(X_test)
results = pd.DataFrame({"Prediction": y_pred, "Actual": y_test}).reset_index(drop=True)
results.head(20)

Unnamed: 0,Prediction,Actual
0,1,0
1,1,1
2,1,1
3,0,0
4,1,1
5,1,0
6,1,1
7,1,1
8,1,1
9,1,1


In [20]:
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test, y_pred))

0.8326848249027238


# EXPORT TO DATABASE

In [21]:
from sqlalchemy import create_engine
from config import db_password

In [22]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5433/heart_data"

In [23]:
engine = create_engine(db_string)

In [24]:
heart_df.to_sql(name='risk_factors', con=engine)

OperationalError: (psycopg2.OperationalError) connection to server at "127.0.0.1", port 5433 failed: FATAL:  database "heart_data" does not exist

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [None]:
pip install psycopg2-binary 

In [None]:
gender_df.to_sql(name='gender', con=engine)

In [None]:
age_df.to_sql(name='age', con=engine)