# Machine Learning Basics

In [14]:
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,  f_classif, f_regression
import matplotlib.pyplot as plt

In [15]:
# Load the heart dataset from the CSV file next to this notebook.
df = pd.read_csv(filepath_or_buffer="heart.csv")

In [16]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Feature Selection:

We run feature selection for two target variables: `target` (the heart-attack label) and `chol` (cholesterol).

### Selection for Heart Attack

In [17]:
# Split candidate columns into "continuous" (integer/float dtype) and
# "categorical" (any other dtype) and print both lists.
#
# NOTE(review): the [3:] slice skips the first three columns of df and the
# result still contains 'target' (see the printed list) — confirm both are
# intended before using these lists as predictor sets.
list_predictor_continuous = []
list_predictor_categorical = []

candidate_columns = df.columns[3:]

for name in candidate_columns:
    column_dtype = df[name].dtype
    # Accept every integer/float width (int32, float32, nullable Int64, ...),
    # not only the 64-bit defaults; comparing against the literal names
    # 'float64'/'int64' would file those columns under "categorical".
    is_numeric = (
        pd.api.types.is_integer_dtype(column_dtype)
        or pd.api.types.is_float_dtype(column_dtype)
    )
    if is_numeric:
        list_predictor_continuous.append(name)
    else:
        list_predictor_categorical.append(name)

print('List of continuous predictor:')
print(list_predictor_continuous)
print('List of categorical predictor:')
print(list_predictor_categorical)

List of continuous predictor:
['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
List of categorical predictor:
[]


In [18]:
# Candidate predictors for the heart-attack classifier.
features = [
    'trestbps',
    'chol',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
]

In [19]:
# Predictor frame and label column for the heart-attack selection.
X, y = df[features], df['target']

In [20]:
# Feature extraction: score every column of X against y with f_classif
# and keep the 4 highest-scoring ones.
fs = SelectKBest(score_func=f_classif, k=4).fit(X, y)

# Boolean mask over X's columns marking the retained features.
fs_support = fs.get_support()
fs_feature = X.columns[fs_support].tolist()
print(fs_feature, 'selected features')

['thalach', 'exang', 'oldpeak', 'ca'] selected features


### Selection for Cholesterol

In [9]:
# Candidate predictors for cholesterol; 'chol' itself is left out because it is the response here.
features = [
    'trestbps',
    'fbs',
    'restecg',
    'thalach',
    'exang',
    'oldpeak',
    'slope',
    'ca',
    'thal',
]

In [10]:
# Predictor frame and response column for the cholesterol selection.
X, y = df[features], df['chol']

In [12]:
# Feature extraction: score every column of X against y with f_regression
# and keep the 4 highest-scoring ones.
fs = SelectKBest(score_func=f_regression, k=4).fit(X, y)

# Boolean mask over X's columns marking the retained features.
fs_support = fs.get_support()
fs_feature = X.columns[fs_support].tolist()
print(fs_feature, 'selected features')

['trestbps', 'restecg', 'ca', 'thal'] selected features


## Data Preprocessing

In [21]:
from sklearn.model_selection import train_test_split

# Rebuild the classification inputs here instead of relying on whatever
# `features`, `X` and `y` were left in the kernel. The cholesterol-selection
# cells earlier in the notebook reassign all three (y = df['chol']), so a
# top-to-bottom run would otherwise fit the classifier below on a continuous
# cholesterol response instead of the 0/1 `target` label.
features = ['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
X = df[features]
y = df['target']

# Hold out 30% of the rows for evaluation; random_state=0 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Classification Models

In [22]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

# Name the solver explicitly. The recorded fit output shows solver='warn',
# i.e. the run relied on the library default, and that default differs between
# scikit-learn releases ('liblinear' before 0.22, 'lbfgs' from 0.22 on).
# Pinning 'liblinear' keeps the fitted model consistent with the recorded run
# and avoids the default-change warning.
logreg = LogisticRegression(solver='liblinear')

In [23]:
# Fit the classifier on the training split; the fitted estimator is the cell's displayed value.
logreg.fit(X=X_train, y=y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [24]:
# Predicted class labels for the held-out rows.
predict_log = logreg.predict(X=X_test)

In [25]:
# Mean accuracy of the fitted classifier on the held-out split.
test_accuracy = logreg.score(X_test, y_test)
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

Accuracy of logistic regression classifier on test set: 0.76


In [28]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the held-out predictions.
report_log = classification_report(y_true=y_test, y_pred=predict_log)
print(report_log)

              precision    recall  f1-score   support

           0       0.81      0.66      0.73        44
           1       0.73      0.85      0.78        47

    accuracy                           0.76        91
   macro avg       0.77      0.76      0.75        91
weighted avg       0.77      0.76      0.76        91



## Hyperparameter Tuning