# Machine Learning Basics

In [98]:
import pandas as pd
import numpy as np
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,  f_classif, f_regression
import matplotlib.pyplot as plt

In [99]:
df = pd.read_csv("heart.csv")

In [100]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


## Feature Selection:

Our target variables include target and cholestral

### Selection for Heart Attack

In [4]:
list_predictor_continuous = []
list_predictor_categorical = []

for name in df.columns[3:]:
    if df[name].dtype == 'float64' or df[name].dtype == 'int64':
        list_predictor_continuous.append(name)
    else:
        list_predictor_categorical.append(name)

print('List of continuous predictor:')
print(list_predictor_continuous)
print('List of categorical predictor:')
print(list_predictor_categorical)

List of continuous predictor:
['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']
List of categorical predictor:
[]


In [5]:
features = ['trestbps', 'chol', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

In [6]:
X = df[features]
y = df['target']

In [7]:
# feature extraction
fs = SelectKBest(score_func=f_classif, k=4)
fs.fit_transform(X, y)

fs_support = fs.get_support()
fs_feature = X.loc[:,fs_support].columns.tolist()
print(str(fs_feature), 'selected features')

['thalach', 'exang', 'oldpeak', 'ca'] selected features


### Selection for Cholestrol

In [8]:
features = ['trestbps', 'fbs', 'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

In [9]:
X = df[features]
y = df['chol']

In [10]:
# feature extraction
fs = SelectKBest(score_func=f_regression, k=4)
fs.fit_transform(X, y)

fs_support = fs.get_support()
fs_feature = X.loc[:,fs_support].columns.tolist()
print(str(fs_feature), 'selected features')

['trestbps', 'restecg', 'ca', 'thal'] selected features


## Data Preprocessing

In [101]:
new_features = ['thalach', 'exang', 'oldpeak', 'ca']
X = df[new_features]

In [102]:
y = df['target']

In [103]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Classification Models

In [86]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

In [87]:
logreg.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [88]:
predict_log = logreg.predict(X_test)

In [89]:
print('Accuracy of logistic regression classifier on test set: {:.2f}'.format(logreg.score(X_test, y_test)))

Accuracy of logistic regression classifier on test set: 0.73


In [90]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predict_log))

              precision    recall  f1-score   support

           0       0.77      0.61      0.68        44
           1       0.70      0.83      0.76        47

    accuracy                           0.73        91
   macro avg       0.73      0.72      0.72        91
weighted avg       0.73      0.73      0.72        91



## Hyperparamter Tuning

In [91]:
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]
# define grid search
grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(estimator=logreg, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
grid_result = grid_search.fit(X, y)



In [92]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.822327 using {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.819182 (0.064385) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.819182 (0.064385) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.820755 (0.065857) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.820755 (0.065857) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.820755 (0.065857) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.822327 (0.064127) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.805031 (0.078880) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.805031 (0.078880) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.816038 (0.071134) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.789308 (0.076166) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.789308 (0.076166) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.797170 (0.070387) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.732704 (0.093801) with

This means redefine logistic regression: 
logreg = LogisticRegression(C = 10, penalty = 'l2', solver = 'liblinear')

## Regression


In [109]:
features_num = ['trestbps', 'restecg', 'ca', 'thal']
X = df[features_num]

In [110]:
y = df['chol']

In [111]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [112]:
from sklearn.linear_model import LinearRegression

In [113]:
linear = LinearRegression()

In [114]:
linear.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [115]:
predict_linear = linear.predict(X_test)

In [118]:
# Importing r2_square
from sklearn.metrics import r2_score

# Checking the R-squared value
r_squared = r2_score(y_test, predict_linear)
r_squared

0.012249314027290237

Try new model? Or Hyperparameter Tuning?