In [36]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# 1. Load dataset

In [2]:
# Relative path to heart.csv. Despite its name, `data_dir` holds a file path,
# not a directory.
data_dir = "dataset/heart.csv"
dataset = pd.read_csv(filepath_or_buffer=data_dir)

# Quick sanity check on the number of rows and columns that were read.
print("dataset shape: {}".format(dataset.shape))

dataset shape: (303, 14)


# 2. Split train test

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
train_set, test_set = train_test_split(dataset, random_state=42, test_size=0.2)

# Rebuild both frames from their raw values so each one gets a fresh default
# index. NOTE(review): going through `.values` also coerces every column to one
# common dtype - confirm that is intended.
train_set = pd.DataFrame(data=train_set.values, columns=train_set.columns)
test_set = pd.DataFrame(data=test_set.values, columns=test_set.columns)

print("train_set: {}".format(train_set.shape))
print("test_set: {}".format(test_set.shape))

train_set: (242, 14)
test_set: (61, 14)


# 3. Preprocessing data

## 3.1 Numerical

In [4]:
numerical_columns = ["age", "trestbps", "chol", "thalach", "oldpeak"]

train_numerical_set = train_set[numerical_columns]

# The scaler is fitted on the training rows only and then applied to those
# same rows; the result is wrapped back into a DataFrame with the original
# column names.
std_scaler = StandardScaler().fit(train_numerical_set)
train_set_normalized = pd.DataFrame(
    std_scaler.transform(train_numerical_set),
    columns=numerical_columns,
)

## 3.2 Categorical

### 3.2.1 Binary

In [5]:
# Binary columns are passed through unchanged. "target" is part of this group
# and is separated from the features later, in the merge step.
binary_columns = ["exang", "fbs", "sex", "target"]

train_binary_set = train_set.loc[:, binary_columns]

### 3.2.2 Multi class

In [6]:
multiclass_columns = ["cp", "restecg", "slope", "ca", "thal"]

train_multiclass_set = train_set[multiclass_columns]

# One-hot encode every multi-class column; the categories are learned from the
# training rows.
encoder = OneHotEncoder(categories="auto")
encoder.fit(train_multiclass_set)
train_set_encoded = encoder.transform(train_multiclass_set)

# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement and fall back only on older installations, so the
# cell runs on a fresh kernel regardless of the installed version.
if hasattr(encoder, "get_feature_names_out"):
    encoded_columns = encoder.get_feature_names_out(multiclass_columns)
else:
    encoded_columns = encoder.get_feature_names(multiclass_columns)

# The encoder returns a sparse matrix; densify it before building the frame.
train_set_encoded = pd.DataFrame(train_set_encoded.toarray(), columns=encoded_columns)

## 3.3 Merge

In [35]:
# Put the three pre-processed groups side by side. The concatenation aligns
# rows on the index, so all three frames must share the same row index.
train_set_processed = pd.concat(
    [train_set_normalized, train_binary_set, train_set_encoded],
    axis="columns",
)

# Split the label column off from the feature matrix.
y_train = train_set_processed["target"].values
x_train = train_set_processed.drop(columns=["target"]).values

# 4. Get default accuracy

In [77]:
# Baseline: logistic regression with a very large C (1e10), used here to fit
# the model without any meaningful regularization.
logis_reg = LogisticRegression(C=1e10, solver="lbfgs").fit(x_train, y_train)

# NOTE: the score is computed on the same rows the model was fitted on, so
# this is training accuracy, not a held-out estimate.
acc_before = logis_reg.score(x_train, y_train)

print("Accuracy before: {:.2f}%".format(acc_before * 100))

Accuracy before: 88.02%


# 5. Removing collinearity

# 6. F-test
\begin{equation}
F = \frac{(LSS(overall) - LSS(fit)) / p}{LSS(fit) / (n - p - 1)}
\tag{1}
\end{equation}

\begin{equation}
LSS(fit) = \sum_{i=1}^{n} \left[ y_i \log(\hat{y}_i) + (1 - y_i) \log(1 - \hat{y}_i) \right]
\tag{2}
\end{equation}

\begin{equation}
LSS(overall) = \sum_{i=1}^{n} \left[ y_i \log(\bar{y}) + (1 - y_i) \log(1 - \bar{y}) \right], \qquad \bar{y} = \frac{\#\{i : y_i = 1\}}{n}
\tag{3}
\end{equation}

Where  
n: the number of observations  
p: the number of predictors

In [112]:
# Calculate LSS(overall)
def LSS_overall_cal(y):
    """Log-likelihood of the labels under a constant-probability (null) model.

    Every observation is assigned the same probability of being class 1,
    namely the fraction of entries in ``y`` that equal 1.

    Parameters
    ----------
    y : array-like
        Class labels. Entries equal to 1 count as positive; everything else
        counts as negative.

    Returns
    -------
    float
        Sum over all observations of the log-probability of the observed
        label. ``1e-7`` is added inside each logarithm to avoid ``log(0)``.

    Raises
    ------
    ZeroDivisionError
        If ``y`` is empty.
    """
    # Coerce first: on a plain Python list `y == 1` is a single bool instead of
    # an element-wise mask, which would silently produce a wrong result.
    y = np.asarray(y)
    is_positive = y == 1

    # Fraction of positive labels = probability predicted for every row.
    overall_prob = int(is_positive.sum()) / len(y)

    LSS_overall = np.sum(
        np.where(
            is_positive,
            np.log(overall_prob + 1e-7),
            np.log(1 - overall_prob + 1e-7),
        )
    )
    return LSS_overall

# Calculate LSS(fit)
def LSS_fit_cal(classifier, x, y):
    """Fit ``classifier`` on ``(x, y)`` and return the log-likelihood of the fit.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit(x, y)`` and ``predict_proba(x)``. It is fitted
        in place, so any previous fit is overwritten.
    x : array-like
        Feature matrix passed straight to the classifier.
    y : array-like
        Class labels. Entries equal to 1 count as positive; everything else
        counts as negative.

    Returns
    -------
    float
        Sum over all observations of the log of the probability the fitted
        model assigns to the observed label. ``1e-7`` is added inside each
        logarithm to avoid ``log(0)``.
    """
    # Coerce so that `y == 1` below is an element-wise mask even for lists.
    y = np.asarray(y)

    classifier.fit(x, y)
    predicted_prob = np.asarray(classifier.predict_proba(x))

    # scikit-learn orders predict_proba columns like `classifier.classes_`.
    # Look up where label 1 sits instead of assuming it is column 1; fall back
    # to column 1 when the estimator has no `classes_` or label 1 is absent.
    positive_col = 1
    classes = getattr(classifier, "classes_", None)
    if classes is not None:
        hits = np.flatnonzero(np.asarray(classes) == 1)
        if hits.size:
            positive_col = int(hits[0])

    prob_positive = predicted_prob[:, positive_col]
    if predicted_prob.shape[1] == 2:
        # Binary case: use the other column directly.
        prob_negative = predicted_prob[:, 1 - positive_col]
    else:
        # Otherwise "not class 1" is the complement of the class-1 probability.
        prob_negative = 1 - prob_positive

    LSS_fit = np.sum(
        np.where(
            y == 1,
            np.log(prob_positive + 1e-7),
            np.log(prob_negative + 1e-7),
        )
    )
    return LSS_fit

# Calculate F-statistic
def F_statistic_cal(classifier, x, y):
    """Compute the notebook's F-statistic for ``classifier`` fitted on ``(x, y)``.

    The statistic is
    ``((LSS_overall - LSS_fit) / p) / (LSS_fit / (n - p - 1))``,
    where ``n`` is the number of rows of ``x`` and ``p`` the number of columns.

    Parameters
    ----------
    classifier : estimator
        Passed to ``LSS_fit_cal``, which fits it on ``(x, y)``.
    x : array with a two-element ``shape``
        Feature matrix of shape ``(n, p)``.
    y : array-like
        Class labels, passed to ``LSS_overall_cal`` and ``LSS_fit_cal``.

    Returns
    -------
    float
        The F-statistic defined above.

    Raises
    ------
    ValueError
        If ``p`` is not positive or ``n - p - 1`` is not positive. The
        statistic is undefined in both cases, and the unguarded formula would
        yield a meaningless value instead of failing.
    """
    n = x.shape[0]
    p = x.shape[1]
    residual_dof = n - p - 1

    # Validate before doing any fitting so bad input fails fast and clearly.
    if p <= 0 or residual_dof <= 0:
        raise ValueError(
            "F-statistic needs p >= 1 and n > p + 1; got n={}, p={}".format(n, p)
        )

    LSS_overall = LSS_overall_cal(y)
    LSS_fit = LSS_fit_cal(classifier, x, y)
    F_statistic = ((LSS_overall - LSS_fit) / p) / (LSS_fit / residual_dof)
    return F_statistic

# Report both log-likelihoods and the resulting F-statistic on the training
# data. Each call receives its own fresh LogisticRegression with C=1e10.
print("LSS_overall: {}".format(LSS_overall_cal(y_train)))
print("LSS_fit: {}".format(LSS_fit_cal(LogisticRegression(C=1e10, solver="lbfgs"), x_train, y_train)))
print("F_statistic: {}".format(F_statistic_cal(LogisticRegression(C=1e10, solver="lbfgs"), x_train, y_train)))

LSS_overall: -166.54952811345308
LSS_fit: -69.35808000328582
F_statistic: 11.106596640480312


### From the F-test result above, the F-statistic is greater than 1, which indicates that there is a relationship between the predictors and the response.

# 7. Feature selection

## 7.1 Using tree-based feature importance

## 7.2 Using Subset selection

### 7.2.1 Best subset selection

### 7.2.2 Forward stepwise selection

### 7.2.3 Backward stepwise selection

### 7.2.4 Hybrid approaches