SML Assignment-2
Sachin Sharma
2021559

Q2
Logistics Regression using FDA, PCA + FDA

In [489]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split

In [490]:
class FDA:
    def __init__(self) -> None:
        self.w = []

    def fit(self, X, y):
        classes = np.unique(y)
        X_0 = X[y == classes[0]]
        X_1 = X[y == classes[1]]
        n_0 = X_0.shape[0]
        n_1 = X_1.shape[0]
        centroid_0 = np.mean(X_0, axis=0)
        centroid_1 = np.mean(X_1, axis=0)

        diff_0 = X_0 - centroid_0
        S_0 = np.dot(diff_0.T, diff_0) / (n_0 - 1)
        diff_1 = X_1 - centroid_1
        S_1 = np.dot(diff_1.T, diff_1) / (n_1 - 1)

        A = n_0 * S_0 + n_1 * S_1
        centroid_0 = centroid_0[:, np.newaxis]
        centroid_1 = centroid_1[:, np.newaxis]
        self.w = np.array(np.dot(np.linalg.inv(A), (centroid_0 - centroid_1)))
        self.w = self.w / np.linalg.norm(self.w)


    def transform(self, X):
        return np.dot(X, self.w)

In [491]:
class PCA:
    def __init__(self, n_comps) -> None:
        self.n_comps = n_comps

    def fit(self, X):
        means = np.mean(X, axis=0)
        standard_dev = np.sqrt(np.mean(np.square(X - means)))
        X = X - means
        cov_matrix = np.cov(X, rowvar=False)

        eigen_values, eigen_vectors = np.linalg.eig(cov_matrix)
        pairs = [(np.abs(eigen_values[i]), eigen_vectors.T[i]) for i in range(len(eigen_values))]
        pairs = sorted(pairs, key=lambda x: x[0], reverse=True)

        self.W = np.vstack([pairs[i][1] for i in range(self.n_comps)]).T

    def transform(self, X):
        return np.dot(X, self.W)

In [492]:
class LogisticRegression:
    def __init__(self, learning_rate=0.1, max_iter=1000):
        self.learning_rate = learning_rate
        self.max_iter = max_iter

    def fit(self, X, y: np.ndarray):
        N, n_features = X.shape
        # Add a column of ones to X for bias term
        X = np.c_[np.ones(N), X]
        n_features += 1

        y = y[:, np.newaxis]

        # Initialize the weights to zeros
        self.w = np.zeros(n_features)[:, np.newaxis]

        # Update the weights using gradient descent
        for i in range(self.max_iter):
            z = np.dot(X, self.w)
            y_pred = np.array(self._sigmoid(z))
            grad = np.dot(X.T, (y_pred - y)) / y.size
            self.w -= self.learning_rate * grad

    def predict(self, X):
        N, n_features = X.shape

        # Add a column of ones to X for bias term
        X = np.c_[np.ones(N), X]

        # Predict the probabilities of the positive class
        X_proj = np.dot(X, self.w).reshape(N)
        proba = self._sigmoid(X_proj).reshape(N)

        # Classify as 1 if the probability is >= 0.5, otherwise as 0
        y_pred = np.where(proba >= 0.5, 1, 0)
        return y_pred.reshape(N)

    def _sigmoid(self, z: np.ndarray) -> float:
        z = z.astype(np.float128)
        return 1 / (1 + np.exp(-z))

In [493]:
def accuracy_score(actual, predicted):
    correct = 0
    for i in range(len(actual)):
        if actual[i] == predicted[i]:
            correct += 1
    
    return correct / float(len(actual)) * 100.0

In [494]:
def map_to_numeric(X: pd.DataFrame):
    mapper = {}
    for c in X.columns:
        if X[c].dtypes == 'object':
            mapper[c] = {}
            i = 0
            for r in range(len(X[c])):
                if X[c][r] not in mapper[c]:
                    mapper[c][X[c][r]] = i
                    i += 1
                X[c][r] = mapper[c][X[c][r]]
            X[c] = pd.to_numeric(X[c])
    return X

In [495]:
dataset = pd.read_csv(r'Heart.csv')
dataset.dropna(inplace=True)
dataset.reset_index(drop=True, inplace=True)

In [496]:
y = pd.DataFrame(dataset, columns=['AHD',])
X = dataset.drop(['Unnamed: 0', 'AHD'], axis=1)

N, n_features = X.shape

X = np.array(map_to_numeric(X))
y = np.array(map_to_numeric(y)).reshape(N)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[c][r] = mapper[c][X[c][r]]


In [497]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [498]:
fda = FDA()
fda.fit(X_train, y_train)

In [499]:
X_train_FDA = np.array(fda.transform(X_train))
X_test_FDA = np.array(fda.transform(X_test))

In [500]:
pca = PCA(n_comps=2)
pca.fit(X_train)

In [501]:
X_train_PCA = np.array(pca.transform(X_train))
X_test_PCA = np.array(pca.transform(X_test))

In [502]:
fda_after_pca = FDA()
fda_after_pca.fit(X_train_PCA, y_train)

In [503]:
X_train_PCA_FDA = np.array(fda_after_pca.transform(X_train_PCA))
X_test_PCA_FDA = np.array(fda_after_pca.transform(X_test_PCA))

In [504]:
model_FDA = LogisticRegression()
model_FDA.fit(X_train_FDA, y_train)

model_PCA = LogisticRegression()
model_PCA.fit(X_train_PCA, y_train)

model_PCA_FDA = LogisticRegression()
model_PCA_FDA.fit(X_train_PCA_FDA, y_train)

In [505]:
y_pred_FDA = model_FDA.predict(X_test_FDA)
accuracy_FDA = accuracy_score(y_test, y_pred_FDA)
print(f'Accuracy after FDA: {accuracy_FDA}')

Accuracy after FDA: 88.33333333333333


In [506]:
y_pred_PCA = model_PCA.predict(X_test_PCA)
accuracy_PCA = accuracy_score(y_test, y_pred_PCA)
# print(f'Accuracy after PCA: {accuracy_PCA}')

In [507]:
y_pred_PCA_FDA = model_PCA_FDA.predict(X_test_PCA_FDA)
accuracy_PCA_FDA = accuracy_score(y_test, y_pred_PCA_FDA)
print(f'Accuracy after PCA + LDA: {accuracy_PCA_FDA}')

Accuracy after PCA + LDA: 60.0
