In [1]:
from logisticRegression import LogisticRegression, Optimizer
from scipy.io.arff import loadarff
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
def addInteractions(X):
    newX=X
    for i in range(X.shape[1]):
        for j in range(i+1,X.shape[1]):
            newX=np.c_[newX,np.multiply(X[:,i],X[:,j])]
    return newX

1. Add a stopping rule
2. Perform tests for max 500 iterations (unless it converges earlier), use balanced accuracy (so do some train test splits, at least 5)
3. Check how the log likelihood value depends on iterations for each algorithm for train data
4. Compare the algorithms with 4 other existing solutions such as: LDA (Linear Discriminant analysis), QDA (Quadratic
Discriminant Analysis), Decision tree and Random Forest
5. In the case of small datasets, please compare the two versions of the logistic regression: model
without interactions and model with interactions.

1. Stopping rule: If the differences in loss function are smaller than 0.0001, stop.

In [3]:
# # This is just an overall idea, feel free to delete it all or modify or whatever :)
# # I'm wondering if this way of keeping results is optimal for later generating graphs from them
# # Currently it's just the result, name of the classifier, and split, seed
# # hmm cause for the accuracies we will probably want to draw boxplots, so like on the y axis accuracy, on the x axis train test split and then for each train test split a boxplot for each type of classifier
# # I guess it should be possible with sns, maybe by setting y to accuracy, x to split and hue by classifier type or smth like that
# # For the plots of loss function, I'm not sure how to represent it, whether to compute the mean for each iteration and plot a line going through those points
# # or to just take a single result and plot that
# def performExperiment(X,y,logisticparams):
#     costs=[]
#     accuracies=[]
#     # splits = [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
#     splits = [0.1,0.2]
#     # seeds = [42,123,0,321,9]
#     seeds = [42]
#     classifiersNames = ['Linear Discriminant Analysis','Quadratic Discriminant Analysis', 'Decision Tree Classifier', 'Random Forest Classifier']
#     classifiers = [LinearDiscriminantAnalysis(),QuadraticDiscriminantAnalysis(),DecisionTreeClassifier(random_state=seed),RandomForestClassifier(random_state=seed)]
#     for split in splits:
#         for seed in seeds:
#             np.random.seed(seed)
#             X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=seed)

#             for logisticparam in len(logisticparams):
#                 logisticRegressor = LogisticRegression(learningRate=logisticparam.learningRate,noOfIterations=500, optimizer=logisticparam.optimizer)
#                 costs = logisticRegressor.fit(X_train.astype(float),y_train.astype(float))
#                 y_pred=logisticRegressor.predict(X_test)
#                 costs.append([costs,logisticparam.optimizer,seed,split])
#                 accuracies.append([1-np.sum(np.abs(y_pred-y_test))/len(y_test),logisticparam.optimizer,seed,split])

#             for i,classifier in enumerate(classifiers):
#                 costs = classifier.fit(X_train.astype(float),y_train.astype(float))
#                 y_pred=classifier.predict(X_test)
#                 costs.append([costs, classifiersNames[i],seed,split])
#                 accuracies.append([1-np.sum(np.abs(y_pred-y_test))/len(y_test),classifiersNames[i],seed,split])
#     # We could then produce some graphs and means from those results
#     return costs, accuracies
    

In [3]:
# Loading the dataset
raw_blood_data = loadarff("./Datasets/blood-transfusion-service-center.arff")
df_blood_data = pd.DataFrame(raw_blood_data[0])

# Separating other features from target
df_blood_no_class = df_blood_data.drop('Class', axis=1)

# Removing highly correlated features
corr = df_blood_no_class.corr()
mask = ~(corr.mask(np.eye(len(corr), dtype=bool)).abs() > 0.9).any()
df_blood_no_class = df_blood_no_class.loc[:, mask]

# Creating X and Y
X_blood = df_blood_no_class
y_blood = df_blood_data['Class'].astype(int)


In [8]:
# For Logistic Regression with Adam optimizer
balanced_accuracies_adam_log_reg = []
accuracies_adam_log_reg = []

# For Logistic Regression with SGD optimizer
balanced_accuracies_sgd_log_reg = []
accuracies_sgd_log_reg = []

# For Logistic Regression with IWLS optimizer
balanced_accuracies_iwls_log_reg = []
accuracies_iwls_log_reg = []

# For LAD (Least Absolute Deviations) Regression
balanced_accuracies_lad = []
accuracies_lad = []

# For QAD (Quantile Absolute Deviations) Regression
balanced_accuracies_qad = []
accuracies_qad = []

# For Decision Tree Classifier
balanced_accuracies_dec_tree = []
accuracies_dec_tree = []

# For Random Forest Classifier
balanced_accuracies_rand_forest = []
accuracies_rand_forest = []

adam = Optimizer.Adam
sgd = Optimizer.SGD
iwls = Optimizer.IWLS

splits = [0.1,0.2,0.3,0.4,0.5]
seeds = [0,42,123,321,9]

for split in splits:
    for seed in seeds:
        np.random.seed(seed)
        X_train, X_test, y_train, y_test = train_test_split(X_blood, y_blood, test_size=0.2, random_state=seed)
        
        # ADAM
        logisticRegressor = LogisticRegression(learningRate=0.001, noOfIterations=500, optimizer=adam)
        logisticRegressor.fit(X_train.astype(float),y_train.astype(float))
        y_pred = logisticRegressor.predict(X_test)
        accuracies_adam_log_reg.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_adam_log_reg.append(balanced_accuracy_score(y_test, y_pred))
        
        # SGD
        # Error singular matrix
        logisticRegressor = LogisticRegression(learningRate=0.0001, noOfIterations=500, optimizer=iwls)
        logisticRegressor.fit(X_train,y_train)
        y_pred = logisticRegressor.predict(X_test)
        accuracies_iwls_log_reg.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_iwls_log_reg.append(balanced_accuracy_score(y_test, y_pred))

        # IWLS
        # error KeyError: "None of [Index([127,  48, 485, 407, 225, 338, 239, 522, 592, 476,\n...
        logisticRegressor = LogisticRegression(learningRate=0.001, noOfIterations=500, optimizer=sgd)
        logisticRegressor.fit(X_train.astype(float),y_train.astype(float))
        y_pred = logisticRegressor.predict(X_test)
        accuracies_sgd_log_reg.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_sgd_log_reg.append(balanced_accuracy_score(y_test, y_pred))

        # LAD
        lad = LinearDiscriminantAnalysis()
        lad.fit(X_train, y_train)
        y_pred = lad.predict(X_test)
        accuracies_lad.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_lad.append(balanced_accuracy_score(y_test, y_pred))

        # QAD
        qad = QuadraticDiscriminantAnalysis()
        qad.fit(X_train, y_train)
        y_pred = qad.predict(X_test)
        accuracies_qad.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_qad.append(balanced_accuracy_score(y_test, y_pred))

        # Decision Tree Classifier
        dec_tree = DecisionTreeClassifier(random_state=seed)
        dec_tree.fit(X_train, y_train)
        y_pred = dec_tree.predict(X_test)
        accuracies_dec_tree.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_dec_tree.append(balanced_accuracy_score(y_test, y_pred))

        # Random Forest Classifier
        rand_forest = RandomForestClassifier(random_state=seed)
        rand_forest.fit(X_train, y_train)
        y_pred = rand_forest.predict(X_test)
        accuracies_rand_forest.append(accuracy_score(y_test, y_pred))
        balanced_accuracies_rand_forest.append(balanced_accuracy_score(y_test, y_pred))

  vhatBias = moment2Bias/(1.0-beta2**i)
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0

In [11]:
# Logistic Regression with Adam Optimizer
print("Adam Logistic Regression:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_adam_log_reg))
print("Accuracy:", np.mean(accuracies_adam_log_reg), "\n")

# Logistic Regression with SGD Optimizer
print("SGD Logistic Regression:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_sgd_log_reg))
print("Accuracy:", np.mean(accuracies_sgd_log_reg), "\n")

# Logistic Regression with IWLS Optimizer
print("IWLS Logistic Regression:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_iwls_log_reg))
print("Accuracy:", np.mean(accuracies_iwls_log_reg), "\n")

# LAD Regression
print("LAD Regression:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_lad))
print("Accuracy:", np.mean(accuracies_lad), "\n")

# QAD Regression
print("QAD Regression:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_qad))
print("Accuracy:", np.mean(accuracies_qad), "\n")

# Decision Tree Classifier
print("Decision Tree:")
print("Balanced Accuracy:", np.mean(balanced_accuracies_dec_tree))
print("Accuracy:", np.mean(accuracies_dec_tree), "\n")


Adam Logistic Regression:
Balanced Accuracy: 0.5
Accuracy: 0.7573333333333333 

SGD Logistic Regression:
Balanced Accuracy: nan
Accuracy: nan 

IWLS Logistic Regression:
Balanced Accuracy: nan
Accuracy: nan 

LAD Regression:
Balanced Accuracy: 0.5
Accuracy: 0.7573333333333333 

QAD Regression:
Balanced Accuracy: 0.5
Accuracy: 0.7573333333333333 

Decision Tree:
Balanced Accuracy: 0.5518617372552329
Accuracy: 0.7266666666666667 



  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


In [26]:
def compute_accuracies(X, y, splits=[0.1, 0.2, 0.3, 0.4, 0.5], seeds=[0, 42, 123, 321, 9]):
    # For Logistic Regression with Adam optimizer
    balanced_accuracies_adam_log_reg = []
    accuracies_adam_log_reg = []
    costs_adam_log_reg = []

    # For Logistic Regression with SGD optimizer
    balanced_accuracies_sgd_log_reg = []
    accuracies_sgd_log_reg = []
    costs_sgd_log_reg = []

    # For Logistic Regression with IWLS optimizer
    balanced_accuracies_iwls_log_reg = []
    accuracies_iwls_log_reg = []
    costs_iwls_log_reg = []

    # For LAD (Least Absolute Deviations) Regression
    balanced_accuracies_lad = []
    accuracies_lad = []
    costs_lad = []

    # For QAD (Quantile Absolute Deviations) Regression
    balanced_accuracies_qad = []
    accuracies_qad = []
    costs_qad = []

    # For Decision Tree Classifier
    balanced_accuracies_dec_tree = []
    accuracies_dec_tree = []
    costs_dec_tree = []

    # For Random Forest Classifier
    balanced_accuracies_rand_forest = []
    accuracies_rand_forest = []
    costs_rand_forest = []

    costs = {}
    balanced_accuracies = {}
    accuracies = {}
    
    adam = Optimizer.Adam
    sgd = Optimizer.SGD
    iwls = Optimizer.IWLS
    
    # Initialize lists for each classifier
    classifiers = {
        "Adam_Log_Reg": (accuracies_adam_log_reg, balanced_accuracies_adam_log_reg, costs_adam_log_reg, LogisticRegression(learningRate=0.001, noOfIterations=500, optimizer=adam)),
        # "SGD_Log_Reg": (accuracies_sgd_log_reg, balanced_accuracies_sgd_log_reg, costs_sgd_log_reg, LogisticRegression(learningRate=0.001, noOfIterations=500, optimizer=sgd)),
        # "IWLS_Log_Reg": (accuracies_iwls_log_reg, balanced_accuracies_iwls_log_reg, costs_iwls_log_reg, LogisticRegression(learningRate=0.001, noOfIterations=500, optimizer=iwls)),
        "LAD": (accuracies_lad, balanced_accuracies_lad, costs_lad, LinearDiscriminantAnalysis()),
        "QAD": (accuracies_qad, balanced_accuracies_qad, costs_qad, QuadraticDiscriminantAnalysis()),
        "Decision_Tree": (accuracies_dec_tree, balanced_accuracies_dec_tree, costs_dec_tree, DecisionTreeClassifier()),
        "Random_Forest": (accuracies_rand_forest, balanced_accuracies_rand_forest, costs_rand_forest, RandomForestClassifier())
    }

    for clf_name, (accuracy_list, balanced_accuracy_list, costs_list, clf) in classifiers.items():
        accuracies[clf_name] = {}
        balanced_accuracies[clf_name] = {}
        costs[clf_name] = {}

        for split in splits:
            for seed in seeds:
                np.random.seed(seed)
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=split, random_state=seed)

                cost = clf.fit(X_train, y_train)
                y_pred = clf.predict(X_test)

                accuracy = accuracy_score(y_test, y_pred)
                balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
                
                accuracy_list.append(accuracy)
                balanced_accuracy_list.append(balanced_accuracy)
                costs_list.append(cost)

                accuracies[clf_name][(split, seed)] = accuracy
                balanced_accuracies[clf_name][(split, seed)] = balanced_accuracy
                costs[clf_name][(split, seed)] = cost


    return accuracies, balanced_accuracies, costs

In [27]:
a, ba, costs = compute_accuracies(X_blood, y_blood)

  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.shape[0])*(np.dot(y,np.log(yHat))+np.dot((1.0-y),np.log(1.0-yHat))))
  self.costs.append((-1.0/X.sh

In [25]:
np.mean(list(a["Adam_Log_Reg"].values()))

0.7489994058229352