In [None]:
# Optional one-time environment setup. Every command below is intentionally commented out;
# uncomment only the ones needed when preparing a fresh environment.

# Node.js (alternative conda-forge channels/labels)
# !conda install -c conda-forge nodejs -y
# !conda install -c conda-forge/label/gcc7 nodejs -y
# !conda install -c conda-forge/label/cf201901 nodejs -y
# !conda install -c conda-forge/label/cf202003 nodejs -y

# JupyterLab extension and Python packages
# !jupyter labextension install jupyterlab-plotly
# !pip install scipy
# !pip install -U kaleido
# !pip install networkx
# !pip install matplotlib
# !pip install igraph

In [None]:
import os
import gglasso
import pickle
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import plotly.express as px

from xgboost import XGBClassifier


from matplotlib.pyplot import figure

from numpy.linalg import matrix_rank
from numpy import genfromtxt

from scipy import stats
from scipy.linalg import eigh

from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

### Read solution

#### Remove outliers

The outliers were identified by manually inspecting the SGL solution of every sample.

In [None]:
# Index of every sample, and the samples flagged as outliers.
corr_all_ix = np.arange(950)
outliers_ix = [96, 144, 210, 522]

# Keep the indices that are not flagged, preserving their original order.
is_outlier = np.isin(corr_all_ix, outliers_ix)
corr_filtered_ix = corr_all_ix[~is_outlier]
corr_filtered_ix.shape

In [None]:
# Root directory of the brain datasets. Defaults to the HMGU cluster location; set the
# BRAINS_STORAGE_DIR environment variable to use another location without editing the notebook.
# Joining with "" guarantees a trailing separator, which the cells that build paths by
# plain string concatenation (storage_dir + "sex.csv") rely on.
storage_dir = os.path.join(
    os.environ.get("BRAINS_STORAGE_DIR", "/lustre/groups/bds01/datasets/brains/"), ""
)

In [None]:
# Correlation matrices of the samples flagged as outliers, collected into one array.
outliers = np.array([
    genfromtxt(storage_dir + "corr_matrices/corr{0}.csv".format(sample_ix), delimiter=',')
    for sample_ix in outliers_ix
])
outliers.shape

#### HMGU cluster

In [None]:
# Load, for every retained (non-outlier) sample, its correlation matrix and the
# matching estimate from the "est_uniform" folder.
# os.path.join is used because storage_dir already ends with a separator; plain
# concatenation with a leading "/" produced doubled slashes in the file paths.
corr = []
sol = []

for i in corr_filtered_ix:
    corr_path = os.path.join(storage_dir, "corr_matrices", "corr{0}.csv".format(i))
    sol_path = os.path.join(storage_dir, "est_uniform", "est_uniform{0}.csv".format(i))
    corr.append(genfromtxt(corr_path, delimiter=','))
    sol.append(genfromtxt(sol_path, delimiter=','))

sol = np.array(sol)
corr = np.array(corr)
corr.shape, sol.shape

In [None]:
# Sex and age tables, restricted by position to the retained (non-outlier) rows.
sex = pd.read_csv(storage_dir + "sex.csv").iloc[corr_filtered_ix]
age = pd.read_csv(storage_dir + "age.csv").iloc[corr_filtered_ix]

sex.shape, age.shape

#### On premises

In [None]:
# Half-open range [start, stop) of the on-premises sample files to read.
start = 0
stop = 50

# on premises: one CSV per sample, collected into a single array
sub_corr = np.array([
    genfromtxt("../data/sub_corr50/sub_corr{0}.csv".format(file_ix), delimiter=',')
    for file_ix in range(start, stop)
])
sub_corr.shape

In [None]:
# NOTE(review): `est_uniform` is not defined in any cell visible in this notebook, so this
# line raises NameError on a fresh kernel unless it is defined elsewhere.
# TODO: load/define `est_uniform` before this cell, or remove the cell.
est_uniform['Theta'].shape

In [None]:
# On-premises copies of the sex and age tables.
# NOTE(review): this rebinds `sex` / `age` without the outlier filtering applied in the
# HMGU-cluster section — run only the section that matches your environment.
sex, age = (pd.read_csv(csv_path) for csv_path in ("../data/sex.csv", "../data/age.csv"))

# Logistic regression

In [None]:
# Features and labels: only the first 100 samples are used here.
# (Full data set alternative: X, y = sol, sex)
X = sol[:100]
y = sex[:100]

X.shape, y.shape

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.20
)

# Report the shape of each part of the split.
for split_label, split_data in (
    ('X_train: ', X_train),
    ('X_test: ', X_test),
    ('y_train: ', y_train),
    ('y_test: ', y_test),
):
    print(split_label, split_data.shape)

In [None]:
# Flatten every sample's matrix into one feature vector (3D array -> 2D array).
print('X_train: {0}'.format(X_train.shape))
print('X_test: {0} \n'.format(X_test.shape))

n_train = X_train.shape[0]
n_test = X_test.shape[0]

# -1 lets NumPy infer the flattened width. Unlike shape[1] * shape[2], this also works
# when the cell is re-run on arrays that are already 2D, so the cell is idempotent.
X_train = X_train.reshape(n_train, -1)
X_test = X_test.reshape(n_test, -1)

print("X train flatten: {0}".format(X_train.shape))
print("X test flatten: {0} \n".format(X_test.shape))

In [None]:
# Fit an L2-penalised logistic regression on the flattened training data and
# report its mean accuracy on the held-out test data.
logreg = LogisticRegression(penalty='l2', max_iter=1000, random_state=42)
logreg.fit(X_train, y_train.values.ravel())

test_accuracy = logreg.score(X_test, y_test)
print("Test accuracy: {} ".format(test_accuracy))

In [None]:
# Confusion matrix of the logistic-regression predictions on the test data,
# drawn as a heatmap and saved to ../plots/confusion_matrix.png.
predictions = logreg.predict(X_test)

cm = metrics.confusion_matrix(y_test, predictions)

fig, ax = plt.subplots(figsize=(9, 9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square=True, cmap='Blues_r', ax=ax)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
# Same value as logreg.score(X_test, y_test), computed from the predictions already
# made above instead of running the model on the test data a second time.
all_sample_title = 'Accuracy Score: {0}'.format(metrics.accuracy_score(y_test, predictions))
ax.set_title(all_sample_title, size=15)

# Create the output folder if it is missing so the export cannot fail on a fresh checkout.
os.makedirs('../plots', exist_ok=True)
fig.savefig('../plots/confusion_matrix.png')

In [None]:
# Print the fitted parameters of the logistic regression (label typo "Coefficints" fixed).
print("Coefficients: {0}".format(logreg.coef_))
print("Intercept: {0}".format(logreg.intercept_))

In [None]:
# Copy the fitted coefficients and lay them out with the same row/column shape as one
# input sample, wrapped in a DataFrame for plotting.
coefs = pd.DataFrame(
    np.array(logreg.coef_).reshape(X.shape[1], X.shape[2])
)

In [None]:
# Two heatmaps of the coefficient matrix: a static PNG with a clamped colour range and
# an interactive HTML page with the colour range chosen by Plotly.

# Options shared by both figures.
heatmap_style = dict(color_continuous_scale='RdBu_r', text_auto=True)

# Static figure: colours clamped to [0, 0.1], colour bar hidden.
fig_coef_1 = px.imshow(
    coefs,
    zmin=0,
    zmax=0.1,
    title="Coefficients of Logistic regression classifier with l2 penalty term",
    **heatmap_style,
)
fig_coef_1.update_layout(coloraxis_showscale=False)

# Interactive figure: default colour range.
fig_coef_2 = px.imshow(
    coefs,
    title="Coefficients pattern of Logistic regression classifier with l2 penalty term",
    **heatmap_style,
)

# Create the output folder if it is missing so the exports cannot fail on a fresh checkout.
os.makedirs("../plots", exist_ok=True)
fig_coef_1.write_image("../plots/coeffs.png")
fig_coef_2.write_html("../plots/coeffs.html")

In [None]:
# True on the diagonal and above it; those entries are blanked (NaN) so that only the
# strictly lower triangle of the coefficient matrix remains.
mask = np.triu(np.ones(coefs.shape, dtype=bool))
coefs_df = coefs.where(~mask)

# XGBoost

In [None]:
# Hyper-parameter grid explored by the XGBoost grid search.
parameters = dict(
    booster=('gbtree', 'gblinear', 'dart'),
    eta=[0.1, 0.3, 0.5, 0.8],
    gamma=[0, 0.1, 0.3],
    max_depth=[4, 6, 10],
    max_bin=[50, 100, 256, 300],
)

In [None]:
# Base XGBoost classifier wrapped in a grid search over `parameters`.
xgb = XGBClassifier()
xgb_cl = GridSearchCV(estimator=xgb, param_grid=parameters)

In [None]:
# Run the grid search on the training data, then score the predictions on the test data.
train_labels = y_train.values.ravel()
xgb_cl.fit(X_train, train_labels)

preds = xgb_cl.predict(X_test)
accuracy_score(y_true=y_test, y_pred=preds)

### Benchmarking

In [None]:
def _mean_test_error(clf, features, labels, test_size, n_rounds, rng):
    """Return the test error rate of `clf` averaged over `n_rounds` random splits.

    Parameters
    ----------
    clf : estimator exposing `fit` and `predict`; it is refitted on every round.
    features : array with one row per sample.
    labels : 1-D array of class labels aligned with the rows of `features`.
    test_size : fraction of the samples held out for testing in each split.
    n_rounds : number of random train/test splits to average over.
    rng : random state handed to `train_test_split`; reusing one instance across
        calls makes every split draw from the same seeded stream.
    """
    errors = []
    for _ in range(n_rounds):
        X_tr, X_te, y_tr, y_te = train_test_split(
            features, labels, test_size=test_size, random_state=rng
        )
        clf.fit(X_tr, y_tr)
        errors.append(np.mean(clf.predict(X_te) != y_te))
    return np.mean(errors)


# adjust CV threshold so you don't have only one class in a sample
heldout = [0.3, 0.2, 0.1]
# Number of rounds to fit and evaluate an estimator.
rounds = 10
X, y = sol[:100, ], sex[:100]

# Flatten every sample once up front; the split only selects rows, so this gives the
# same features as flattening after each split.
X_flat = X.reshape(X.shape[0], -1)
# `y` is a pandas object (it is read with pd.read_csv). Comparing 1-D predictions with it
# directly is not a per-sample element-wise comparison, so the error rate is computed on
# a flat label array instead.
y_flat = np.asarray(y).ravel()

classifiers = [
    ("XGBoost", XGBClassifier()),
    ("Log-regression", LogisticRegression(max_iter=110, penalty='l2')),
    ("SVM", svm.SVC(kernel='linear')),
]

# Proportion of the data used for training at each held-out fraction.
xx = 1.0 - np.array(heldout)

for name, clf in classifiers:
    print("training %s" % name)
    # Re-seed per classifier so that all classifiers are evaluated on the same splits.
    rng = np.random.RandomState(42)
    yy = [
        _mean_test_error(clf, X_flat, y_flat, test_size, rounds, rng)
        for test_size in heldout
    ]
    plt.plot(xx, yy, label=name)

plt.legend(loc="upper right")
plt.xlabel("Proportion train")
plt.ylabel("Test Error Rate")
plt.show()