In [18]:
import pandas as pd

In [19]:
envs = pd.read_csv('training_environments.csv', index_col=0)
empo_names = [f'empo_{i}' for i in range(1, 4)]
empo_index_to_label = []

for empo in empo_names:
    empo_index_to_label.append([str(row) for row in envs.drop_duplicates(subset=empo)[empo]])
    
empo_label_to_index = {name : {label : i for i, label in enumerate(labels)} for name, labels in zip(empo_names, empo_index_to_label)}
empo_label_to_index

{'empo_1': {'Free-living': 0, 'Host-associated': 1},
 'empo_2': {'Non-saline': 0, 'Saline': 1, 'Animal': 2, 'Plant': 3},
 'empo_3': {'Water (non-saline)': 0,
  'Soil (non-saline)': 1,
  'Sediment (saline)': 2,
  'Animal surface': 3,
  'Surface (non-saline)': 4,
  'Animal distal gut': 5,
  'Animal corpus': 6,
  'Plant surface': 7,
  'Water (saline)': 8,
  'Animal secretion': 9,
  'Sediment (non-saline)': 10,
  'Plant rhizosphere': 11,
  'Plant corpus': 12,
  'Surface (saline)': 13,
  'Animal proximal gut': 14,
  'Aerosol (non-saline)': 15,
  'Hypersaline (saline)': 16}}

In [20]:
# replace text labels with integers
envs = envs.replace(empo_label_to_index)

In [21]:
envs

Unnamed: 0,empo_1,empo_2,empo_3
training_0,0,0,0
training_1,0,0,0
training_2,0,0,1
training_3,0,1,2
training_4,0,0,1
...,...,...,...
training_20978,0,0,1
training_20979,0,0,1
training_20980,1,2,3
training_20981,0,0,0


In [22]:
from scipy import sparse as sps
import numpy as np

In [23]:
def save_as_sparse(in_filename, out_filename):
    line_count = sum(1 for line in open(in_filename))
    rows = []
    with open(in_filename) as f:
        for i, line in enumerate(f):
            if i == 0:
                continue
            row = [int(x) for x in line.strip().split(',')[1:]]
            row = sps.csr_matrix(row)
            rows.append(row)

            if i % 1000 == 0:
                print(f'Sparsifying {in_filename} [row {i} / {line_count}]\r')
    mat = sps.vstack(rows)
    
    sps.save_npz(out_filename, mat)

In [24]:
from pathlib import Path

def maybe_sparsify(in_filename, out_filename):
    if not Path(out_filename).is_file():
        save_as_sparse(in_filename, out_filename)
    
maybe_sparsify('training_descriptors.csv', 'training_descriptors_sparse.npz')
maybe_sparsify('challenge_descriptors.csv', 'challenge_descriptors_sparse.npz')

In [25]:
desc = sps.load_npz('training_descriptors_sparse.npz')

gen = np.random.default_rng(0)

n_samples = desc.shape[0]
idx = np.arange(n_samples)

gen.shuffle(idx)
desc_shuf = desc[idx]
envs_shuf = envs.iloc[idx]

In [29]:
# 10 folds cross validation
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

k_folds = KFold(n_splits = 10)
clf1 = LogisticRegression(random_state=0, n_jobs=-1)
clf2 = RandomForestClassifier(random_state=0, n_jobs=-1)

In [28]:
# empo 1 Logistic Regression
cv_LogisticRegressionScores_empo1 = cross_val_score(clf1, desc_shuf, envs_shuf["empo_1"], cv=k_folds)
cv_LogisticRegressionScores_empo1

array([0.9614102 , 0.96617437, 0.96045736, 0.96806482, 0.96615825,
       0.96472831, 0.95376549, 0.95996187, 0.95948522, 0.96186845])

In [33]:
from sklearn import preprocessing
scaler = preprocessing.MaxAbsScaler().fit(desc_shuf)
X_scaled = scaler.transform(desc_shuf)

In [31]:
# slow
# empo 2 Logistic Regression
cv_LogisticRegressionScores_empo2 = cross_val_score(clf1, desc_shuf, envs_shuf["empo_2"], cv=k_folds, n_jobs = -1)
cv_LogisticRegressionScores_empo2

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

array([0.95950453, 0.96617437, 0.95759886, 0.96425167, 0.95757865,
       0.95567207, 0.94899905, 0.95233556, 0.95090562, 0.95519542])

In [34]:
# slow
# empo 3 Logistic Regression
cv_LogisticRegressionScores_empo3 = cross_val_score(clf1, X_scaled, envs_shuf["empo_3"], cv=k_folds)
cv_LogisticRegressionScores_empo3

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

KeyboardInterrupt: 

In [None]:
# empo 1 Random Forest
cv_RandomForestScores_empo1 = cross_val_score(clf2, desc_shuf, envs_shuf["empo_1"], cv=k_folds)
cv_RandomForestScores_empo1

In [None]:
# slow
# empo 2 Random Forest
cv_RandomForestScores_empo2 = cross_val_score(clf2, desc_shuf, envs_shuf["empo_2"], cv=k_folds)
cv_RandomForestScores_empo2

In [None]:
# slow
# empo 3 Random Forest
cv_RandomForestScores_empo3 = cross_val_score(clf2, desc_shuf, envs_shuf["empo_3"], cv=k_folds)
cv_RandomForestScores_empo3

In [9]:
train_pcent = .8
train_count = int(round(train_pcent * n_samples))

# X variable (abundances)
desc_train = desc_shuf[:train_count]
desc_validate = desc_shuf[train_count:]

# Y variable (enviroments)
envs_train = envs_shuf[:train_count]
envs_validate = envs_shuf[train_count:]

In [10]:
print("For the training set we use ", train_count, "/", n_samples, "elements")
print("For the validation set we use ", n_samples-train_count, "/", n_samples, "elements")

For the training set we use  16786 / 20983 elements
For the validation set we use  4197 / 20983 elements


In [None]:
# Features selection (--> very slow for n_components > 1000)
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=30)
svd.fit(desc_shuf.T)

In [104]:
reduced_training = svd.components_.T
reduced_training.shape

(20983, 1000)

In [14]:
# cv with only 30 features
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_val_score

k_folds = KFold(n_splits = 10)
clf = LogisticRegression(random_state=0, n_jobs=-1)
scores_empo1 = cross_val_score(clf, reduced_training, envs_shuf["empo_1"], cv=k_folds)
scores_empo2 = cross_val_score(clf, reduced_training, envs_shuf["empo_2"], cv=k_folds)
scores_empo3 = cross_val_score(clf, reduced_training, envs_shuf["empo_3"], cv=k_folds)



NameError: name 'reduced_training' is not defined

In [109]:
print(scores_empo1)
print(scores_empo2)
print(scores_empo3)

[0.89947594 0.90709862 0.90614578 0.89561487 0.89895138 0.89466158
 0.90324118 0.88131554 0.89227836 0.91086749]
[0.76179133 0.75607432 0.75655074 0.75166826 0.74737846 0.73260248
 0.74261201 0.7306959  0.75119161 0.74928503]
[0.57789424 0.56074321 0.5907575  0.5772164  0.56959009 0.56959009
 0.58388942 0.55338418 0.57626311 0.57006673]


In [58]:
# Cross validation Random Forest
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import KFold, cross_val_score

# k_folds = KFold(n_splits = 10)
# clf = RandomForestClassifier()
# scores_empo1 = cross_val_score(clf, desc_shuf, envs_shuf["empo_1"], cv=k_folds)
# scores_empo2 = cross_val_score(clf, desc_shuf, envs_shuf["empo_2"], cv=k_folds)
# scores_empo3 = cross_val_score(clf, desc_shuf, envs_shuf["empo_3"], cv=k_folds)

In [15]:
def logit_score(empo, samples=None):
    clf = RandomForestClassifier(random_state=0, n_jobs=-1)
    clf.fit(desc_train[:samples], envs_train[empo][:samples])
    
    return clf.score(desc_validate, envs_validate[empo])

In [16]:
def RandomForest_score(empo, samples=None):
    clf = LogisticRegression(random_state=0, n_jobs=-1)
    clf.fit(desc_train[:samples], envs_train[empo][:samples])
    
    return clf.score(desc_validate, envs_validate[empo])

In [17]:
import time
start = time.time()
empo_1_LogisticRegressionScore = logit_score('empo_1')
empo_2_LogisticRegressionScore = logit_score('empo_2')
empo_3_LogisticRegressionScore = logit_score('empo_3')
end = time.time()
execution_time_LogisticRegressionScore = end-start

KeyboardInterrupt: 

In [92]:
start = time.time()
empo_1_RandomForestScore = RandomForest_score('empo_1')
empo_2_RandomForestScore = RandomForest_score('empo_2')
empo_3_RandomForestScore = RandomForest_score('empo_3')
end = time.time()
execution_time_RandomForestScore = end-start

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [94]:
print("Logistic regression score using empo_1 = ", empo_1_LogisticRegressionScore)
print("Logistic regression score using empo_2 = ", empo_2_LogisticRegressionScore)
print("Logistic regression score using empo_3 = ", empo_3_LogisticRegressionScore)
print("Random Forest Classifier score using empo_1 = ", empo_1_RandomForestScore)
print("Random Forest Classifier score using empo_2 = ", empo_2_RandomForestScore)
print("Random Forest Classifier score using empo_3 = ", empo_3_RandomForestScore)
print(execution_time_LogisticRegressionScore)
print(execution_time_RandomForestScore)

Logistic regression score using empo_1 =  0.9635453895639743
Logistic regression score using empo_2 =  0.9561591613056946
Logistic regression score using empo_3 =  0.9256611865618298
Random Forest Classifier score using empo_1 =  0.9616392661424827
Random Forest Classifier score using empo_2 =  0.956397426733381
Random Forest Classifier score using empo_3 =  0.9316178222539909
