In [1]:
import pandas as pd

In [2]:
# Environment labels for the training samples; the first CSV column becomes
# the row index.
envs = pd.read_csv('training_environments.csv', index_col=0)

# Names of the three label columns: empo_1, empo_2, empo_3.
empo_names = [f'empo_{i}' for i in range(1, 4)]

# For every label column, the list of its distinct values rendered as strings,
# in the order drop_duplicates() yields them. A label's position in its list
# is the integer code assigned to it below.
empo_index_to_label = []
for empo in empo_names:
    distinct_values = envs.drop_duplicates(subset=empo)[empo]
    empo_index_to_label.append(list(map(str, distinct_values)))

# Inverse lookup: column name -> {label text -> integer code}.
empo_label_to_index = {}
for name, labels in zip(empo_names, empo_index_to_label):
    empo_label_to_index[name] = {label: i for i, label in enumerate(labels)}
empo_label_to_index

{'empo_1': {'Free-living': 0, 'Host-associated': 1},
 'empo_2': {'Non-saline': 0, 'Saline': 1, 'Animal': 2, 'Plant': 3},
 'empo_3': {'Water (non-saline)': 0,
  'Soil (non-saline)': 1,
  'Sediment (saline)': 2,
  'Animal surface': 3,
  'Surface (non-saline)': 4,
  'Animal distal gut': 5,
  'Animal corpus': 6,
  'Plant surface': 7,
  'Water (saline)': 8,
  'Animal secretion': 9,
  'Sediment (non-saline)': 10,
  'Plant rhizosphere': 11,
  'Plant corpus': 12,
  'Surface (saline)': 13,
  'Animal proximal gut': 14,
  'Aerosol (non-saline)': 15,
  'Hypersaline (saline)': 16}}

In [3]:
# Replace the text labels in `envs` with their integer codes, using the nested
# {column: {label: code}} lookup held in `empo_label_to_index`.
envs = envs.replace(to_replace=empo_label_to_index)

In [4]:
from scipy import sparse as sps
import numpy as np

In [5]:
def save_as_sparse(in_filename, out_filename):
    """Convert a CSV of integer rows into a sparse matrix stored as ``.npz``.

    The first line of the file is skipped as a header. On every following
    line the first comma-separated field is dropped and the remaining fields
    are parsed with ``int``; each line becomes one row of the output matrix.
    The stacked matrix is written with ``scipy.sparse.save_npz``.

    Parameters
    ----------
    in_filename : str
        Path of the CSV file to read.
    out_filename : str
        Destination handed to ``scipy.sparse.save_npz``.

    Raises
    ------
    ValueError
        If the file has no lines after the header, or if a field cannot be
        parsed as an integer.
    """
    # Count lines up front for the progress message. The ``with`` block makes
    # sure this handle is closed instead of being left to the garbage collector.
    with open(in_filename) as f:
        line_count = sum(1 for _ in f)

    rows = []
    with open(in_filename) as f:
        next(f, None)  # skip the header line
        # Start at 1 so ``i`` is the line's index in the file, header included.
        for i, line in enumerate(f, start=1):
            values = [int(x) for x in line.strip().split(',')[1:]]
            rows.append(sps.csr_matrix(values))

            if i % 1000 == 0:
                # NOTE(review): the trailing '\r' is followed by print's own
                # newline, so progress lines do not overwrite each other.
                print(f'Sparsifying {in_filename} [row {i} / {line_count}]\r')

    if not rows:
        # Fail with a clear message rather than an opaque error from vstack([]).
        raise ValueError(f'{in_filename} contains no data rows')

    mat = sps.vstack(rows)

    sps.save_npz(out_filename, mat)

In [6]:
from pathlib import Path

def maybe_sparsify(in_filename, out_filename):
    """Run ``save_as_sparse`` unless ``out_filename`` is already a file.

    Parameters
    ----------
    in_filename : str
        CSV file forwarded to ``save_as_sparse``.
    out_filename : str
        Output path; when a regular file already exists there, nothing is done.
    """
    already_converted = Path(out_filename).is_file()
    if already_converted:
        return
    save_as_sparse(in_filename, out_filename)
    
# Convert each descriptor CSV to its sparse .npz counterpart, skipping any
# whose output file is already present.
for csv_name, npz_name in (
    ('training_descriptors.csv', 'training_descriptors_sparse.npz'),
    ('challenge_descriptors.csv', 'challenge_descriptors_sparse.npz'),
):
    maybe_sparsify(csv_name, npz_name)

In [7]:
# Load the sparse training descriptors; one row per sample.
desc = sps.load_npz('training_descriptors_sparse.npz')
n_samples = desc.shape[0]

# Fixed-seed generator so the shuffle is the same on every run.
gen = np.random.default_rng(0)

# Shuffle the row positions in place, then apply the same permutation to the
# descriptors and the labels so rows stay paired.
# NOTE(review): assumes `envs` rows are in the same order as `desc` rows - confirm.
idx = np.arange(n_samples)
gen.shuffle(idx)

desc_shuf, envs_shuf = desc[idx], envs.iloc[idx]

In [8]:
# Fraction of the shuffled samples used for training; the rest is held out
# for validation.
train_pcent = .8
train_count = int(round(n_samples * train_pcent))

# The first `train_count` shuffled rows go to training, the remainder to validation.
desc_train, desc_validate = desc_shuf[:train_count], desc_shuf[train_count:]
envs_train, envs_validate = envs_shuf[:train_count], envs_shuf[train_count:]

In [9]:
from sklearn.linear_model import LogisticRegression

In [10]:
def logit_score(empo, samples=None):
    """Fit a logistic regression on one label column and score it on validation data.

    Parameters
    ----------
    empo : str
        Name of the label column in ``envs_train`` / ``envs_validate``.
    samples : int or None, optional
        Upper bound used to slice the training rows (``[:samples]``);
        ``None`` keeps all of them.

    Returns
    -------
    float
        Result of ``LogisticRegression.score`` on ``desc_validate`` against
        the validation labels of ``empo``.
    """
    features = desc_train[:samples]
    targets = envs_train[empo][:samples]

    model = LogisticRegression(random_state=0, n_jobs=-1).fit(features, targets)

    return model.score(desc_validate, envs_validate[empo])

In [11]:
# Validation score of a logistic regression trained on the `empo_1` labels.
logit_score('empo_1')

0.9614010007147963

In [12]:
# Validation score of a logistic regression trained on the `empo_2` labels.
logit_score('empo_2')

0.9556826304503216

In [16]:
# Validation score of a logistic regression trained on the `empo_3` labels.
logit_score('empo_3')

0.9313795568263045