# Data wrangling : sparsify, use indices for labels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import sparse as sps
import time

plt.style.use('ggplot')

In [None]:
# Per-sample environment annotations; the first CSV column becomes the index.
envs = pd.read_csv('training_environments.csv', index_col=0)
empo_names = [f'empo_{i}' for i in range(1, 4)]

# For every EMPO column: its distinct labels (as strings) in order of first
# appearance. A label's position in the list is its integer code.
empo_index_to_label = [
    [str(row) for row in envs.drop_duplicates(subset=empo)[empo]]
    for empo in empo_names
]

# Inverse lookup: {column name: {label text: integer code}}.
empo_label_to_index = {}
for name, labels in zip(empo_names, empo_index_to_label):
    empo_label_to_index[name] = {label: i for i, label in enumerate(labels)}
empo_label_to_index

In [None]:
# replace text labels with integers
# empo_label_to_index is a nested mapping {column: {label: code}}, so each
# column is recoded with its own table.
envs = envs.replace(empo_label_to_index)

In [None]:
import numpy as np

In [None]:
def save_as_sparse(in_filename, out_filename):
    """Convert a dense integer CSV into a SciPy sparse matrix saved as ``.npz``.

    The first line (header) and the first field of every line (row id) are
    dropped; the remaining fields are parsed with ``int``. Blank lines are
    ignored.

    Parameters
    ----------
    in_filename : path of the CSV file to read.
    out_filename : path handed to ``scipy.sparse.save_npz``.

    Raises
    ------
    ValueError
        If the file holds no data rows, or a field is not an integer.
    """
    # First pass only counts lines so the progress message can show a total.
    with open(in_filename) as f:
        line_count = sum(1 for _ in f)

    rows = []
    with open(in_filename) as f:
        next(f, None)  # skip the header line
        # start=1 keeps `i` equal to the line's position in the file
        for i, line in enumerate(f, start=1):
            stripped = line.strip()
            if not stripped:
                # a blank line would give a 0-column row and break vstack
                continue
            values = [int(x) for x in stripped.split(',')[1:]]
            rows.append(sps.csr_matrix(values))

            if i % 1000 == 0:
                print(f'Sparsifying {in_filename} [row {i} / {line_count}]\r')

    if not rows:
        raise ValueError(f'{in_filename} contains no data rows')

    mat = sps.vstack(rows)
    sps.save_npz(out_filename, mat)

In [None]:
from pathlib import Path

def maybe_sparsify(in_filename, out_filename):
    """Run ``save_as_sparse`` unless *out_filename* already exists as a file."""
    already_converted = Path(out_filename).is_file()
    if already_converted:
        return
    save_as_sparse(in_filename, out_filename)
        
def get_header_line(csv_file):
    """Return the first line's comma-separated fields, minus the first one.

    The result is a NumPy array of strings.
    """
    with open(csv_file) as handle:
        header = next(handle)
    fields = header.rstrip().split(',')
    return np.array(fields[1:])
    
# Convert both descriptor tables; the conversion is skipped when the .npz
# file is already on disk.
for dataset in ('training', 'challenge'):
    maybe_sparsify(f'{dataset}_descriptors.csv', f'{dataset}_descriptors_sparse.npz')

In [None]:
# Column names of the descriptor table (header fields minus the first one).
# NOTE(review): this reads 'training_descriptors_header.csv', not the
# 'training_descriptors.csv' file that is sparsified — confirm the header
# file exists and lists the columns in the same order.
desc_species_names = get_header_line('training_descriptors_header.csv')

# Sparse descriptor matrix produced by save_as_sparse.
desc = sps.load_npz('training_descriptors_sparse.npz')
# presumably one row per species and one column per taxonomy level — TODO confirm
species = pd.read_csv('bacterial_species.csv', index_col=0)

In [None]:
def sparse_megabytes(a):
    """Return the size of sparse matrix *a* in units of 1024 * 1024 bytes.

    Adds up the byte sizes of ``a.data``, ``a.indptr`` and ``a.indices``.
    """
    buffers = (a.data, a.indptr, a.indices)
    total_bytes = sum(buf.nbytes for buf in buffers)
    return total_bytes / (1024 * 1024)

# Report the footprint of the sparse descriptor matrix, to two decimals.
print(f'In-memory size of desc : {sparse_megabytes(desc):.2f}M')

In [None]:
def to_taxonomy_df(desc, taxon_level):
    """Aggregate the per-species columns of *desc* into per-taxon columns.

    Reads the module-level ``species`` table and ``desc_species_names``.

    Parameters
    ----------
    desc : matrix with one column per entry of ``desc_species_names``.
    taxon_level : name of the ``species`` column to group by.

    Returns
    -------
    (taxon_names, table)
        ``taxon_names`` holds the distinct taxa in order of first appearance.
        ``table`` is ``desc @ D``: column ``j`` is the sum of the ``desc``
        columns whose species belongs to ``taxon_names[j]``.
    """
    taxons = species[taxon_level][desc_species_names]
    taxon_indices, taxon_names = pd.factorize(taxons)

    # factorize codes a missing taxon as -1; such species are assigned to no
    # output column instead of crashing the sparse constructor.
    assigned = taxon_indices >= 0
    row_ind = np.flatnonzero(assigned)
    col_ind = taxon_indices[assigned]
    data = np.ones(row_ind.shape)

    # D[i, j] == 1 when species i belongs to taxon j. The shape is explicit so
    # it does not depend on which indices happen to be present.
    D = sps.csr_matrix(
        (data, (row_ind, col_ind)),
        shape=(taxon_indices.shape[0], len(taxon_names)),
    )

    table = desc @ D

    return taxon_names, table

# Per taxonomy level: the taxon names and the aggregated descriptor table.
taxon_names, taxons = {}, {}

for taxon_level in species.columns:
    level_names, level_table = to_taxonomy_df(desc, taxon_level)
    taxon_names[taxon_level] = level_names
    taxons[taxon_level] = level_table

# Classification

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, cross_validate

# Shared estimator instances reused by the cells below: fixed seed, and
# n_jobs=-1 to use all available processors.
clf_logit = LogisticRegression(random_state=0, n_jobs=-1)
clf_rforest = RandomForestClassifier(random_state=0, n_jobs=-1)

In [None]:
def cross_validate_clf(clf, desc, empo, samples=None, fast=False, n_splits=5):
    """Score *clf* on (*desc*, *empo*) by K-fold CV or by a single hold-out split.

    Parameters
    ----------
    clf : estimator exposing ``fit``, ``predict`` and ``score``.
    desc : feature matrix, rows are samples; must support ``desc[int_array]``.
    empo : label vector aligned row by row with *desc*.
    samples : number of rows to keep after shuffling; ``None`` keeps all.
        Values larger than the dataset are clamped.
    fast : if true, skip cross-validation and use one train/validation split
        with ``1 - 1 / n_splits`` of the rows for training.
    n_splits : number of folds (or inverse validation fraction when *fast*).

    Returns
    -------
    dict
        With ``fast=False``, the result of ``sklearn.model_selection.cross_validate``
        scored on ``accuracy`` and ``f1_weighted``. With ``fast=True``, a dict
        with one-element arrays under ``'test_accuracy'`` and ``'test_f1_weighted'``.
    """
    n_total = desc.shape[0]
    samples = n_total if samples is None else min(samples, n_total)

    # Shuffle ALL row positions, then truncate, so that a subsample is drawn
    # from the whole dataset and not from its first `samples` rows.
    idx = np.arange(n_total)
    gen = np.random.default_rng(0)
    gen.shuffle(idx)
    idx = idx[:samples]

    desc = desc[idx]
    # Index by position: a pandas Series would otherwise be looked up by label.
    empo = np.asarray(empo)[idx]

    if not fast:
        # cross validation
        k_folds = KFold(n_splits=n_splits, shuffle=True, random_state=0)
        return cross_validate(clf, desc, empo, cv=k_folds, scoring=['accuracy', 'f1_weighted'], n_jobs=-1)

    # Only the fast path needs f1_score, so it is imported here.
    from sklearn.metrics import f1_score

    print('Warning : using fast evaluation, cross-validation turned off.')

    train_pcent = 1 - 1 / n_splits
    train_count = int(round(train_pcent * samples))

    desc_train = desc[:train_count]
    desc_validate = desc[train_count:]

    empo_train = empo[:train_count]
    empo_validate = empo[train_count:]

    clf.fit(desc_train, empo_train)
    accuracy = clf.score(desc_validate, empo_validate)

    # f1_score expects (y_true, y_pred); the order matters for 'weighted'.
    f1 = f1_score(empo_validate, clf.predict(desc_validate), average='weighted')

    return {'test_accuracy': np.array([accuracy]), 'test_f1_weighted': np.array([f1])}

#### The scores.csv file stores, for every classifier and EMPO level, the cross-validation accuracy, the weighted F1 score and the fit/score times, computed on each taxonomy level and on the full set of initial descriptors

In [None]:
def write_scores_file():
    """Cross-validate every classifier / EMPO level / feature set; save ``scores.csv``.

    Uses the module-level ``taxons``, ``species``, ``desc``, ``envs``,
    ``empo_names`` and the two shared classifiers. Each row of the CSV holds
    the raw ``cross_validate`` output plus a ``*_median`` column per metric.
    """
    feature_sets = [(level, taxons[level]) for level in species.columns]
    feature_sets.append(('desc', desc))
    classifiers = [('rforest', clf_rforest), ('logit', clf_logit)]

    records = []
    for clf_name, clf in classifiers:
        for empo in empo_names:
            for feature_name, feature_vector in feature_sets:
                print(clf_name, empo, feature_name)

                result = cross_validate_clf(clf, feature_vector, envs[empo], samples=None)
                # tag the result with the configuration that produced it
                result.update(clf_name=clf_name, empo=empo, features=feature_name)
                records.append(result)

    table = pd.DataFrame(records)
    for metric in ('test_accuracy', 'score_time', 'test_f1_weighted', 'fit_time'):
        # each cell holds one value per fold; summarise with the median
        table[f'{metric}_median'] = table[metric].apply(lambda x: np.median(x))
    table.to_csv('scores.csv')
    
# Runs every cross-validation and (re)writes scores.csv.
write_scores_file()

In [None]:
# Number of distinct values in each column of the species table.
for taxonomy in species:
    distinct_values = species[taxonomy].unique()
    print(taxonomy, distinct_values.size)

In [None]:
import sys

def biological_interpretation(taxon_level, empo_name, n, out=sys.stdout):
    """Print, per environment label, the taxa with the most extreme coefficients.

    Refits the module-level ``clf_logit`` on ``taxons[taxon_level]`` against
    ``envs[empo_name]``; the shared classifier stays fitted on that data
    afterwards.

    Parameters
    ----------
    taxon_level : key into ``taxons`` / ``taxon_names``.
    empo_name : key into ``envs`` / ``empo_label_to_index``.
    n : how many taxa to list at each end of the coefficient ranking.
    out : text stream the report is printed to.
    """
    clf_logit.fit(taxons[taxon_level], envs[empo_name])

    # For every coefficient row: feature positions sorted by ascending coefficient.
    feature_order_idx = np.argsort(clf_logit.coef_, axis=1)
    if len(empo_label_to_index[empo_name]) == 2:
        # With two labels there is a single coefficient row. It is used as-is
        # for label index 1 and reversed for label index 0.
        ascending = feature_order_idx[0]
        feature_order_idx = np.array([ascending[::-1], ascending])

    names = taxon_names[taxon_level]
    for env_name, env_idx in empo_label_to_index[empo_name].items():
        print(f'{env_name} :', file=out)

        order = feature_order_idx[env_idx]
        bottom_n = names[order[:n]]
        top_n = names[order[-n:]]
        # name[5:] drops the first five characters of each taxon name —
        # presumably a fixed-width prefix; TODO confirm against the data.
        print(f'  Top {n} least correlated : {", ".join(name[5:] for name in bottom_n)}', file=out)
        print(f'  Top {n} most correlated : {", ".join(name[5:] for name in top_n)}', file=out)

# open(..., 'w') does not create missing directories, so create it up front.
Path('biological_interpretation').mkdir(parents=True, exist_ok=True)

# One report file per taxonomy level, covering every EMPO level.
for taxon_level in taxon_names:
    # NOTE(review): 'taxonomy_0' is deliberately skipped; the reason is not
    # visible here.
    if taxon_level == 'taxonomy_0':
        continue
    with open(f'biological_interpretation/{taxon_level}.txt', 'w') as f:
        for empo_name in empo_names:
            print(f'{empo_name} :', file=f)
            biological_interpretation(taxon_level, empo_name, 5, out=f)
            print('', file=f)

# Dimensionality reduction of the bacteria dataset

In [None]:
# Best number of components for Truncated SVD
from sklearn.decomposition import TruncatedSVD

# Accumulators filled by the sweep over SVD sizes below: one entry is
# appended per tested n_components.
explained_variance_ratio = []
score_cv = []
time_cv = []

def eval_clf(clf, desc, empo_name):
    """Cross-validate *clf* on *desc* and return ``(test_accuracy, fit_time)``.

    Parameters
    ----------
    clf : estimator passed on to ``cross_validate_clf``.
    desc : feature matrix.
    empo_name : name of a column of the module-level ``envs`` table. A label
        vector is also accepted and used as-is.

    Returns
    -------
    The ``'test_accuracy'`` and ``'fit_time'`` entries of the CV result.
    """
    # cross_validate_clf needs the label vector itself, not the column name.
    labels = envs[empo_name] if isinstance(empo_name, str) else empo_name
    scores = cross_validate_clf(clf, desc, labels)
    return scores['test_accuracy'], scores['fit_time']

# Sweep over the number of SVD components: record the explained variance and
# the logistic-regression CV accuracy / fit time for each EMPO level.
for i in [100,200,500,1000,2000]:
    print(i)
    # seeded like every other estimator in this notebook, so reruns agree
    svd = TruncatedSVD(n_components=i, random_state=0)
    svd.fit(desc)
    desc_reduced = svd.transform(desc)
    explained_variance_ratio.append(svd.explained_variance_ratio_.sum())

    fold_scores = []
    fold_times = []
    for empo_name in empo_names:
        accuracy_per_fold, time_per_fold = eval_clf(clf_logit, desc_reduced, empo_name)
        fold_scores.append(accuracy_per_fold)
        fold_times.append(time_per_fold)

    # One scalar per EMPO level for both metrics, so each later plot draws a
    # single curve per level.
    score_cv.append([np.mean(s) for s in fold_scores])
    time_cv.append([np.mean(t) for t in fold_times])
    

In [None]:
# Split time_cv (one row per SVD size) into one list per EMPO level.
time_cv_empo1 = [time_cv[i][0] for i in range(5)]
time_cv_empo2 = [time_cv[i][1] for i in range(5)]
time_cv_empo3 = [time_cv[i][2] for i in range(5)]

In [None]:
# Split score_cv (one row per SVD size) into one list per EMPO level.
score_cv_empo1 = [score_cv[i][0] for i in range(5)]
score_cv_empo2 = [score_cv[i][1] for i in range(5)]
score_cv_empo3 = [score_cv[i][2] for i in range(5)]

In [None]:
# CV accuracy against the number of SVD components, one curve per EMPO level.
n_components = [100,200,500,1000,2000]
score_curves = (
    ('score cv empo_1', score_cv_empo1),
    ('score cv empo_2', score_cv_empo2),
    ('score cv empo_3', score_cv_empo3),
)
for curve_label, curve in score_curves:
    plt.plot(n_components, curve, label=curve_label)
plt.ylabel('Cross validation score')
plt.xlabel('Number of features')
plt.legend()

In [None]:
# Summed explained-variance ratio recorded for each tested SVD size.
plt.plot(n_components, explained_variance_ratio , label = 'Explained variance')
plt.xlabel('Number of features')
plt.ylabel('Explained variance ratio')
plt.legend()

In [None]:
# CV fit time against the number of SVD components, per EMPO level.
time_curves = (
    ('time cv empo_1', time_cv_empo1),
    ('time cv empo_2', time_cv_empo2),
    ('time cv empo_3', time_cv_empo3),
)
for curve_label, curve in time_curves:
    plt.plot(n_components, curve, label=curve_label)
plt.ylabel('Cross validation computational time')
plt.xlabel('Number of features')
plt.legend()

## SVD with n = 500

In [None]:
from sklearn.decomposition import TruncatedSVD

# Fit the 500-component SVD and report the wall-clock time of the fit.
start_time = time.time()
# seeded like the other estimators so the reduced features are reproducible
svd500 = TruncatedSVD(n_components=500, random_state=0)
svd500.fit(desc)
print("svd with 500 components takes:" , (time.time() - start_time), "seconds")

In [None]:
# Total share of variance captured by the 500 components (cell output).
svd500.explained_variance_ratio_.sum()

In [None]:
# Project the descriptors onto the 500 fitted components.
desc_reduced500 = svd500.transform(desc)

### SVD Logistic Regression

In [None]:
# Logistic regression on the SVD-reduced features, EMPO level 1.
score1_logit_reduced500 = cross_validate_clf(clf_logit, desc_reduced500, envs['empo_1'])

In [None]:
# Logistic regression on the SVD-reduced features, EMPO level 2.
score2_logit_reduced500 = cross_validate_clf(clf_logit, desc_reduced500, envs['empo_2'])

In [None]:
# Logistic regression on the SVD-reduced features, EMPO level 3.
score3_logit_reduced500 = cross_validate_clf(clf_logit, desc_reduced500, envs['empo_3'])

In [None]:
# Inspect which metrics the cross-validation result contains (cell output).
score1_logit_reduced500.keys()

In [None]:
# reduced scores: per-fold test accuracy for EMPO levels 1 to 3
scores_logit_reduced500 = [
    cv_result.get('test_accuracy')
    for cv_result in (score1_logit_reduced500, score2_logit_reduced500, score3_logit_reduced500)
]

In [None]:
# scores of the full dataset and the reduced one, logistic regression
scores = pd.read_csv('scores.csv', index_col=0)


def _parse_fold_values(cell):
    """Turn a stringified array such as '[0.9 0.8]' into a list of floats."""
    return [float(x) for x in cell[1:-1].split()]


# Narrow the table step by step: EMPO level -> classifier -> feature set.
empo1, empo2, empo3 = (
    scores.loc[scores['empo'] == level] for level in ('empo_1', 'empo_2', 'empo_3')
)
empo1_logit, empo2_logit, empo3_logit = (
    frame.loc[frame['clf_name'] == 'logit'] for frame in (empo1, empo2, empo3)
)
empo1_logit_desc, empo2_logit_desc, empo3_logit_desc = (
    frame.loc[frame['features'] == 'desc'] for frame in (empo1_logit, empo2_logit, empo3_logit)
)

# The per-fold arrays were written to the CSV as text; parse them back.
empo1_logit_desc_test_accuracy = _parse_fold_values(empo1_logit_desc['test_accuracy'].values[0])
empo2_logit_desc_test_accuracy = _parse_fold_values(empo2_logit_desc['test_accuracy'].values[0])
empo3_logit_desc_test_accuracy = _parse_fold_values(empo3_logit_desc['test_accuracy'].values[0])

scores_logit = [
    empo1_logit_desc_test_accuracy,
    empo2_logit_desc_test_accuracy,
    empo3_logit_desc_test_accuracy,
]

In [None]:
# logistic regression cv scores with non reduced and reduced variables (zoom)
bar_width = 0.2
for i in range(0,3):
    fig, ax = plt.subplots()
    mean = np.mean(scores_logit[i])
    mean_reduced = np.mean(scores_logit_reduced500[i])
    # fold numbers 1..k, derived from the data instead of hard-coded
    folds = np.arange(1, len(scores_logit[i]) + 1)
    # Bars sit side by side so neither series hides the other. Colours are
    # pinned so they match the title whatever matplotlib style is active.
    ax.bar(folds - bar_width / 2, scores_logit[i], width=bar_width, color='tab:blue')
    ax.bar(folds + bar_width / 2, scores_logit_reduced500[i], width=bar_width, color='tab:orange')
    # dashed horizontal lines mark the mean of each series
    ax.plot([0.5, folds[-1] + 0.5], [mean, mean], linestyle="dashed", color='tab:blue')
    ax.plot([0.5, folds[-1] + 0.5], [mean_reduced, mean_reduced], linestyle="dashed", color='tab:orange')
    ax.set_xticks(folds)
#     plt.ylim(0.80, 1)
    ax.set_title(f'5 folds cross validation scores using empo {i+1}, \n non reduced (blue) and reduced (orange) dataset')

In [None]:
# time: per-fold fit times on the reduced features, EMPO levels 1 to 3
times_logit_reduced500 = [
    cv_result.get('fit_time')
    for cv_result in (score1_logit_reduced500, score2_logit_reduced500, score3_logit_reduced500)
]

# fit_time cells of scores.csv are stringified arrays "[t1 t2 ...]":
# drop the brackets and split on whitespace.
empo1_logit_desc_time, empo2_logit_desc_time, empo3_logit_desc_time = (
    [float(x) for x in frame['fit_time'].values[0][1:-1].split()]
    for frame in (empo1_logit_desc, empo2_logit_desc, empo3_logit_desc)
)

times_logit = [empo1_logit_desc_time, empo2_logit_desc_time, empo3_logit_desc_time]

In [None]:
# logistic regression fit times with non reduced and reduced variables
bar_width = 0.2
for i in range(0,3):
    fig, ax = plt.subplots()
    mean = np.mean(times_logit[i])
    mean_reduced = np.mean(times_logit_reduced500[i])
    # fold numbers 1..k, derived from the data instead of hard-coded
    folds = np.arange(1, len(times_logit[i]) + 1)
    # Bars sit side by side so neither series hides the other. Colours are
    # pinned so they match the title whatever matplotlib style is active.
    ax.bar(folds - bar_width / 2, times_logit[i], width=bar_width, color='tab:blue')
    ax.bar(folds + bar_width / 2, times_logit_reduced500[i], width=bar_width, color='tab:orange')
    # dashed horizontal lines mark the mean of each series
    ax.plot([0.5, folds[-1] + 0.5], [mean, mean], linestyle="dashed", color='tab:blue')
    ax.plot([0.5, folds[-1] + 0.5], [mean_reduced, mean_reduced], linestyle="dashed", color='tab:orange')
    ax.set_xticks(folds)
    ax.set_title(f'5 folds cross validation computational time using empo {i+1}, \n non reduced (blue) and reduced (orange) dataset')

### SVD Random Forest

In [None]:
# Random forest on the SVD-reduced features, EMPO level 1.
score1_rforest_reduced500 = cross_validate_clf(clf_rforest, desc_reduced500, envs['empo_1'])

In [None]:
# Random forest on the SVD-reduced features, EMPO level 2.
score2_rforest_reduced500 = cross_validate_clf(clf_rforest, desc_reduced500, envs['empo_2'])

In [None]:
# Random forest on the SVD-reduced features, EMPO level 3.
score3_rforest_reduced500 = cross_validate_clf(clf_rforest, desc_reduced500, envs['empo_3'])

In [None]:
# reduced scores: per-fold test accuracy for EMPO levels 1 to 3
scores_rforest_reduced500 = [
    cv_result.get('test_accuracy')
    for cv_result in (score1_rforest_reduced500, score2_rforest_reduced500, score3_rforest_reduced500)
]

In [None]:
# scores of the full dataset and the reduced one, random forest
scores = pd.read_csv('scores.csv', index_col=0)

# Narrow the table step by step: EMPO level -> classifier -> feature set.
empo1 = scores.loc[scores['empo'] == 'empo_1']
empo2 = scores.loc[scores['empo'] == 'empo_2']
empo3 = scores.loc[scores['empo'] == 'empo_3']

empo1_rforest = empo1.loc[empo1['clf_name'] == 'rforest']
empo2_rforest = empo2.loc[empo2['clf_name'] == 'rforest']
empo3_rforest = empo3.loc[empo3['clf_name'] == 'rforest']

empo1_rforest_desc = empo1_rforest.loc[empo1_rforest['features'] == 'desc']
empo2_rforest_desc = empo2_rforest.loc[empo2_rforest['features'] == 'desc']
empo3_rforest_desc = empo3_rforest.loc[empo3_rforest['features'] == 'desc']

# The per-fold arrays were written to the CSV as text "[v1 v2 ...]":
# drop the brackets and split on whitespace.
a = empo1_rforest_desc['test_accuracy'].values[0]
empo1_rforest_desc_test_accuracy = [float(x) for x in a[1:-1].split()]
a = empo2_rforest_desc['test_accuracy'].values[0]
empo2_rforest_desc_test_accuracy = [float(x) for x in a[1:-1].split()]
a = empo3_rforest_desc['test_accuracy'].values[0]
empo3_rforest_desc_test_accuracy = [float(x) for x in a[1:-1].split()]

# one entry per EMPO level, in order 1, 2, 3
scores_rforest = [empo1_rforest_desc_test_accuracy, empo2_rforest_desc_test_accuracy, empo3_rforest_desc_test_accuracy]

In [None]:
# random forest cv scores with non reduced and reduced variables (zoom)
bar_width = 0.2
for i in range(0,3):
    fig, ax = plt.subplots()
    mean = np.mean(scores_rforest[i])
    mean_reduced = np.mean(scores_rforest_reduced500[i])
    # fold numbers 1..k, derived from the data instead of hard-coded
    folds = np.arange(1, len(scores_rforest[i]) + 1)
    # Bars sit side by side so neither series hides the other. Colours are
    # pinned so they match the title whatever matplotlib style is active.
    ax.bar(folds - bar_width / 2, scores_rforest[i], width=bar_width, color='tab:blue')
    ax.bar(folds + bar_width / 2, scores_rforest_reduced500[i], width=bar_width, color='tab:orange')
    # dashed horizontal lines mark the mean of each series
    ax.plot([0.5, folds[-1] + 0.5], [mean, mean], linestyle="dashed", color='tab:blue')
    ax.plot([0.5, folds[-1] + 0.5], [mean_reduced, mean_reduced], linestyle="dashed", color='tab:orange')
    ax.set_xticks(folds)
#     plt.ylim(0.80, 1)
    ax.set_title(f'5 folds cross validation scores using empo {i+1}, \n non reduced (blue) and reduced (orange) dataset')

In [None]:
# time: per-fold fit times on the reduced features, EMPO levels 1 to 3
times_rforest_reduced500 = [
    cv_result.get('fit_time')
    for cv_result in (score1_rforest_reduced500, score2_rforest_reduced500, score3_rforest_reduced500)
]

# fit_time cells of scores.csv are stringified arrays "[t1 t2 ...]":
# drop the brackets and split on whitespace.
empo1_rforest_desc_time, empo2_rforest_desc_time, empo3_rforest_desc_time = (
    [float(x) for x in frame['fit_time'].values[0][1:-1].split()]
    for frame in (empo1_rforest_desc, empo2_rforest_desc, empo3_rforest_desc)
)

times_rforest = [empo1_rforest_desc_time, empo2_rforest_desc_time, empo3_rforest_desc_time]

In [None]:
# random forest fit times with non reduced and reduced variables
bar_width = 0.2
for i in range(0,3):
    fig, ax = plt.subplots()
    mean = np.mean(times_rforest[i])
    mean_reduced = np.mean(times_rforest_reduced500[i])
    # fold numbers 1..k, derived from the data instead of hard-coded
    folds = np.arange(1, len(times_rforest[i]) + 1)
    # Bars sit side by side so neither series hides the other. Colours are
    # pinned so they match the title whatever matplotlib style is active.
    ax.bar(folds - bar_width / 2, times_rforest[i], width=bar_width, color='tab:blue')
    ax.bar(folds + bar_width / 2, times_rforest_reduced500[i], width=bar_width, color='tab:orange')
    # dashed horizontal lines mark the mean of each series
    ax.plot([0.5, folds[-1] + 0.5], [mean, mean], linestyle="dashed", color='tab:blue')
    ax.plot([0.5, folds[-1] + 0.5], [mean_reduced, mean_reduced], linestyle="dashed", color='tab:orange')
    ax.set_xticks(folds)
    ax.set_title(f'5 folds cross validation computational time using empo {i+1}, \n non reduced (blue) and reduced (orange) dataset')

# Biological interpretation

In [None]:
# Reload the saved cross-validation results and list the available columns.
scores = pd.read_csv('scores.csv', index_col=0)
scores.keys()

### Scores plots Logistic regression

In [None]:
# One sub-table of the scores per EMPO level.
empo1_scores, empo2_scores, empo3_scores = (
    scores.loc[scores['empo'] == level] for level in ('empo_1', 'empo_2', 'empo_3')
)

In [None]:
# filter the logistic regression rows
# (the mask is built once on the full table and aligned on the index)
is_logit = scores['clf_name'] == 'logit'
empo1_scores_logit, empo2_scores_logit, empo3_scores_logit = (
    frame.loc[is_logit] for frame in (empo1_scores, empo2_scores, empo3_scores)
)

In [None]:
# score time
# Rows 1..-2 are plotted, each divided by the value of the last row.
# NOTE(review): the labels say "mean" but the plotted column is fit_time_median.
plt.figure(figsize=(10,5))
X = empo1_scores_logit['features'].iloc[1:-1]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_logit, 'empo_1'),
    (0, empo2_scores_logit, 'empo_2'),
    (width, empo3_scores_logit, 'empo_3'),
)
for shift, frame, level in grouped:
    fit_times = frame['fit_time_median']
    plt.bar(X_axis + shift, fit_times.iloc[1:-1] / fit_times.iloc[-1], width, label=level)
plt.xticks(X_axis, X)

plt.ylabel('Computational fit time mean')
plt.title('Computational fit time mean using the taxonomies of the bacteria \n with Logistic regression')
plt.legend()

In [None]:
# test accuracy
# NOTE(review): the labels say "mean" but the plotted column is test_accuracy_median.
plt.figure(figsize=(10,5))
X = empo1_scores_logit['features'][1:]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_logit, 'empo_1'),
    (0, empo2_scores_logit, 'empo_2'),
    (width, empo3_scores_logit, 'empo_3'),
)
for shift, frame, level in grouped:
    plt.bar(X_axis + shift, frame['test_accuracy_median'][1:], width, label=level)
plt.xticks(X_axis, X)

plt.ylim([0.5,1])
plt.ylabel('Test accuracy mean')
plt.title('Test accuracy mean using the taxonomies of the bacteria \n with Logistic regression (zoom on [0.5,1])')
plt.legend()

In [None]:
# f1 score
# NOTE(review): the labels say "mean" but the plotted column is test_f1_weighted_median.
plt.figure(figsize=(10,5))
X = empo1_scores_logit['features'][1:]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_logit, 'empo_1'),
    (0, empo2_scores_logit, 'empo_2'),
    (width, empo3_scores_logit, 'empo_3'),
)
for shift, frame, level in grouped:
    plt.bar(X_axis + shift, frame['test_f1_weighted_median'][1:], width, label=level)
plt.xticks(X_axis, X)

plt.ylim([0.5,1])
plt.ylabel('F1 score mean')
plt.title('F1 score mean mean using the taxonomies of the bacteria \n with Logistic regression (zoom on [0.5,1])')
plt.legend()

### Scores plots Random Forest

In [None]:
# filter the random forest rows
# (the mask is built once on the full table and aligned on the index)
is_rforest = scores['clf_name'] == 'rforest'
empo1_scores_rforest, empo2_scores_rforest, empo3_scores_rforest = (
    frame.loc[is_rforest] for frame in (empo1_scores, empo2_scores, empo3_scores)
)

In [None]:
# score time
# NOTE(review): the labels say "mean" but the plotted column is fit_time_median.
plt.figure(figsize=(10,5))
X = empo1_scores_rforest['features'][1:]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_rforest, 'empo_1'),
    (0, empo2_scores_rforest, 'empo_2'),
    (width, empo3_scores_rforest, 'empo_3'),
)
for shift, frame, level in grouped:
    plt.bar(X_axis + shift, frame['fit_time_median'][1:], width, label=level)
plt.xticks(X_axis, X)

plt.ylabel('Computational fit time mean')
plt.title('Computational fit time mean using the taxonomies of the bacteria \n with Random Forest')
plt.legend()

In [None]:
# test accuracy
# NOTE(review): the labels say "mean" but the plotted column is test_accuracy_median.
plt.figure(figsize=(10,5))
X = empo1_scores_rforest['features'][1:]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_rforest, 'empo_1'),
    (0, empo2_scores_rforest, 'empo_2'),
    (width, empo3_scores_rforest, 'empo_3'),
)
for shift, frame, level in grouped:
    plt.bar(X_axis + shift, frame['test_accuracy_median'][1:], width, label=level)
plt.xticks(X_axis, X)

plt.ylim([0.5,1])
plt.ylabel('Test accuracy mean')
plt.title('Test accuracy mean using the taxonomies of the bacteria \n with Random Forest (zoom on [0.5,1])')
plt.legend()

In [None]:
# f1 score
# NOTE(review): the labels say "mean" but the plotted column is test_f1_weighted_median.
plt.figure(figsize=(10,5))
X = empo1_scores_rforest['features'][1:]
X_axis = np.arange(len(X))
width=0.2

grouped = (
    (-width, empo1_scores_rforest, 'empo_1'),
    (0, empo2_scores_rforest, 'empo_2'),
    (width, empo3_scores_rforest, 'empo_3'),
)
for shift, frame, level in grouped:
    plt.bar(X_axis + shift, frame['test_f1_weighted_median'][1:], width, label=level)
plt.xticks(X_axis, X)

plt.ylim([0.5,1])
plt.ylabel('F1 score mean')
plt.title('F1 score mean mean using the taxonomies of the bacteria \n with Random Forest (zoom on [0.5,1])')
plt.legend()

# Challenge prediction

In [None]:
# Sparse challenge descriptors written earlier by maybe_sparsify.
challenge_desc = sps.load_npz('challenge_descriptors_sparse.npz')

In [None]:
# One random-forest fit on the full training set per EMPO level, followed by
# a prediction on the challenge set. fit() returns the estimator itself, so
# the two calls can be chained.
challenge_envs = [
    clf_rforest.fit(desc, envs[empo_name]).predict(challenge_desc)
    for empo_name in empo_names
]

In [None]:
# Stack the per-level predictions into one array: one row per EMPO level.
challenge_envs = np.array(challenge_envs)
challenge_envs

In [None]:
# Write the predictions as CSV: a header with an empty first field, then one
# line per challenge sample with the integer codes mapped back to labels.
# NOTE(review): row ids are generated as challenge_<position>; confirm they
# match the ids in the challenge file.
with open('predictions.csv', 'w') as f:
    header = ',' + ','.join(empo_names)
    f.write(header + '\n')
    for sample_idx, line in enumerate(challenge_envs.T):
        labels = [
            empo_index_to_label[empo_idx][cat_idx]
            for empo_idx, cat_idx in enumerate(line)
        ]
        f.write(f'challenge_{sample_idx},' + ','.join(labels) + '\n')