In [1]:
import numpy as np
from pandas import Series
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
import json
import pandas as pd

In [2]:
# Read the parquet table and keep only the rows whose `embedding` is present.
df = pd.read_parquet("../data/scrna_seq_with_embs.pq").loc[
    lambda frame: frame.embedding.notna()
]

In [8]:
# Frequency of every keyword after flattening the per-row keyword lists,
# dropping missing entries and lower-casing each keyword.
(
    df.keywords
    .explode()
    .dropna()
    .map(str.lower)
    .value_counts()
)

scrna-seq                         829
single-cell rna sequencing        646
single-cell rna-seq               390
single cell                       305
gene expression                   289
                                 ... 
immune cell infiltration (ici)      1
left-sided                          1
right-sided                         1
therapeutic sensitivity             1
mitochondrial biogenesis            1
Name: keywords, Length: 14908, dtype: int64

In [79]:
# Write the title/abstract columns of the first 50 rows to a CSV file (no index column).
df.iloc[:50][['title', 'abstract']].to_csv("../data/mturk.csv", index=False)

In [82]:
# Write the title/abstract columns of the first 100 rows to a tab-separated
# file, with both column headers renamed to carry an `INPUT:` prefix.
(
    df.iloc[:100][['title', 'abstract']]
    .rename(columns={'title': 'INPUT:title', 'abstract': 'INPUT:abstract'})
    .to_csv("../data/mturk.tsv", index=False, sep="\t")
)

In [85]:
# Display the abstract of the row at position 1000.
df['abstract'].iloc[1000]

'Microglia are macrophages present in the brain that function as the primary and most important source of immune response in the central nervous system (CNS). Regardless of their multitasking role, our knowledge regarding their molecular heterogeneity is limited; due to technical restrictions, it is only possible to measure gene expression in cell populations, not individual cells, with the results reflecting average mRNA levels. Therefore, recent scientific approaches have focused on single-cell techniques such as single-cell RNA sequencing (scRNAseq), a powerful technique that enables the delineation of transcriptomic cell-to-cell differences, revealing subpopulations with distinct molecular and functional characteristics. Here, we summarize recent studies that focused on transcriptomic microglial subpopulation clustering and classify them into three distinct groups based on age, spatial distribution, and disease. Additionally, we cross-compare populations from different studies to i

In [4]:
# Load the annotated tasks. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) because open() otherwise decodes with the platform's
# locale default, which can fail or mis-decode non-ASCII text.
with open("../data/labeled_test.json", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
# Label vocabulary; column i of every target row answers "is all_labels[i] chosen?".
all_labels = np.array(['biological', 'computational'])

# Parallel lists: one embedding and one multi-hot target row per usable task.
inp_samples = []
inp_labels = []
for d in data:
    annotations = d['annotations']
    # Skip tasks that have no annotation at all (indexing [0] on an empty list
    # would raise IndexError) and annotations submitted without any result.
    if not annotations or len(annotations[0]['result']) == 0:
        continue

    choices = annotations[0]['result'][0]['value']['choices']
    inp_samples.append(d['data']['embedding'])
    inp_labels.append([a in choices for a in all_labels])

# Only the samples become an array here; inp_labels stays a plain list because
# a later cell extends it with `+` (list concatenation).
inp_samples = np.array(inp_samples)

In [6]:
# Wrap a logistic regression in a multi-output classifier and fit it on the
# labelled samples. The fitted estimator is the cell's displayed value.
model = MultiOutputClassifier(
    estimator=LogisticRegression(C=100, n_jobs=4),
    n_jobs=4,
)
model.fit(np.asarray(inp_samples), np.asarray(inp_labels))

In [60]:
# Stack the per-row embeddings of df into one array (row order follows df).
all_embeddings = np.array(list(df.embedding.values))

# Turn each predicted indicator row into the matching label names.
# The cast to bool matters: if predict() returns integer 0/1 rows (which happens
# when the training targets are ints), all_labels[[1, 0]] would be positional
# fancy indexing and return ['computational', 'biological'] rather than the
# mask selection ['biological'].
all_preds = [all_labels[mask] for mask in model.predict(all_embeddings).astype(bool)]

# For each sample take the top class probability of every output, then keep the
# smallest of those values across outputs.
all_scores = np.min(
    [proba.max(axis=1) for proba in model.predict_proba(all_embeddings)],
    axis=0,
)

In [56]:
# Extend the training set with the rows listed in manual_labels: their
# embeddings are looked up in df by index label and appended below the
# existing samples; their label rows are appended to the existing label list.
inp_samples_adj = np.concatenate(
    [inp_samples, np.array(list(df.embedding.loc[manual_labels.index].values))],
    axis=0,
)
inp_labels_adj = np.array([*inp_labels, *manual_labels.values])

In [58]:
# Refit the existing `model` object on inp_samples_adj / inp_labels_adj.
# The trailing `;` suppresses the notebook's echo of the value fit() returns.
model.fit(inp_samples_adj, inp_labels_adj);

In [64]:
# Number of samples whose score falls below 0.51.
np.sum(all_scores < 0.51)

414

In [41]:
# Hand-assigned label rows keyed by df index label (another cell looks them up
# with df.embedding.loc[manual_labels.index]). Each value is a two-element 0/1
# indicator list, presumably in all_labels order — TODO confirm.
manual_labels = Series({
    2633: [1, 0],
    5163: [1, 0],
    5631: [1, 0],
    2069: [1, 0],
    7103: [0, 1],
    1913: [1, 1],
    6254: [1, 1],
    6272: [1, 1],
    6593: [0, 1],
    7045: [1, 0],
    ###
    67: [1, 0],
    9264: [1, 1],
    3111: [0, 1],
    5739: [1, 1],
    8254: [1, 0],
    909: [1, 1],
    # TODO: key 6327 was left without a value, which made this cell a
    # SyntaxError. Add `6327: [.., ..]` here once that row has been labelled.
})

In [63]:
# Print the (up to) ten rows with the smallest score: index label, score,
# title and abstract. The argsort is computed once instead of on every
# iteration, and slicing avoids an IndexError when fewer than ten rows exist.
for ti in np.argsort(all_scores)[:10]:
    print()
    print(df.index[ti], f"{all_scores[ti]:0.2f}")
    print(df.title.values[ti])
    print(df.abstract.values[ti])


67 0.50
Profiling Transcriptional Heterogeneity with Seq-Well S
Seq-Well is a high-throughput, picowell-based single-cell RNA-seq technology that can be used to simultaneously profile the transcriptomes of thousands of cells (Gierahn et al. Nat Methods 14(4):395-398, 2017). Relative to its reverse-emulsion-droplet-based counterparts, Seq-Well addresses key cost, portability, and scalability limitations. Recently, we introduced an improved molecular biology for Seq-Well to enhance the information content that can be captured from individual cells using the platform. This update, which we call Seq-Well S

9264 0.50
Single cell analysis of the inner ear sensory organs.
The inner ear is composed of a complex mixture of cells, which together allow organisms to hear and maintain balance. The cells in the inner ear, which undergo an extraordinary process of development, have only recently begun to be studied on an individual level. As it has recently become clear that individual cells, previ

array([0.33333342, 0.33333342, 0.33333342])

In [10]:
# Chosen labels of every task whose first annotation has a non-empty result.
[
    task['annotations'][0]['result'][0]['value']['choices']
    for task in data
    if task['annotations'][0]['result']
]

[['computational'], ['biological'], ['biological']]