# Creating `GeneSet`s from Literature Sources

Recall that the minimum requirement for `GeneSet` creation is simply a list of genes.

***Setting up the notebook***

In [None]:
import numpy as np
import pandas as pd
import xarray as xr

import holoviews as hv
hv.extension("bokeh")
%matplotlib inline

import GSForge as gsf

***Declaring used paths***

In [None]:
# OS-independent path management.
from os import fspath, environ
from pathlib import Path

In [None]:
# Root of the demo data. Set the GSFORGE_DEMO_DATA environment variable to
# override the default location under the user's home directory.
OSF_PATH = Path(environ.get("GSFORGE_DEMO_DATA", "~/GSForge_demo_data/osfstorage")).expanduser()

# Raw supplemental files used as literature sources.
SI_FILE_1_PATH = OSF_PATH / 'raw_annotation_data' / 'TPC2016-00158-LSBR2_Supplemental_File_1.csv'
SI_FILE_5_PATH = OSF_PATH / 'raw_annotation_data' / 'TPC2016-00158-LSBR2_Supplemental_File_5.txt'

# Input AnnotatedGEM file.
HYDRO_GEM_PATH = OSF_PATH / "AnnotatedGEMs" / "oryza_sativa_hydro_raw.nc"

# Output location for the collections created in this notebook.
LITERATURE_COLL_PATH = OSF_PATH / 'Collections' / 'literature'

***Load an AnnotatedGEM***

In [None]:
# Build the AnnotatedGEM from the file at HYDRO_GEM_PATH; the bare name on the
# last line displays the object as the cell output.
agem = gsf.AnnotatedGEM(HYDRO_GEM_PATH)
agem

## Load Data

In [None]:
# Preview up to the first six lines of the raw supplemental file.
with open(SI_FILE_1_PATH) as myfile:
    # zip() stops as soon as either iterable is exhausted, so a file shorter
    # than six lines is shown in full instead of raising StopIteration.
    head = ''.join(line for _, line in zip(range(6), myfile))
print(head)

In [None]:
# The first three lines of the file are skipped; the first remaining column
# is used as the row index.
si1_df = pd.read_csv(SI_FILE_1_PATH, index_col=0, skiprows=3)
si1_df.head(5)

In [None]:
# Display the GEM's `Gene` variable so its identifiers can be compared by eye
# with the index of the table loaded above.
agem.data.Gene

In [None]:
# Preview up to the first six lines of the raw supplemental file.
with open(SI_FILE_5_PATH) as myfile:
    # zip() stops as soon as either iterable is exhausted, so a file shorter
    # than six lines is shown in full instead of raising StopIteration.
    head = ''.join(line for _, line in zip(range(6), myfile))
print(head)

In [None]:
# Tab-separated file: the first two lines are skipped and the first remaining
# column is used as the row index.
si5_df = pd.read_csv(SI_FILE_5_PATH, sep='\t', index_col=0, skiprows=2)
si5_df.head(5)

## Wrangle Data and Create Collections

At the very least we need a name and a list of genes.

In [None]:
# Search the GEM's gene identifiers for candidates that could correspond to
# the literature ID 'ChrSy.fgenesh.gene.37'.
genes = agem.data.Gene.to_series()
# `str.contains` interprets the pattern as a regular expression, so the literal
# dots are escaped; only the `.*` segment is meant to match arbitrary text.
genes[genes.str.contains(r'ChrSy\.fgenesh\..*\.37')]

In [None]:
# Manual replacements for literature identifiers that have a differently named
# counterpart in the GEM.
mappings = {'ChrSy.fgenesh.gene.37': 'ChrSy.fgenesh.mRNA.37'}


def parse_gene_splices(gene, gene_index: pd.Series, replacement_mappings=mappings):
    """Resolve ``gene`` to an identifier found in ``gene_index`` where possible.

    Candidates are tried in this order and the first hit is returned:

    1. ``gene`` itself;
    2. ``gene`` with the suffix ``.1``, ``.2`` or ``.3`` appended;
    3. a truthy entry for ``gene`` in ``replacement_mappings`` (skipped when
       ``replacement_mappings`` is ``None``).

    Membership is tested with the ``in`` operator on ``gene_index``. For a
    pandas Series that checks the index labels, not the values, so the Series
    is expected to be indexed by gene identifier.

    Returns ``gene`` unchanged when none of the above produces a match.
    """
    candidates = [gene] + [f'{gene}.{suffix}' for suffix in (1, 2, 3)]
    for candidate in candidates:
        if candidate in gene_index:
            return candidate

    if replacement_mappings is None:
        return gene

    replacement = replacement_mappings.get(gene)
    return replacement if replacement else gene

In [None]:
lit_dge_coll = gsf.GeneSetCollection(gem=agem, name='Literature DGE')

# The lookup Series is the same for every column and gene, so build it once
# instead of once per gene.
gem_gene_index = agem.data.Gene.to_series()

for col in si1_df.columns:
    # Index labels of the rows with a positive value in this column form the set.
    genes = si1_df[si1_df[col] > 0].index.values
    genes = np.asarray([parse_gene_splices(gene, gem_gene_index)
                        for gene in genes])

    # Print any identifiers that are still absent from the GEM after parsing.
    diff = np.setdiff1d(genes, agem.data.Gene.values)
    if diff.shape[0] > 0:
        print(diff)

    lit_dge_coll[col] = gsf.GeneSet.from_gene_array(genes, name=col)

lit_dge_coll

In [None]:
lit_tf_coll = gsf.GeneSetCollection(gem=agem, name='Literature Transcription Factors')

# The lookup Series is the same for every row and gene, so build it once
# instead of once per gene.
gem_gene_index = agem.data.Gene.to_series()

for name, values in si5_df.iterrows():
    # The first column of each row is split on commas to obtain the gene IDs.
    # Surrounding whitespace is stripped and empty tokens (e.g. from a trailing
    # comma) are dropped so they cannot end up in the set as bogus identifiers.
    raw_genes = [token.strip() for token in values.values[0].split(',')]
    genes = np.asarray([parse_gene_splices(gene, gem_gene_index)
                        for gene in raw_genes if gene])
    lit_tf_coll[name] = gsf.GeneSet.from_gene_array(genes, name=name)

    # Print any identifiers that are still absent from the GEM after parsing.
    diff = np.setdiff1d(genes, agem.data.Gene.values)
    if diff.shape[0] > 0:
        print(diff)

lit_tf_coll

In [None]:
# Save each collection to its own location under LITERATURE_COLL_PATH.
lit_dge_coll.save(LITERATURE_COLL_PATH / 'DGE')
lit_tf_coll.save(LITERATURE_COLL_PATH / 'TF')