In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
import seaborn as sns
from patsy import dmatrices, dmatrix
import statsmodels.discrete.count_model as smdc
import pyarrow as pa

In [24]:


class DataSet(dict):
    """Dict-like, column-at-a-time view over a Parquet file.

    Each ``dataset[column]`` lookup reads only that column from the file and
    returns it as the corresponding pandas column, so the whole table is
    never loaded at once.

    Requires ``pq`` (``pyarrow.parquet``) to be in scope; this notebook
    imports it in a separate cell.

    NOTE(review): a second ``DataSet`` class (without the print) is defined
    in another cell of this notebook; whichever cell ran last wins.
    """

    def __init__(self, path):
        """Open the Parquet file at ``path`` (no column data is read yet)."""
        super().__init__()
        print('initializing dataset')
        self.filepath = path
        self.parquet = pq.ParquetFile(self.filepath)

    def __getitem__(self, key):
        """Read column ``key`` from the Parquet file.

        Raises:
            KeyError: if the column cannot be read for any reason. The
                original error is chained as ``__cause__``.
        """
        try:
            return self.parquet.read([key]).to_pandas()[key]
        except Exception as exc:
            # Every read failure is reported as KeyError(key).
            # NOTE(review): presumably this lets formula evaluation (patsy)
            # fall back to other namespaces for names that are not columns
            # -- confirm. `Exception` (not a bare except) is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            raise KeyError(key) from exc

    def __reduce__(self):
        """Pickle as "re-open this path" instead of serializing the data."""
        return (self.__class__, (self.filepath, ))


def poisson_model(counts_parq, patsy_formula):
    """Build (without fitting) the model registered under the 'poisson' key.

    Args:
        counts_parq: data object handed to ``from_formula`` as ``data``.
        patsy_formula: formula string handed to ``from_formula``.

    Returns:
        The unfitted ``smdc.GeneralizedPoisson`` model.

    NOTE(review): despite the name this constructs ``GeneralizedPoisson``,
    not a plain Poisson model -- confirm that this is intended.
    """
    return smdc.GeneralizedPoisson.from_formula(
        formula=patsy_formula,
        data=counts_parq,
    )

def zi_poisson_model(counts_parq, patsy_formula):
    """Build (without fitting) a zero-inflated Poisson model.

    Args:
        counts_parq: data object handed to ``from_formula`` as ``data``.
        patsy_formula: formula string handed to ``from_formula``.

    Returns:
        The unfitted ``smdc.ZeroInflatedPoisson`` model.
    """
    return smdc.ZeroInflatedPoisson.from_formula(
        formula=patsy_formula,
        data=counts_parq,
    )

def negative_binomial_model(counts_parq, patsy_formula):
    """Build (without fitting) a negative binomial (``NegativeBinomialP``) model.

    Args:
        counts_parq: data object handed to ``from_formula`` as ``data``.
        patsy_formula: formula string handed to ``from_formula``.

    Returns:
        The unfitted ``smdc.NegativeBinomialP`` model.
    """
    return smdc.NegativeBinomialP.from_formula(
        formula=patsy_formula,
        data=counts_parq,
    )

def zi_negative_binomial_model(counts_parq, patsy_formula):
    """Build (without fitting) a zero-inflated negative binomial model.

    Args:
        counts_parq: data object handed to ``from_formula`` as ``data``.
        patsy_formula: formula string handed to ``from_formula``.

    Returns:
        The unfitted ``smdc.ZeroInflatedNegativeBinomialP`` model.
    """
    return smdc.ZeroInflatedNegativeBinomialP.from_formula(
        formula=patsy_formula,
        data=counts_parq,
    )

def get_stats(fit_model):
    """Collect summary statistics from a fitted model into a flat list.

    Args:
        fit_model: fitted results object exposing the attributes read below.

    Returns:
        ``[aic, bic, llf, llr, llr_pvalue, prsquared, list(resid), converged]``
        in exactly that order.
    """
    print('getting stats')
    # Attribute names follow the statsmodels CountResults reference:
    # https://www.statsmodels.org/dev/generated/statsmodels.discrete.discrete_model.CountResults.html
    scalar_fields = ('aic', 'bic', 'llf', 'llr', 'llr_pvalue', 'prsquared')
    scalars = [getattr(fit_model, field) for field in scalar_fields]

    residuals = fit_model.resid
    did_converge = fit_model.converged

    # Residuals are materialized as a plain list before being returned.
    return scalars + [list(residuals), did_converge]

def boolean_string(s):
    """Parse the exact strings 'True' / 'False' into a bool.

    Args:
        s: candidate string.

    Returns:
        ``True`` for 'True', ``False`` for 'False'.

    Raises:
        ValueError: for any other value (matching is case-sensitive).
    """
    parsed = {'False': False, 'True': True}
    if s in parsed:
        return parsed[s]
    raise ValueError('Not a valid boolean string')

In [3]:
import pyarrow.parquet as pq

class DataSet(dict):
    """Dict-like, column-at-a-time view over a Parquet file.

    Each ``dataset[column]`` lookup reads only that column from the file and
    returns it as the corresponding pandas column, so the whole table is
    never loaded at once.

    NOTE(review): another ``DataSet`` class (which also prints on
    construction) is defined in a different cell of this notebook; whichever
    cell ran last wins.
    """

    def __init__(self, path):
        """Open the Parquet file at ``path`` (no column data is read yet)."""
        super().__init__()
        self.filepath = path
        self.parquet = pq.ParquetFile(self.filepath)

    def __getitem__(self, key):
        """Read column ``key`` from the Parquet file.

        Raises:
            KeyError: if the column cannot be read for any reason. The
                original error is chained as ``__cause__``.
        """
        try:
            return self.parquet.read([key]).to_pandas()[key]
        except Exception as exc:
            # Every read failure is reported as KeyError(key).
            # NOTE(review): presumably this lets formula evaluation (patsy)
            # fall back to other namespaces for names that are not columns
            # -- confirm. `Exception` (not a bare except) is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            raise KeyError(key) from exc

    def __reduce__(self):
        """Pickle as "re-open this path" instead of serializing the data."""
        return (self.__class__, (self.filepath, ))


In [4]:
# Display preference: render every float with exactly three decimal places
# so printed frames are easier to scan.
pd.set_option(
    'display.float_format',
    lambda value: '%.3f' % value,
)

In [10]:
# Load the raw per-barcode count table and the cell-barcode -> cell-type map.
counts = pd.read_table(
    '../../../data/GSE217686_assigned_oBC_CRE_mBC_joined_counts_sc_rep_mEB_series.txt'
)
cell_type_mapping = pd.read_table(
    '../../../data/meb_cbc_to_cell_type_mapping.txt'
)

# Attach a cell-type annotation to every count row by looking up its cellBC.
# NOTE(review): the .loc lookup assumes cell_type_mapping is indexed by cell
# barcode -- confirm against the mapping file's layout.
annotation_per_row = cell_type_mapping.loc[counts['cellBC']]['annotation']
counts['cell_type_annotation'] = np.array(annotation_per_row)

In [11]:
# Formula string: UMIs_mBC as the response, with cell type, replicate and CRE
# each wrapped in C(...).
# NOTE(review): the later visible cells define and use their own `formula`
# string rather than this variable.
simple_formula = "UMIs_mBC ~ C(cell_type_annotation) + C(rep_id) +  C(CRE_id)"

In [22]:
# Collapse the table so that every row is one (cellBC, CRE_id) pair.
# Numeric columns are summed within a group; non-numeric columns are joined
# into a single ', '-separated string.
group_keys = ['cellBC', 'rep_id', 'CRE_class', 'CRE_id', 'cell_type_annotation']
counts_groupby_cre = (
    counts
    .groupby(by=group_keys, as_index=False)
    .agg(lambda col: col.sum() if np.issubdtype(col.dtype, np.number) else ', '.join(col))
)

# Keep a tab-separated copy of the grouped table on disk.
counts_groupby_cre.to_csv('shendure_counts_grouped.txt', sep='\t', index=False)
counts_groupby_cre.shape

(778248, 11)

In [21]:
# Display the grouped table (pandas truncates the middle rows of a large frame).
counts_groupby_cre

Unnamed: 0,cellBC,rep_id,CRE_class,CRE_id,cell_type_annotation,oBC,mBC,reads_oBC,UMIs_oBC,reads_mBC,UMIs_mBC
0,2B1_AAACCCACAAGGTTGG-1,2B1,devCRE,Btg1_chr10_9572,Neuroectoderm (brain),CATCGCTGAGTAAACG,GCGTACTCACCAGGT,73,67,0,0
1,2B1_AAACCCACAAGGTTGG-1,2B1,devCRE,Gata4_chr14_5710,Neuroectoderm (brain),GAGGATGAGTTGGAAT,CAATCGCACCCCCGA,167,159,0,0
2,2B1_AAACCCACAAGGTTGG-1,2B1,devCRE,Klf4_chr4_3952,Neuroectoderm (brain),CGTGAATTAATTCTAT,AACCCGGTAAATGTA,99,89,0,0
3,2B1_AAACCCACAAGGTTGG-1,2B1,devCRE,Lama1_chr17_7793,Neuroectoderm (brain),AGTAAGTCAGCTCTTT,CGTGACCTCTTCATT,159,147,0,0
4,2B1_AAACCCACAAGGTTGG-1,2B1,devCRE,Sox17_chr1_67,Neuroectoderm (brain),GACAATAAAATTCCAT,ACAGTCACAAATTTA,59,58,0,0
...,...,...,...,...,...,...,...,...,...,...,...
778243,B2_TTTGTTGGTGGACCAA-1,B2,devCRE,Sparc_chr11_7207,Ex. Endoderm (visceral),AATAATCACTCAAATT,TACCAACTGAGACAT,130,104,0,0
778244,B2_TTTGTTGGTGGACCAA-1,B2,devCRE,Tgfbi_chr13_5735,Ex. Endoderm (visceral),AGTCCATGGAGGGAGG,GTTTACCACATTACT,108,98,0,0
778245,B2_TTTGTTGGTGGACCAA-1,B2,devCRE,Tgfbi_chr13_5741,Ex. Endoderm (visceral),CTCAAGTTAGTAAGGG,CAGGGAACTGCCACC,75,68,19,1
778246,B2_TTTGTTGGTGGACCAA-1,B2,promoters,noP,Ex. Endoderm (visceral),"ACTTCTCGCCAAGGAA, GTTTCTTCGTCTGCCC","TCCCGCTGACACTTA, ATGTGGGTCGTCTAT",200,166,0,0


In [23]:
# Persist both tables as Parquet and re-open each through DataSet, which
# reads one column at a time from the file.
counts_parq_path = '../../../data/shendure_mpra_counts_GSE217686.parq'
grouped_parq_path = '../../../data/shendure_mpra_counts_grouped_GSE217686.parq'

# Ungrouped counts.
table = pa.Table.from_pandas(counts)
pq.write_table(table, counts_parq_path)
counts_parq = DataSet(counts_parq_path)

# Counts grouped per (cellBC, CRE_id).
table = pa.Table.from_pandas(counts_groupby_cre)
pq.write_table(table, grouped_parq_path)
counts_parq_grouped = DataSet(grouped_parq_path)

In [25]:
# Run configuration for a single model fit.
# NOTE(review): the data and scratch paths are absolute and user-specific.
scmpra_counts = DataSet('/home/eng26/project/scmpra/data/shendure_mpra_counts_grouped_GSE217686.parq')
formula = 'UMIs_mBC~C(cell_type_annotation)+C(rep_id)+C(CRE_id)'
maxiter = 10000
reg_fit = False  # recorded in the stats row; the visible fit call does not use it
temp_dir = '/home/eng26/palmer_scratch/scmpra_temp'
model_choice = 'zi_negative_binomial'

# Derive the output label from the settings above. It was previously
# hard-coded with a 'poisson' prefix even though model_choice is
# 'zi_negative_binomial', which mislabelled the saved fit and stats files.
# Layout is unchanged: <model_choice><maxiter>_<reg_fit>_<formula>.
out_file = '%s%s_%s_%s' % (model_choice, maxiter, reg_fit, formula)

# Maps each supported model_choice value to its model-builder function.
model_dict = {'poisson': poisson_model,
                'zi_poisson' : zi_poisson_model, 
                'negative_binomial' : negative_binomial_model,
                'zi_negative_binomial' : zi_negative_binomial_model}

initializing dataset


In [26]:
# Build the (unfitted) model selected by `model_choice`.
# The model used to be constructed twice: once unguarded, then again inside a
# bare `except:` that could never help, because the unguarded call had already
# raised or succeeded. It is now built once; on failure the message is printed
# and the original error is re-raised so later cells cannot run without a
# valid model.
try:
    scmpra_model = model_dict[model_choice](scmpra_counts, formula)
except Exception:
    print('Failed to build %s model' % model_choice)
    raise

In [27]:
# Fit the model and save the fitted results object next to its stats file.
scmpra_model_fit = scmpra_model.fit(maxiter=maxiter)
scmpra_model_fit.save("%s/%s_fit_model.pickle" % (temp_dir, out_file))

# Assemble the stats row here, before writing it. This cell used to write
# `out_list`, which is only defined in later cells, so a fresh-kernel
# top-to-bottom run failed with NameError.
model_info = [model_choice, formula, maxiter, reg_fit]
model_stats = get_stats(scmpra_model_fit)
out_list = model_info + model_stats

# One tab-separated line: run settings followed by the model statistics.
with open("%s/%s_stats.txt" % (temp_dir, out_file), "w") as stats_file:
    stats_file.write("\t".join(str(x) for x in out_list))



Optimization terminated successfully.
         Current function value: 0.730869
         Iterations: 206
         Function evaluations: 207
         Gradient evaluations: 207


In [None]:
# Run settings and fitted-model statistics that together form the stats row.
# NOTE(review): this cell has no execution count, yet the earlier fit/save
# cell writes `out_list`, which is built from these values.
model_info = [model_choice, formula, maxiter, reg_fit]
model_stats = get_stats(scmpra_model_fit)

In [None]:
# Full stats row: run settings followed by the model statistics.
# NOTE(review): the earlier fit/save cell reads `out_list`, so this
# definition comes too late in a top-to-bottom run.
out_list = model_info + model_stats