In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.discrete.count_model as cm
import statsmodels.discrete.count_model as smdc
import pyarrow.parquet as pq
import pyarrow as pa
import patsy

In [2]:

class DataSet(dict):
    """Dict-like, column-at-a-time view over a Parquet file.

    Nothing is stored in the underlying ``dict``; every lookup reads the
    requested column from the Parquet file on demand, so the whole table
    never has to be materialised at once.
    """

    def __init__(self, path):
        """Open the Parquet file at ``path`` for lazy column reads."""
        print('initializing dataset')
        self.filepath = path
        self.parquet = pq.ParquetFile(self.filepath)

    def __getitem__(self, key):
        """Read the column named ``key`` and return it via ``to_pandas()[key]``.

        Raises:
            KeyError: if the column cannot be read. The key is included in
                the exception and the underlying error is chained as its
                cause. Callers doing dict-style lookup (presumably the
                formula machinery this object is passed to) can rely on
                ``KeyError`` for "no such variable".
        """
        try:
            return self.parquet.read([key]).to_pandas()[key]
        except Exception as exc:
            # Catch Exception rather than using a bare ``except`` so that
            # KeyboardInterrupt / SystemExit are not turned into KeyError.
            raise KeyError(key) from exc

    def __reduce__(self):
        """Pickle as (class, (filepath,)) so a copy re-opens the file."""
        return (self.__class__, (self.filepath, ))

In [3]:
# Load the synthetic CRE counts and discard the "Unnamed: 0" column
# (presumably an index written out when the CSV was saved).
fake_cres = pd.read_csv("fake_cres.csv").drop(columns="Unnamed: 0")

In [4]:
# Show the loaded frame as the cell's output.
fake_cres


Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,1,0
1,nobody,brain,1,0
2,nobody,brain,1,0
3,nobody,brain,1,0
4,nobody,brain,1,0
...,...,...,...,...
14307,neurogene,blood,3,7
14308,neurogene,blood,3,26
14309,neurogene,blood,3,7
14310,neurogene,blood,3,15


In [5]:
# Round-trip the frame through Parquet so the model can read columns lazily
# through the DataSet wrapper.
table = pa.Table.from_pandas(df=fake_cres)
pq.write_table(table, where='fake_cres.parq')
counts = DataSet(path='fake_cres.parq')

initializing dataset


In [6]:
# Check the Python type of the first entry in each categorical column.
print(*(type(fake_cres[name][0]) for name in ("Cell_type", "CRE")), sep="\n")

<class 'str'>
<class 'str'>


In [7]:
# Distinct CRE labels, in order of first appearance.
fake_cres.CRE.unique()

array(['nobody', 'somebody', 'everybody', 'redgene', 'neurogene'],
      dtype=object)

In [8]:
# Re-export the current frame to Parquet and rebuild the lazy DataSet view
# (same steps as the earlier Parquet round-trip cell).
table = pa.Table.from_pandas(df=fake_cres)
pq.write_table(table, where='fake_cres.parq')
counts = DataSet(path='fake_cres.parq')

initializing dataset


In [None]:

# Zero-inflated negative binomial (NB2, p=2):
#   count part     -> additive CRE and Cell_type effects, no intercept ("+ 0")
#   inflation part -> the replicate_ID column taken directly from fake_cres
simple_formula = "umi_count ~ C(CRE) + C(Cell_type) + 0"
zinb_model = smdc.ZeroInflatedNegativeBinomialP.from_formula(
    formula=simple_formula,
    data=counts,
    exog_infl=fake_cres[['replicate_ID']],
    p=2,
)

# Fit with BFGS, capped at 200 iterations, and show the coefficient table.
zinb_result = zinb_model.fit(method="bfgs", maxiter=200)
zinb_result.summary()

Optimization terminated successfully.
         Current function value: 1.617621
         Iterations: 64
         Function evaluations: 67
         Gradient evaluations: 67


0,1,2,3
Dep. Variable:,umi_count,No. Observations:,14312.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,14306.0
Method:,MLE,Df Model:,5.0
Date:,"Thu, 20 Feb 2025",Pseudo R-squ.:,0.08366
Time:,10:44:45,Log-Likelihood:,-23151.0
converged:,True,LL-Null:,-25265.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_replicate_ID,0.4829,0.010,48.821,0.000,0.464,0.502
C(CRE)[everybody],4.6606,0.032,147.048,0.000,4.598,4.723
C(CRE)[neurogene],4.0148,0.038,105.834,0.000,3.940,4.089
C(CRE)[nobody],0.3315,0.047,7.052,0.000,0.239,0.424
C(CRE)[redgene],4.2443,0.028,149.775,0.000,4.189,4.300
C(CRE)[somebody],2.4020,0.033,73.413,0.000,2.338,2.466
C(Cell_type)[T.brain],-0.0046,0.030,-0.151,0.880,-0.064,0.055
alpha,0.5722,0.015,37.244,0.000,0.542,0.602


In [25]:
# Relabel replicate IDs 1/2/3 as "rep1"/"rep2"/"rep3" so they are treated as
# categories (e.g. by get_dummies) rather than as a number.
# ``replace`` leaves values that are not keys untouched, so re-running this
# cell is a no-op; ``map`` would turn the already-relabelled strings into NaN.
fake_cres["replicate_ID"] = fake_cres["replicate_ID"].replace(
    {1: "rep1", 2: "rep2", 3: "rep3"}
)

In [36]:
# Preview the replicate indicators with the first level dropped as baseline.
pd.get_dummies(
    data=fake_cres[['replicate_ID']],
    drop_first=True,
)


Unnamed: 0,replicate_ID_rep2,replicate_ID_rep3
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
14307,False,True
14308,False,True
14309,False,True
14310,False,True


In [52]:

# ZINB (NB2, p=2) with main effects and the CRE x Cell_type interaction in the
# count part; the inflation part gets one indicator column per replicate level.
simple_formula = "umi_count ~ C(CRE) * C(Cell_type)"
zinb_model = smdc.ZeroInflatedNegativeBinomialP.from_formula(
    formula=simple_formula,
    data=counts,
    exog_infl=pd.get_dummies(fake_cres[['replicate_ID']]),
    p=2,
)

# Build an explicit start vector: one entry per count-model column, one per
# inflation-model column, plus one for the dispersion parameter alpha.
n_count_params = zinb_model.exog.shape[1]
n_infl_params = zinb_model.exog_infl.shape[1]
n_total = n_count_params + n_infl_params + 1
start_params = np.full(shape=n_total, fill_value=0.1)

# Fit with BFGS from the flat start vector and show the coefficient table.
zinb_result = zinb_model.fit(
    start_params=start_params,
    method="bfgs",
    maxiter=200,
)
zinb_result.summary()

Optimization terminated successfully.
         Current function value: 1.483284
         Iterations: 85
         Function evaluations: 86
         Gradient evaluations: 86


0,1,2,3
Dep. Variable:,umi_count,No. Observations:,14312.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,14302.0
Method:,MLE,Df Model:,9.0
Date:,"Thu, 20 Feb 2025",Pseudo R-squ.:,0.1598
Time:,11:35:55,Log-Likelihood:,-21229.0
converged:,True,LL-Null:,-25265.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_replicate_ID_rep1,1.3553,0.037,36.246,0.000,1.282,1.429
inflate_replicate_ID_rep2,0.0281,0.031,0.917,0.359,-0.032,0.088
inflate_replicate_ID_rep3,2.2443,0.051,44.084,0.000,2.145,2.344
Intercept,4.6399,0.028,167.432,0.000,4.586,4.694
C(CRE)[T.neurogene],-1.8991,0.041,-46.296,0.000,-1.980,-1.819
C(CRE)[T.nobody],-3.9739,0.059,-67.697,0.000,-4.089,-3.859
C(CRE)[T.redgene],0.0196,0.039,0.505,0.613,-0.056,0.096
C(CRE)[T.somebody],-2.1146,0.041,-51.352,0.000,-2.195,-2.034
C(Cell_type)[T.brain],0.0398,0.040,0.996,0.319,-0.038,0.118


In [12]:
# Show the frame as the cell's output.
# NOTE(review): this cell's execution count (12) is lower than that of the
# replicate relabelling cell (25), so the saved output presumably predates the
# relabelling; re-run top to bottom to refresh it.
fake_cres

Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,1,0
1,nobody,brain,1,0
2,nobody,brain,1,0
3,nobody,brain,1,0
4,nobody,brain,1,0
...,...,...,...,...
14307,neurogene,blood,3,7
14308,neurogene,blood,3,26
14309,neurogene,blood,3,7
14310,neurogene,blood,3,15


In [26]:
# Per (Cell_type, CRE) group: total UMIs, number of rows, and mean UMI count.
fake_cres_agg = fake_cres.groupby(['Cell_type', 'CRE']).agg(
    Sum=pd.NamedAgg(column='umi_count', aggfunc='sum'),
    Size=pd.NamedAgg(column='umi_count', aggfunc='size'),
    Mean=pd.NamedAgg(column='umi_count', aggfunc='mean'),
)

In [35]:
# Order the groups from fewest to most rows.
fake_cres_agg.sort_values('Size')

Unnamed: 0_level_0,Unnamed: 1_level_0,Sum,Size,Mean
Cell_type,CRE,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
brain,nobody,377,1324,0.284743
brain,redgene,11376,1353,8.407982
brain,somebody,3429,1355,2.530627
brain,everybody,38676,1371,28.210066
brain,neurogene,36111,1378,26.20537
blood,neurogene,5999,1480,4.053378
blood,nobody,771,1502,0.513316
blood,redgene,42549,1511,28.159497
blood,everybody,39964,1514,26.396301
blood,somebody,5011,1524,3.288058


In [28]:
# (Cell_type, CRE) tuples taken from the aggregate's MultiIndex.
celltype_creid_pairs = fake_cres_agg.index.tolist()

In [29]:
# Rows belonging to the first (Cell_type, CRE) pair.
# Both conditions are combined into one mask and applied in a single .loc
# call. Chaining two [] selections applied a full-length mask to an
# already-filtered frame, which pandas has to reindex and warns about.
first_cell_type, first_cre = celltype_creid_pairs[0]
fake_cres_one_iter = fake_cres.loc[
    (fake_cres.Cell_type == first_cell_type) & (fake_cres.CRE == first_cre)
]

  fake_cres_one_iter = fake_cres[fake_cres.Cell_type == celltype_creid_pairs[0][0]][fake_cres.CRE == celltype_creid_pairs[0][1]]


In [36]:
# Show the single-pair subset as the cell's output.
fake_cres_one_iter

Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
3266,everybody,blood,rep1,0
3267,everybody,blood,rep1,0
3268,everybody,blood,rep1,0
3269,everybody,blood,rep1,0
3270,everybody,blood,rep1,0
...,...,...,...,...
13294,everybody,blood,rep3,49
13295,everybody,blood,rep3,79
13296,everybody,blood,rep3,126
13297,everybody,blood,rep3,76


In [33]:
# Same ZINB specification, fitted on the single (Cell_type, CRE) subset.
# NOTE(review): the subset was filtered to one Cell_type and one CRE, so the
# categorical terms presumably contribute nothing beyond the intercept - confirm
# this is the intended per-pair model.
simple_formula = "umi_count ~ C(CRE) * C(Cell_type)"
zinb_model = smdc.ZeroInflatedNegativeBinomialP.from_formula(
    formula=simple_formula,
    data=fake_cres_one_iter,
    exog_infl=pd.get_dummies(fake_cres_one_iter[['replicate_ID']]),
    p=2,
)

# Start vector: count-model columns + inflation-model columns + 1 for alpha.
n_count_params = zinb_model.exog.shape[1]
n_infl_params = zinb_model.exog_infl.shape[1]
n_total = n_count_params + n_infl_params + 1
start_params = np.full(shape=n_total, fill_value=0.1)

# Fit with BFGS from the flat start vector and show the coefficient table.
zinb_result = zinb_model.fit(
    start_params=start_params,
    method="bfgs",
    maxiter=200,
)
zinb_result.summary()

Optimization terminated successfully.
         Current function value: 1.855027
         Iterations: 36
         Function evaluations: 37
         Gradient evaluations: 37


  a1 * np.log(a1) + y * np.log(mu) -


0,1,2,3
Dep. Variable:,umi_count,No. Observations:,1514.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,1513.0
Method:,MLE,Df Model:,0.0
Date:,"Tue, 25 Feb 2025",Pseudo R-squ.:,0.03996
Time:,10:51:25,Log-Likelihood:,-2808.5
converged:,True,LL-Null:,-2925.4
Covariance Type:,nonrobust,LLR p-value:,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_replicate_ID_rep1,1.4013,0.112,12.490,0.000,1.181,1.621
inflate_replicate_ID_rep2,0.0441,0.090,0.493,0.622,-0.131,0.220
inflate_replicate_ID_rep3,2.3937,0.159,15.025,0.000,2.081,2.706
Intercept,4.6399,0.028,166.543,0.000,4.585,4.695
alpha,0.2899,0.021,13.944,0.000,0.249,0.331


In [23]:
# Show the single-pair subset as the cell's output.
# NOTE(review): this cell's execution count (23) is lower than that of the
# replicate relabelling cell (25), so the saved output presumably predates the
# relabelling; re-run top to bottom to refresh it.
fake_cres_one_iter

Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
3266,everybody,blood,1,0
3267,everybody,blood,1,0
3268,everybody,blood,1,0
3269,everybody,blood,1,0
3270,everybody,blood,1,0
...,...,...,...,...
13294,everybody,blood,3,49
13295,everybody,blood,3,79
13296,everybody,blood,3,126
13297,everybody,blood,3,76
