In [72]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.discrete.count_model as cm
import statsmodels.discrete.count_model as smdc
import pyarrow.parquet as pq
import pyarrow as pa

In [73]:

class DataSet(dict):
    """Lazy, column-at-a-time view over a Parquet file.

    Subscripting with a column name reads only that column from the file and
    returns it via ``to_pandas()``.  No items are ever stored in the
    underlying ``dict`` itself, so ``len()`` and ``in`` do not reflect the
    file's columns.  In this notebook the instance is passed as the ``data``
    argument of ``from_formula`` (presumably the reason it subclasses
    ``dict`` — confirm).

    Parameters
    ----------
    path : str
        Location of the Parquet file to open.

    Attributes
    ----------
    filepath
        The ``path`` the instance was created with.
    parquet
        The open ``pq.ParquetFile`` handle for ``filepath``.
    """

    def __init__(self, path):
        super().__init__()
        print('initializing dataset')
        self.filepath = path
        self.parquet = pq.ParquetFile(self.filepath)

    def __getitem__(self, key):
        """Read column ``key`` from the Parquet file.

        Raises
        ------
        KeyError
            If the column cannot be read for any reason.  The key is attached
            to the error and the underlying failure is chained as its cause.
        """
        try:
            return self.parquet.read([key]).to_pandas()[key]
        except Exception as err:
            # Every ordinary failure is still reported as KeyError, as before,
            # so callers that probe with ``except KeyError`` keep working.
            # Catching ``Exception`` instead of using a bare ``except`` lets
            # KeyboardInterrupt / SystemExit propagate untouched.
            raise KeyError(key) from err

    def __reduce__(self):
        """Pickle as ``(class, (filepath,))``.

        Unpickling therefore calls the constructor again with the stored
        path, reopening the file rather than serialising the open handle.
        """
        return (self.__class__, (self.filepath, ))

In [74]:
# Load the count table from CSV and discard the "Unnamed: 0" column
# (presumably a row index saved by an earlier to_csv call — confirm).
fake_cres = pd.read_csv("fake_cres.csv").drop(columns="Unnamed: 0")

In [75]:
# Display the loaded frame; the recorded output shows the columns
# CRE, Cell_type, replicate_ID and umi_count.
# NOTE(review): this renders the whole frame — `.head()` would keep the output short.
fake_cres


Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,1,0
1,nobody,brain,1,0
2,nobody,brain,1,0
3,nobody,brain,1,0
4,nobody,brain,1,0
...,...,...,...,...
14307,neurogene,blood,3,7
14308,neurogene,blood,3,26
14309,neurogene,blood,3,7
14310,neurogene,blood,3,15


In [76]:
# Round-trip the frame through Parquet, then open the file through DataSet
# so that columns are read from disk by name.
table = pa.Table.from_pandas(fake_cres)
pq.write_table(table, where='fake_cres.parq')
counts = DataSet(path='fake_cres.parq')

initializing dataset


In [77]:
# Print the Python type of the first value in each of the two label columns.
for column in ("Cell_type", "CRE"):
    print(type(fake_cres[column][0]))

<class 'str'>
<class 'str'>


In [78]:
# List the distinct CRE labels present in the data.
fake_cres['CRE'].unique()

array(['nobody', 'somebody', 'everybody', 'redgene', 'neurogene'],
      dtype=object)

In [79]:
# Write the frame to Parquet and reopen it through DataSet.
# NOTE(review): this repeats the same three steps as the earlier
# write/load cell in this notebook; unless `fake_cres` changed in between,
# one of the two cells can probably be removed.
table = pa.Table.from_pandas(fake_cres)
pq.write_table(table, 'fake_cres.parq')
counts = DataSet('fake_cres.parq')

initializing dataset


In [84]:

# Additive zero-inflated negative binomial fit.
#   count part:     CRE and cell type as categorical factors; "+ 0" drops the intercept
#   inflation part: the single replicate_ID column, without a constant
# NOTE(review): the count-part columns come from `counts` while exog_infl
# comes from `fake_cres`; this relies on both having the same row order — confirm.
simple_formula = "umi_count ~ C(CRE) + C(Cell_type) + 0"
zinb_model = smdc.ZeroInflatedNegativeBinomialP.from_formula(
    formula=simple_formula,
    data=counts,
    exog_infl=fake_cres[['replicate_ID']],
    p=2,  # NB-2 parameterisation (see the statsmodels documentation)
)

zinb_result = zinb_model.fit(method="bfgs", maxiter=200)
zinb_result.summary()

Optimization terminated successfully.
         Current function value: 1.617621
         Iterations: 64
         Function evaluations: 67
         Gradient evaluations: 67


0,1,2,3
Dep. Variable:,umi_count,No. Observations:,14312.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,14306.0
Method:,MLE,Df Model:,5.0
Date:,"Tue, 18 Feb 2025",Pseudo R-squ.:,0.08366
Time:,16:09:20,Log-Likelihood:,-23151.0
converged:,True,LL-Null:,-25265.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_replicate_ID,0.4829,0.010,48.821,0.000,0.464,0.502
C(CRE)[everybody],4.6606,0.032,147.048,0.000,4.598,4.723
C(CRE)[neurogene],4.0148,0.038,105.834,0.000,3.940,4.089
C(CRE)[nobody],0.3315,0.047,7.052,0.000,0.239,0.424
C(CRE)[redgene],4.2443,0.028,149.775,0.000,4.189,4.300
C(CRE)[somebody],2.4020,0.033,73.413,0.000,2.338,2.466
C(Cell_type)[T.brain],-0.0046,0.030,-0.151,0.880,-0.064,0.055
alpha,0.5722,0.015,37.244,0.000,0.542,0.602


In [106]:
# Recode replicate_ID as a pandas categorical, in place on `fake_cres`.
# A bare `fake_cres` expression used to sit at the top of this cell; it was
# not the cell's last expression, so it displayed nothing, and it has been dropped.
# NOTE(review): this mutates the frame that earlier cells displayed and that
# the Parquet file was written from; the file itself is not rewritten here.
fake_cres['replicate_ID'] = pd.Categorical(fake_cres['replicate_ID'])

In [125]:

# Interaction zero-inflated negative binomial fit.
#   count part:     CRE x cell type (main effects plus interaction); "+ 0" drops the intercept
#   inflation part: a constant plus the replicate_ID column
# NOTE(review): the summary recorded for this cell lists a single
# `inflate_replicate_ID` slope, so replicate_ID appears to enter as one
# numeric column even if it was recoded as a categorical beforehand.
# Dummy-code it explicitly if per-replicate inflation effects are intended.
# NOTE(review): the count-part columns come from `counts` while exog_infl
# comes from `fake_cres`; this relies on both having the same row order — confirm.
simple_formula = "umi_count ~ C(CRE) * C(Cell_type) + 0"
zinb_model = smdc.ZeroInflatedNegativeBinomialP.from_formula(
    formula=simple_formula,
    data=counts,
    exog_infl=sm.add_constant(fake_cres[['replicate_ID']]),
    p=2,  # NB-2 parameterisation (see the statsmodels documentation)
)

zinb_result = zinb_model.fit(method="bfgs", maxiter=200)
zinb_result.summary()

Optimization terminated successfully.
         Current function value: 1.545484
         Iterations: 67
         Function evaluations: 71
         Gradient evaluations: 71


0,1,2,3
Dep. Variable:,umi_count,No. Observations:,14312.0
Model:,ZeroInflatedNegativeBinomialP,Df Residuals:,14302.0
Method:,MLE,Df Model:,9.0
Date:,"Tue, 18 Feb 2025",Pseudo R-squ.:,0.1245
Time:,16:46:19,Log-Likelihood:,-22119.0
converged:,True,LL-Null:,-25265.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
inflate_const,0.4617,0.051,9.029,0.000,0.361,0.562
inflate_replicate_ID,0.2863,0.025,11.674,0.000,0.238,0.334
C(CRE)[everybody],4.6402,0.028,167.414,0.000,4.586,4.695
C(CRE)[neurogene],2.7407,0.030,90.609,0.000,2.681,2.800
C(CRE)[nobody],0.6662,0.052,12.769,0.000,0.564,0.768
C(CRE)[redgene],4.6595,0.027,171.846,0.000,4.606,4.713
C(CRE)[somebody],2.5252,0.030,82.917,0.000,2.466,2.585
C(Cell_type)[T.brain],0.0391,0.040,0.979,0.328,-0.039,0.117
C(CRE)[T.neurogene]:C(Cell_type)[T.brain],1.7878,0.057,31.121,0.000,1.675,1.900


In [113]:
# Display the frame again to inspect its current contents.
# NOTE(review): this renders the whole frame — `.head()` would keep the output short.
fake_cres

Unnamed: 0,CRE,Cell_type,replicate_ID,umi_count
0,nobody,brain,1,0
1,nobody,brain,1,0
2,nobody,brain,1,0
3,nobody,brain,1,0
4,nobody,brain,1,0
...,...,...,...,...
14307,neurogene,blood,3,7
14308,neurogene,blood,3,26
14309,neurogene,blood,3,7
14310,neurogene,blood,3,15


In [116]:
# Wrap replicate_ID as a categorical inside a one-column DataFrame to see how
# it renders (the recorded output shows a single column labelled 0).
pd.DataFrame(pd.Categorical(fake_cres['replicate_ID']))

Unnamed: 0,0
0,1
1,1
2,1
3,1
4,1
...,...
14307,3
14308,3
14309,3
14310,3


In [93]:
# Read the whole Parquet file as a pyarrow Table to inspect the stored schema
# (the recorded output shows replicate_ID and umi_count stored as int64).
counts.parquet.read()

pyarrow.Table
CRE: string
Cell_type: string
replicate_ID: int64
umi_count: int64
----
CRE: [["nobody","nobody","nobody","nobody","nobody",...,"neurogene","neurogene","neurogene","neurogene","neurogene"]]
Cell_type: [["brain","brain","brain","brain","brain",...,"blood","blood","blood","blood","blood"]]
replicate_ID: [[1,1,1,1,1,...,3,3,3,3,3]]
umi_count: [[0,0,0,0,0,...,7,26,7,15,38]]

In [94]:
# Read replicate_ID back through the Parquet-backed dataset and view it as a categorical.
pd.Categorical(counts['replicate_ID'])

[1, 1, 1, 1, 1, ..., 3, 3, 3, 3, 3]
Length: 14312
Categories (3, int64): [1, 2, 3]

In [88]:
# Show the one-column DataFrame (double brackets) that the model cells pass as exog_infl.
fake_cres[['replicate_ID']]

Unnamed: 0,replicate_ID
0,1
1,1
2,1
3,1
4,1
...,...
14307,3
14308,3
14309,3
14310,3


https://stats.stackexchange.com/questions/284911/type-i-and-type-ii-negative-binomial-distribution-in-zero-inflated-negative-bino

https://www.statsmodels.org/stable/generated/statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.html

https://www.statsmodels.org/dev/generated/statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.from_formula.html#statsmodels.discrete.count_model.ZeroInflatedNegativeBinomialP.from_formula

In [58]:
# Alternative summary layout for the most recent fit; it also reports AIC and BIC.
# NOTE(review): the output recorded for this cell (execution count 58) lists
# Intercept and C(replicate_ID) terms that neither model cell in this notebook
# specifies, so the saved output looks stale — re-run after fitting.
zinb_result.summary2()

0,1,2,3
Model:,ZeroInflatedNegativeBinomialP,Pseudo R-squared:,0.018
Dependent Variable:,umi_count,AIC:,49640.9973
Date:,2025-02-18 15:53,BIC:,49716.6859
No. Observations:,14312,Log-Likelihood:,-24810.0
Df Model:,7,LL-Null:,-25265.0
Df Residuals:,14304,LLR p-value:,5.1554e-192
Converged:,1.0000,Scale:,1.0
Method:,MLE,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
inflate_replicate_ID,-1.6020,0.1450,-11.0463,0.0000,-1.8862,-1.3177
Intercept,3.2295,0.0951,33.9516,0.0000,3.0430,3.4159
C(Cell_type)[T.brain],-0.0392,0.0702,-0.5574,0.5773,-0.1768,0.0985
C(replicate_ID)[T.2],0.7169,0.0801,8.9504,0.0000,0.5599,0.8739
C(replicate_ID)[T.3],-0.9076,0.0821,-11.0607,0.0000,-1.0684,-0.7468
C(CRE)[T.neurogene],-0.5752,0.1007,-5.7098,0.0000,-0.7726,-0.3777
C(CRE)[T.nobody],-4.1807,0.1039,-40.2537,0.0000,-4.3842,-3.9771
C(CRE)[T.redgene],-0.3457,0.0993,-3.4800,0.0005,-0.5403,-0.1510
C(CRE)[T.somebody],-2.1807,0.0980,-22.2498,0.0000,-2.3728,-1.9886
