In [3]:
import pandas as pd

qc = pd.read_excel("mmc1-QC.xlsx")
print(qc.head())


        SampleName       CellType ImmGenLab    Lineage      CellFamily  \
0   LTHSC.34-.BM#1   LTHSC.34-.BM    Wagers  Stem&Prog  Stem&Prog Cell   
1   LTHSC.34+.BM#1   LTHSC.34+.BM    Wagers  Stem&Prog  Stem&Prog Cell   
2  STHSC.150-.BM#1  STHSC.150-.BM    Wagers  Stem&Prog  Stem&Prog Cell   
3   MPP4.135+.BM#1   MPP4.135+.BM    Wagers  Stem&Prog  Stem&Prog Cell   
4    proB.CLP.BM#1    proB.CLP.BM     Hardy          B          B Cell   

         Organ                                     SortingMarkers  \
0  Bone Marrow               Lin-Sca1+ckit+CD135-CD150+CD48-CD34-   
1  Bone Marrow               Lin-Sca1+ckit+CD135-CD150+CD48-CD34+   
2  Bone Marrow                    Lin-Sca1+ckit+CD135-CD150-CD48-   
3  Bone Marrow                               Lin-Sca1+ckit+CD135+   
4  Bone Marrow  CD19-IgM-CD43+CD24-CD45R-CD93+CD117+IL7Ra+PI-D...   

   InputCellNumber  PF.reads  %chrM.mapped  \
0              677  22287984          4.58   
1             2483  28588536          3.17   
2 

In [4]:
import pandas as pd

# 1. Read both sheets
path = "mmc1-QC.xlsx"
sp = pd.read_excel(path, sheet_name="SortedPopulations")
rs = pd.read_excel(path, sheet_name="Read Statistics")

# 2. Filter SortedPopulations to abT & T.act
sp_filtered = sp[sp["Lineage"].isin(["abT", "T.act"])].copy()

# 3. Keep only these specified cell‐type rows in Read Statistics
keep_names = [
    "preT.DN1.Th", "preT.DN2a.Th", "preT.DN2b.Th", "preT.DN3.Th",
    "T.DN4.Th", "T.ISP.Th", "T.DP.Th", "T.4.Th",   "T.8.Th",
    "T.4.Nve.Sp", "T.4.Nve.Fem.Sp", "T.8.Nve.Sp",
    "T.4.Sp.aCD3+CD40.18hr",
    "Treg.4.FP3+.Nrplo.Co", "Treg.4.25hi.Sp",
    "T8.TN.P14.Sp", "T8.IEL.LCMV.d7.SI", "T8.TE.LCMV.d7.Sp",
    "T8.MP.LCMV.d7.Sp", "T8.Tcm.LCMV.d180.Sp", "T8.Tem.LCMV.d180.Sp",
    "NKT.Sp", "NKT.Sp.LPS.3hr", "NKT.Sp.LPS.18hr", "NKT.Sp.LPS.3d"
]
rs_filtered = rs[ rs["population.name"].isin(keep_names) ].copy()


# 4. (Optional) report counts
print(f"SortedPopulations: {len(sp_filtered)}/{len(sp)} rows kept")
print(f"Read Statistics:   {len(rs_filtered)}/{len(rs)} rows kept")

# 5. Write out to CSV for easy analysis
sp_filtered.to_csv("SortedPopulations_abT-Tact.csv", index=False)
rs_filtered.to_csv("ReadStatistics_abT-Tact.csv",    index=False)

print("Filtered CSVs written:")
print(" - SortedPopulations_abT-Tact.csv")
print(" - ReadStatistics_abT-Tact.csv")


SortedPopulations: 50/181 rows kept
Read Statistics:   53/169 rows kept
Filtered CSVs written:
 - SortedPopulations_abT-Tact.csv
 - ReadStatistics_abT-Tact.csv


##### Now that we only have the rows relevant for abT and T.act cells, I want to have a better understanding of the metrics in the datasets. 

In [5]:
import pandas as pd

# Define your file paths
files = {
    "SortedPopulations": "SortedPopulations_abT-Tact.csv",
    "ReadStatistics":   "ReadStatistics_abT-Tact.csv"
}

for name, path in files.items():
    df = pd.read_csv(path)
    print(f"=== {name} ({path}) columns ===")
    print(df.columns.tolist(), "\n")
    
    # Optional: show dtypes & non-null counts
    print(f"--- {name} info ---")
    df.info()
    print("\n")


=== SortedPopulations (SortedPopulations_abT-Tact.csv) columns ===
['SampleName', 'CellType', 'ImmGenLab', 'Lineage', 'CellFamily', 'Organ', 'SortingMarkers', 'InputCellNumber', 'PF.reads', '%chrM.mapped', 'Paired.read.after.removing.PCR.duplication', '%fragment.1Kb_TSS', 'Replicate.cor'] 

--- SortedPopulations info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 13 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   SampleName                                  50 non-null     object 
 1   CellType                                    50 non-null     object 
 2   ImmGenLab                                   50 non-null     object 
 3   Lineage                                     50 non-null     object 
 4   CellFamily                                  50 non-null     object 
 5   Organ                                       50 non-null   

In [11]:
import pandas as pd

basic_stats_df = pd.read_csv("../Descriptive_Stat_ATAC/basic_stats_df.csv")

basic_stats_df

Unnamed: 0.1,Unnamed: 0,Sample ID,Mean,Median,SD,CV
0,0,preT.DN1.Th,4.006249,1.34,9.89646,2.470256
1,1,preT.DN2a.Th,4.009777,1.2,9.849384,2.456342
2,2,preT.DN2b.Th,4.001368,1.34,9.83158,2.457055
3,3,preT.DN3.Th,3.999144,1.4,9.830203,2.458077
4,4,T.DN4.Th,4.000097,1.47,9.8199,2.454915
5,5,T.ISP.Th,3.990197,1.54,9.660167,2.420975
6,6,T.DP.Th,4.002725,1.51,9.797414,2.447686
7,7,T.4.Th,3.984623,1.45,9.828451,2.466595
8,8,T.8.Th,3.988538,1.48,9.886826,2.47881
9,9,T.4.Nve.Sp,3.996412,1.37,9.819407,2.457056
