In [1]:
# Built-in Python
from pathlib import Path

# Third Part Libraries
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd
import numpy as np

In [2]:
# Load paired parquet files
parquets = list(Path('parquet-paired').glob('*.parquet'))

In [3]:
# Read All Parquets
dfs = []
for parquet in parquets[:]:
    print(parquet)
    # Read parquet into a dataframe
    _df = pd.read_parquet(parquet)
    # Add dataframe to the list of dataframes
    dfs.append(_df)
# Concatenate dataframes together as 1 dataframe
df = pd.concat(dfs)
df = df.replace(np.nan, None)

parquet-paired/SRR1585274.parquet
parquet-paired/1287148.parquet
parquet-paired/1287158.parquet
parquet-paired/1287151.parquet
parquet-paired/1279073.parquet
parquet-paired/1287150.parquet
parquet-paired/SRR1585265.parquet
parquet-paired/SRR1585275.parquet
parquet-paired/1287159.parquet
parquet-paired/1287149.parquet
parquet-paired/1287152.parquet
parquet-paired/SRR1585267.parquet
parquet-paired/SRR1585248.parquet
parquet-paired/1279068.parquet
parquet-paired/SRR10358523.parquet
parquet-paired/SRR1585249.parquet
parquet-paired/1287153.parquet
parquet-paired/1279075.parquet
parquet-paired/1279065.parquet
parquet-paired/1287156.parquet
parquet-paired/1287146.parquet
parquet-paired/1279074.parquet
parquet-paired/1287147.parquet
parquet-paired/1287157.parquet
parquet-paired/SRR10358525.parquet
parquet-paired/D326651.parquet
parquet-paired/1287155.parquet
parquet-paired/1287145.parquet
parquet-paired/1279076.parquet
parquet-paired/1279066.parquet
parquet-paired/1287144.parquet
parquet-paire

In [4]:
# displays constant region call counts for all paired BCR database 
df['c_call_heavy'].value_counts(dropna=False).reset_index(name='count').rename(columns={'index': 'c_call_heavy'})

Unnamed: 0,c_call_heavy,count
0,IGHM*01,329645
1,,134368
2,IGHA1*01,20074
3,IGHG1*01,17162
4,IGHG2*01,11117
5,IGHA2*01,8477
6,IGHG3*01,6693
7,IGHG4*01,1920
8,"IGHG1*01,IGHG2*01",1349
9,"IGHA1*01,IGHA2*01",209


In [5]:
# Adds IGHM*01 c_call_heavy to DeKosky dataset
df.loc[df['author'] == "DeKosky", 'c_call_heavy'] = "IGHM*01"

In [6]:
# creates naive paired BCR database
naive_df = df[
    df['c_call_heavy'] == "IGHM*01"
]

In [7]:
# displays constant region call counts for all paired BCR database 
df['c_call_heavy'].value_counts(dropna=False).reset_index(name='count').rename(columns={'index': 'c_call_heavy'})

Unnamed: 0,c_call_heavy,count
0,IGHM*01,457346
1,IGHA1*01,20074
2,IGHG1*01,17162
3,IGHG2*01,11117
4,IGHA2*01,8477
5,IGHG3*01,6693
6,,6667
7,IGHG4*01,1920
8,"IGHG1*01,IGHG2*01",1349
9,"IGHA1*01,IGHA2*01",209


In [8]:
# displays constant region call counts for naive paired BCR database 
naive_df['c_call_heavy'].value_counts(dropna=False).reset_index(name='count').rename(columns={'index': 'c_call_heavy'})

Unnamed: 0,c_call_heavy,count
0,IGHM*01,457346


In [9]:
# writes out naive paired BCR database as a csv file
naive_df.to_csv("naive_pairs.csv", index=False)

In [10]:
# writes out all paired BCR database as a csv file
df.to_csv("all_pairs.csv", index=False)

In [11]:
filtered_df = naive_df[
    naive_df['v_call_top_heavy'].str.contains("IGHV1-46", na=False) &
    naive_df['v_call_top_light'].str.contains("IGKV3-20", na=False)
]

In [12]:
# writes out filtered database
filtered_df.to_csv("filtered_pairs.csv", index=False)