In [None]:
import pandas
import re
import tempfile
import os
import gzip
import shutil
import subprocess
from pyspark.sql.functions import udf, col, lit
from pyspark.sql import functions as F

"""
Import Parquet As a DataFrame
"""

# Read the Parquet dataset from the public S3 bucket into a Spark DataFrame.
# NOTE(review): `spark` is not created in this notebook; presumably the
# SparkSession is provided by the kernel/environment -- confirm.
parquet_s3 = "s3://macaquenaive/parquet/"
df_spark = spark.read.parquet(parquet_s3)

# Verify the load: the row count is the value displayed by this cell.
df_spark.count()

## Make a query class
The query class can hold our Spark query until it's time to execute.

In [None]:
class Query():

    '''An example query class to hold query parameters.

    Only ``length`` is mandatory. Every other filter is optional and is
    skipped when left at its empty-string default.
    '''

    def __init__(self,q_name,length='',d_call_top="",regex="",animal_id="",c_call="",file_name=""):
        '''Store the parameters of one search.

        q_name     -- label for the query, stored as ``query_name``
        length     -- threshold on the length of ``cdr3_aa``; rows are kept
                      when the length is strictly greater than this value
        d_call_top -- regular expression matched against ``d_call_top``
        regex      -- regular expression matched against ``cdr3_aa``
        animal_id  -- exact value required in ``animal_id``
        c_call     -- exact value required in ``c_call``
        file_name  -- regular expression matched against ``file_name``

        Raises ValueError (an Exception subclass) when length is not supplied.
        '''
        # Check for "not supplied" explicitly: a plain truthiness test would
        # also reject 0, which is a perfectly valid threshold.
        if length is None or length == '':
            raise ValueError("Length must be supplied")

        self.query_name = q_name
        self.length = length
        self.d_call_top = d_call_top
        self.regular_expression = regex
        self.animal_id = animal_id
        self.c_call = c_call
        self.file_name = file_name

    def apply(self,df):

        '''Apply the stored query parameters to a Spark dataframe.

        The length filter is always applied; the remaining filters are added
        only if they were specified. The filtered dataframe is also kept on
        ``self.queried_dataframe`` and the number of matching rows is printed
        (the count is computed on the filtered dataframe before returning).

        Returns the filtered dataframe.
        '''
        # Strictly greater than: length=21 keeps CDR3s of 22 residues or more.
        filtered = df.filter(F.length(df.cdr3_aa) > self.length)

        # Regex (rlike) filters
        if self.d_call_top:
            filtered = filtered.filter(filtered.d_call_top.rlike(self.d_call_top))

        # Exact-match filter
        if self.c_call:
            filtered = filtered.filter(filtered.c_call == self.c_call)

        if self.regular_expression:
            filtered = filtered.filter(filtered.cdr3_aa.rlike(self.regular_expression))

        if self.animal_id:
            filtered = filtered.filter(filtered.animal_id == self.animal_id)

        if self.file_name:
            filtered = filtered.filter(filtered.file_name.rlike(self.file_name))

        self.queried_dataframe = filtered
        print("Found {} sequences".format(filtered.count()))
        return filtered

## These cells search for IgM precursors and calculate the IgM precursor frequencies of all animals with c_call="IGHM*03".
However, for the datasets of animals RPz18, RGp18, RPb18, and REt18, only IgMs were sorted and sequenced, and some of their c_call reads were incomplete and therefore not reliable.

To address this issue, we used these cells to analyze the precursor frequencies of all other macaques, and analyzed these four animals separately.

In [None]:
# Each search is defined and then immediately run against the full dataframe.
# Every query in this cell requires c_call == "IGHM*03"; `length` is a strict
# lower bound on len(cdr3_aa), so length=21 keeps CDR3s of 22 residues or more.

# Search 0: c_call filter only (CDR3 longer than 1 residue)
my_query_0 = Query('BG18_search.0', length=1, c_call="IGHM*03")
queried0_df = my_query_0.apply(df_spark)

# Search 1: long CDR3
my_query_1 = Query('BG18_search.1', length=21, c_call="IGHM*03")
queried1_df = my_query_1.apply(df_spark)

# Search 2: long CDR3 + D gene
my_query_2 = Query('BG18_search.2', length=21, d_call_top=r'IGHD3-41', c_call="IGHM*03")
queried2_df = my_query_2.apply(df_spark)

# Search 3: long CDR3 + D gene + motif anywhere in the CDR3
my_query_3 = Query('BG18_search.3', length=21, d_call_top=r'IGHD3-41',
                   regex=r'IFG[VL]', c_call="IGHM*03")
queried3_df = my_query_3.apply(df_spark)

# Searches 4.7 / 4.8 / 4.9: motif preceded by exactly 5 / 6 / 7 residues
my_query_4_7 = Query('BG18_search.4.7', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.....IFG[VL]', c_call="IGHM*03")
queried4_7_df = my_query_4_7.apply(df_spark)

my_query_4_8 = Query('BG18_search.4.8', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^......IFG[VL]', c_call="IGHM*03")
queried4_8_df = my_query_4_8.apply(df_spark)

my_query_4_9 = Query('BG18_search.4.9', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.......IFG[VL]', c_call="IGHM*03")
queried4_9_df = my_query_4_9.apply(df_spark)

# Searches 5.7 / 5.8 / 5.9: as 4.x, plus an E four residues after the motif
my_query_5_7 = Query('BG18_search.5.7', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.....IFG[VL]....E', c_call="IGHM*03")
queried5_7_df = my_query_5_7.apply(df_spark)

my_query_5_8 = Query('BG18_search.5.8', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^......IFG[VL]....E', c_call="IGHM*03")
queried5_8_df = my_query_5_8.apply(df_spark)

my_query_5_9 = Query('BG18_search.5.9', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.......IFG[VL]....E', c_call="IGHM*03")
queried5_9_df = my_query_5_9.apply(df_spark)

# Search 6: motif + trailing E, not anchored to a position
my_query_6 = Query('BG18_search.6', length=21, d_call_top=r'IGHD3-41',
                   regex=r'IFG[VL]....E', c_call="IGHM*03")
queried6_df = my_query_6.apply(df_spark)

# Search 7: D gene only (CDR3 longer than 1 residue)
my_query_7 = Query('BG18_search.7', length=1, d_call_top=r'IGHD3-41', c_call="IGHM*03")
queried7_df = my_query_7.apply(df_spark)

In [None]:
# Pull only the two columns needed for per-animal counting into pandas,
# one frame per search, converted in the same order as the queries were run.
(
    pandas0_df, pandas1_df, pandas2_df, pandas3_df,
    pandas47_df, pandas48_df, pandas49_df,
    pandas57_df, pandas58_df, pandas59_df,
    pandas6_df, pandas7_df,
) = [
    spark_result.select('sequence_id', 'animal_id').toPandas()
    for spark_result in (
        queried0_df, queried1_df, queried2_df, queried3_df,
        queried4_7_df, queried4_8_df, queried4_9_df,
        queried5_7_df, queried5_8_df, queried5_9_df,
        queried6_df, queried7_df,
    )
]

In [None]:
# Searches 4 and 5 are each the row-wise union of their three
# position-anchored variants (.7, .8 and .9).
pandas4_df = pandas.concat(
    [pandas47_df, pandas48_df, pandas49_df]
)
pandas5_df = pandas.concat(
    [pandas57_df, pandas58_df, pandas59_df]
)

Print out the number of IgM sequences from all animals  

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 0, smallest count first
counts = (
    pandas0_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 1: ≥22

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 1, smallest count first
counts = (
    pandas1_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 2: ≥22, D3-3

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 2, smallest count first
counts = (
    pandas2_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 3: ≥22, D3-3, FGV anywhere

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 3, smallest count first
counts = (
    pandas3_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 4: ≥22, D3-3, FGV starting at position 7, 8, 9

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 4 (combined .7/.8/.9 frames), smallest count first
counts = (
    pandas4_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 5: ≥22, D3-3, FGV starting at position 7, 8, 9, E at position 7 past FGV (example:.......FGV....E)

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 5 (combined .7/.8/.9 frames), smallest count first
counts = (
    pandas5_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 6: ≥22, D3-3, FGV anywhere, E at position 7 past FGV (example:.......FGV....E)

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 6, smallest count first
counts = (
    pandas6_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 7: D3-3

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 7, smallest count first
counts = (
    pandas7_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

## These cells search for precursors and calculate the precursor frequencies of RPz18, RGp18, RPb18, and REt18 only.
For the datasets of animals RPz18, RGp18, RPb18, and REt18, only IgMs were sorted and sequenced, and some of their c_call reads were incomplete and therefore not reliable.

To address this issue, we used these cells to analyze the precursor frequencies of these four animals separately from the previous cells.

In [None]:
# Each search is defined and then immediately run against the full dataframe.
# Instead of a c_call filter, every query in this cell keeps rows whose
# file_name matches the regex '_Functional'. `length` is a strict lower bound
# on len(cdr3_aa), so length=21 keeps CDR3s of 22 residues or more.
# NOTE: this cell rebinds the my_query_* / queried*_df names used earlier.

# Search 0: file_name filter only (CDR3 longer than 1 residue)
my_query_0 = Query('BG18_search.0', length=1, file_name=r'_Functional')
queried0_df = my_query_0.apply(df_spark)

# Search 1: long CDR3
my_query_1 = Query('BG18_search.1', length=21, file_name=r'_Functional')
queried1_df = my_query_1.apply(df_spark)

# Search 2: long CDR3 + D gene
my_query_2 = Query('BG18_search.2', length=21, d_call_top=r'IGHD3-41', file_name=r'_Functional')
queried2_df = my_query_2.apply(df_spark)

# Search 3: long CDR3 + D gene + motif anywhere in the CDR3
my_query_3 = Query('BG18_search.3', length=21, d_call_top=r'IGHD3-41',
                   regex=r'IFG[VL]', file_name=r'_Functional')
queried3_df = my_query_3.apply(df_spark)

# Searches 4.7 / 4.8 / 4.9: motif preceded by exactly 5 / 6 / 7 residues
my_query_4_7 = Query('BG18_search.4.7', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.....IFG[VL]', file_name=r'_Functional')
queried4_7_df = my_query_4_7.apply(df_spark)

my_query_4_8 = Query('BG18_search.4.8', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^......IFG[VL]', file_name=r'_Functional')
queried4_8_df = my_query_4_8.apply(df_spark)

my_query_4_9 = Query('BG18_search.4.9', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.......IFG[VL]', file_name=r'_Functional')
queried4_9_df = my_query_4_9.apply(df_spark)

# Searches 5.7 / 5.8 / 5.9: as 4.x, plus an E four residues after the motif
my_query_5_7 = Query('BG18_search.5.7', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.....IFG[VL]....E', file_name=r'_Functional')
queried5_7_df = my_query_5_7.apply(df_spark)

my_query_5_8 = Query('BG18_search.5.8', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^......IFG[VL]....E', file_name=r'_Functional')
queried5_8_df = my_query_5_8.apply(df_spark)

my_query_5_9 = Query('BG18_search.5.9', length=21, d_call_top=r'IGHD3-41',
                     regex=r'^.......IFG[VL]....E', file_name=r'_Functional')
queried5_9_df = my_query_5_9.apply(df_spark)

# Search 6: motif + trailing E, not anchored to a position
my_query_6 = Query('BG18_search.6', length=21, d_call_top=r'IGHD3-41',
                   regex=r'IFG[VL]....E', file_name=r'_Functional')
queried6_df = my_query_6.apply(df_spark)

# Search 7: D gene only (CDR3 longer than 1 residue)
my_query_7 = Query('BG18_search.7', length=1, d_call_top=r'IGHD3-41', file_name=r'_Functional')
queried7_df = my_query_7.apply(df_spark)

In [None]:
# Pull only the two columns needed for per-animal counting into pandas,
# one frame per search, converted in the same order as the queries were run.
(
    pandas0_df, pandas1_df, pandas2_df, pandas3_df,
    pandas47_df, pandas48_df, pandas49_df,
    pandas57_df, pandas58_df, pandas59_df,
    pandas6_df, pandas7_df,
) = [
    spark_result.select('sequence_id', 'animal_id').toPandas()
    for spark_result in (
        queried0_df, queried1_df, queried2_df, queried3_df,
        queried4_7_df, queried4_8_df, queried4_9_df,
        queried5_7_df, queried5_8_df, queried5_9_df,
        queried6_df, queried7_df,
    )
]

In [None]:
# Searches 4 and 5 are each the row-wise union of their three
# position-anchored variants (.7, .8 and .9).
pandas4_df = pandas.concat(
    [pandas47_df, pandas48_df, pandas49_df]
)
pandas5_df = pandas.concat(
    [pandas57_df, pandas58_df, pandas59_df]
)

Print out the number of IgM sequences from all animals  

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 0, smallest count first
counts = (
    pandas0_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 1: ≥22

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 1, smallest count first
counts = (
    pandas1_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 2: ≥22, D3-3

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 2, smallest count first
counts = (
    pandas2_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 3: ≥22, D3-3, FGV anywhere

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 3, smallest count first
counts = (
    pandas3_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 4: ≥22, D3-3, FGV starting at position 7, 8, 9

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 4 (combined .7/.8/.9 frames), smallest count first
counts = (
    pandas4_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 5: ≥22, D3-3, FGV starting at position 7, 8, 9, E at position 7 past FGV (example:.......FGV....E)

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 5 (combined .7/.8/.9 frames), smallest count first
counts = (
    pandas5_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 6: ≥22, D3-3, FGV anywhere, E at position 7 past FGV (example:.......FGV....E)

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 6, smallest count first
counts = (
    pandas6_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)

Print out the number of IgM sequences from all animals in search 7: D3-3

In [None]:
# Lift the row-display cap so every animal is printed
pandas.set_option('display.max_rows', None)

# Sequences per animal for search 7, smallest count first
counts = (
    pandas7_df
    .groupby('animal_id')
    .count()
    .rename(columns={'sequence_id': 'count'})
    .sort_values(by='count')
)
print(counts)