In [1]:
from kasearch import AlignSequences, SearchDB, PrepareDB

## **Prepare query sequence (sequence to search with)**

In [2]:
# Query antibody sequence(s), supplied as plain amino-acid strings.
raw_queries = [
    'QVQLVESGGGVVQPGRSLRLSCAASGFTFSSFGMHWVRQAPGKGLEWVAVISFDGSIKYSVDSVKGRFTISRDNSKNTLFLQMNSLRAEDTAVYYCARDRLNYYDSSGYYHYKYYGMAVWGQGTTVTVSS',
]

# Align the raw strings; n_jobs is the number of jobs/threads allocated.
query_db = AlignSequences(raw_queries, n_jobs=1)

# Show the aligned queries -- this is the array handed to the search later on.
query_db.db.aligned_seqs

array([[81, 86, 81,  0, 76, 86, 69, 83, 71, 71,  0, 71, 86, 86, 81, 80,
        71, 82, 83, 76, 82, 76, 83, 67, 65, 65, 83, 71, 70, 84, 70,  0,
         0,  0,  0,  0,  0,  0,  0,  0, 83, 83, 70, 71, 77, 72,  0, 87,
        86, 82, 81,  0, 65,  0, 80,  0, 71,  0, 75,  0,  0, 71,  0, 76,
        69,  0, 87, 86, 65, 86, 73, 83, 70, 68,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, 71, 83, 73, 75, 89,  0,  0, 83, 86,  0,
        68,  0,  0, 83, 86,  0,  0, 75,  0,  0,  0, 71, 82, 70, 84, 73,
        83, 82,  0, 68,  0,  0,  0, 78,  0, 83,  0,  0, 75, 78,  0,  0,
         0,  0, 84,  0, 76, 70, 76, 81, 77, 78, 83, 76, 82, 65,  0, 69,
        68, 84, 65, 86, 89, 89, 67, 65, 82, 68, 82, 76, 78, 89, 89, 68,
        83, 83, 71,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 89, 89, 72, 89, 75, 89, 89, 71, 77, 65, 86, 87, 71,  0, 81,
        71, 84, 84, 86, 84, 86, 83, 83]], dtype=int8)

--------------
## **Initiate search class**

### Database to search against
- If no database path is given, a small OAS version will be downloaded to search against.
- The full version of OAS can be downloaded here ().
- You can also give it the path for a custom database to search against. (See below for how to create a custom database).

### Regions to search with
- Default regions are the whole chain, CDRs or CDR3.
- User-defined regions can be added, as seen with the paratope search below.
- For each region, the search can either be based on exact length match or not.
- For a more specific search, the search can be focused on a specific chain and species.

In [3]:
# User-defined search region, listed position by position. Every entry is
# exactly four characters long: entries without a trailing letter carry a
# trailing space, so the padding must be kept as written.
paratope = [
    "107 ",
    "108 ",
    "111C",
    "114 ",
    "115 ",
]

In [4]:
oasdb = SearchDB(
    # Path to your database. Default will be to download a small prepared version of OAS.
    database_path='oasdb-small',
    # Search against a specific chain. Default is any chain.
    allowed_chain='Heavy',
    # Search against a specific species. Default is any species.
    allowed_species='Human',
    # Three named default regions followed by the user-defined paratope list.
    regions=['whole', 'cdrs', 'cdr3', paratope],
    # One flag per entry in `regions`, same order: whether that region is
    # searched with an exact length match.
    length_matched=[False, True, True, True],
)

-----------
## **Run search**

A search takes ~23min per sequence against all of OAS and ~2min per sequence against the small OAS.

To specify the number of highest similar sequences to keep, you can change the keep_best_n parameter.

In [5]:
%%time 
oasdb.search(query_db.db.aligned_seqs, # Input can be a single or multiple aligned sequences at a time.
             keep_best_n=10,               # You can define how many most similar sequences to return
            )



CPU times: user 3min 11s, sys: 33.2 s, total: 3min 44s
Wall time: 2min 1s


### Get N best identities

Identities of the most similar sequences for each of the regions can be fetched from the object with the command below.

In [6]:
# Best identities kept from the last search. In the output shown below there
# are 10 rows (matching keep_best_n=10) and 4 columns (matching the 4 regions
# passed to SearchDB -- presumably in the same order; confirm).
oasdb.current_best_identities

array([[[0.8923077 , 0.7105263 , 0.8695652 , 1.        ],
        [0.8923077 , 0.7105263 , 0.8695652 , 1.        ],
        [0.8880441 , 0.7105263 , 0.8695652 , 1.        ],
        [0.8880441 , 0.7105263 , 0.8695652 , 1.        ],
        [0.8880441 , 0.7105263 , 0.82608694, 1.        ],
        [0.88461536, 0.7105263 , 0.82608694, 1.        ],
        [0.88461536, 0.7105263 , 0.82608694, 1.        ],
        [0.88461536, 0.7105263 , 0.82608694, 1.        ],
        [0.88461536, 0.68421054, 0.82608694, 1.        ],
        [0.88461536, 0.68421054, 0.82608694, 1.        ]]], dtype=float32)

### Get IDs of sequences with the N best identities

Similarly, the IDs of the most similar sequences can be fetched from the object with the command below.

In [7]:
# IDs of the best matches kept from the last search. In the output shown below
# every match is a pair of integers; what each integer encodes is defined by
# kasearch -- TODO confirm against its documentation.
oasdb.current_best_ids

array([[[[   3536,  185703],
         [   9281,  303840],
         [   9562,  770192],
         [   9333,  169872]],

        [[   2231,    4292],
         [   9970,  607648],
         [   9253,   81478],
         [   3799,  291364]],

        [[   2070, 1685168],
         [   9281,    2566],
         [   9253,  516624],
         [   1989,  606221]],

        [[   2140,  886583],
         [   5039,  133318],
         [   9562,  255745],
         [   5013,   64721]],

        [[   4903,  125092],
         [   2070, 1685168],
         [   2025, 1170341],
         [   2644, 1882235]],

        [[   3280, 2379245],
         [   4903,  125092],
         [   3536,  185703],
         [   3536, 1062161]],

        [[   3085, 2376838],
         [   2140,  886583],
         [   3174,  554092],
         [   1989, 3434206]],

        [[   1989, 2804021],
         [   4512, 1968848],
         [   5039,  133318],
         [  10443,  184024]],

        [[   3119, 3406137],
         [   2904,  339796]

---------
## Extract the meta data from matched sequences

Using the get_meta function, the meta data for all matched sequences for each query and region can be extracted as seen below.

Indices are zero-based: `n_query = 0` selects the first query, and `n_region = 0` selects the first region in the list given when initiating the search class.

NB: The column "sequence_alignment_aa" holds the antibody sequence.

In [None]:
n_best_sequences = oasdb.get_meta(
    n_query=0,            # Which query to extract meta data from
    n_region=0,           # Which region to extract meta data from
    n_sequences='all',    # Number of sequences to extract (default is all, which is keep_best_n)
    n_jobs=10,            # Number of jobs/threads allocated for the extraction
)

# Display the extracted meta data.
n_best_sequences

In [None]:
# Keep only the antibody sequences: the "sequence_alignment_aa" column of the
# extracted meta data, as a bare array of values.
n_best_sequences.sequence_alignment_aa.values

----------
## Create custom database


To create your own database, you first need to create a csv file in the OAS format. For an example file, look at data/custom-data-example.csv. This file consists of a dictionary containing the metadata in the first line, followed by one row per individual sequence. Only the Species and Chain are strictly needed in the metadata, and only the amino acid sequence is required for each antibody.

### 1. Format your data into OAS files

In [None]:
import json
import pandas as pd

In [None]:
# Metadata for the data unit. It gets its own name so that `metadata` is only
# ever bound to one kind of object (the Series written to the csv file).
oas_header = {"Species": "Human", "Chain": "Heavy"}

# An empty Series whose *name* is the JSON-encoded metadata: written with
# to_csv(index=False) it produces just that one header line.
metadata = pd.Series(name=json.dumps(oas_header), dtype='object')

# One row per antibody; only the amino-acid sequence column is supplied here.
seqsdata = pd.DataFrame(
    {
        'sequence_alignment_aa': [
            "EVQLVESGGGLAKPGGSLRLHCAASGFAFSSYWMNWVRQAPGKRLEWVSAINLGGGLTYYAASVKGRFTISRDNSKNTLSLQMNSLRAEDTAVYYCATDYCSSTYCSPVGDYWGQGVLVTVSS",
            "EVQLVQSGAEVKRPGESLKISCKTSGYSFTSYWISWVRQMPGKGLEWMGAIDPSDSDTRYNPSFQGQVTISADKSISTAYLQWSRLKASDTATYYCAIKKYCTGSGCRRWYFDLWGPGT",
        ],
    }
)

In [None]:
# Destination of the OAS-formatted csv file.
save_file = "../data/custom-data-examples.csv"

# First write creates (or overwrites) the file with the metadata line ...
metadata.to_csv(save_file, index=False)

# ... second write appends the sequence table, column header included.
seqsdata.to_csv(save_file, mode='a', index=False)

### 2. Turn your OAS formatted files into a custom database

After creating all the files you want to include in the new database, you can run the following code to create the database.

In [None]:
# Folder of the new database.
db_folder = "../data/my_db"

# OAS-formatted csv files to include in the new database.
db_files = [
    '../data/custom-data-examples.csv',
]

In [None]:
%%timeit -n 1 -r 1
newDB = PrepareDB(db_path=db_folder, n_jobs=20)

for num, data_unit_file in enumerate(db_files):

    metadata = json.loads(','.join(pd.read_csv(data_unit_file, nrows=0).columns))
    newDB.prepare_sequences(data_unit_file,
                            file_id=num, 
                            chain=metadata['Chain'], 
                            species=metadata['Species'])
    
newDB.save_database()
newDB.merge_sequence_files()

### 3. Initiate the search class with your custom database


In [None]:
mydb = SearchDB(
    # Path to your database. Default will be to download a prepared version of OAS.
    database_path=db_folder,
    # Search against a specific chain. Default is any chain.
    allowed_chain='Heavy',
    # Search against a specific species. Default is any species.
    allowed_species='Any',
)