# Application example

Let's apply the tool to a set of variants from a cancer cell line.

Dataset is from [Talsania et. al. 2022](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-022-02816-6). These are structural variants called using the [Manta pipeline](https://github.com/Illumina/manta) on WGS data from Illumina in HCC1395 tumor cells.

In [4]:
import pandas as pd
import numpy as np 
import os



from pathlib import Path

# Get input variants

In [8]:
# Get variants

ftp_path = 'https://ftp-trace.ncbi.nlm.nih.gov/ReferenceSamples/seqc/Somatic_Mutation_WG/analysis/SVs/VCFs/tumor.illumina.manta.EA_T_1.vcf.gz'
in_file = 'test/tumor.illumina.manta.EA_T_1.vcf'

if not Path(f'../{in_file}').is_file():
    os.system(f'wget -P ../test/ {ftp_path}')
    os.system(f'gupzip {in_file}.gz')


In [17]:
import io 

def read_vcf(path):
    
    '''
    Read  vcf files into dataframe.
    Source: https://gist.github.com/dceoy/99d976a2c01e7f0ba1c813778f9db744.
    
    '''
    
    with open(path, 'r') as f:
        lines = [l for l in f if not l.startswith('##')]
        
    return pd.read_csv(
        io.StringIO(''.join(lines)),
        dtype={'#CHROM': str, 'POS': int, 'ID': str, 'REF': str, 'ALT': str,
               'QUAL': str, 'FILTER': str, 'INFO': str},
        sep='\t'
    ).rename(columns={'#CHROM': 'CHROM'})



read_vcf(f'../{in_file}')

ValueError: invalid literal for int() with base 10: 'MantaBND:47012:0:1:0:0:0:0'

In [12]:
# Read input

import sys
sys.path.insert(0, '../scripts')

import reading_utils
reading_utils.var_set_size = 2000

variants = reading_utils.read_input(f'../{in_file}', 0)
variants

Unnamed: 0,CHROM,POS,END,REF,ALT,SVTYPE,SVLEN
0,chr1,1117831,,C,[chr22:20272153[C,,
1,chr1,1119512,,A,ACAGTGC]chr22:20302979],,
2,chr1,3721048,3734333,T,<DUP:TANDEM>,DUP,13285
3,chr1,6742482,,C,C[chr12:96100887[,,
4,chr1,9357666,9377061,G,<DEL>,DEL,-19395
...,...,...,...,...,...,...,...
1543,chrY,14531089,,T,T[chrX:6219006[,,
1544,chrY,14533586,,A,A]chrX:6219008],,
1545,chr16_KI270728v1_random,1769992,,A,A[chr9:129442980[,,
1546,chr17_KI270729v1_random,162446,162616,GAGTCCATTCGATGATTTCATTAGATTCCATTGGAAGATGATTCCA...,G,DEL,-170


In [14]:
# Look at varinat types present
variants.SVTYPE.value_counts()

DEL    552
DUP    294
INS     80
Name: SVTYPE, dtype: int64

# Score variants using tool

In [13]:
# Get run command

file = 'tumor'#'CTCF_del' # Output files prefix
directory = 'test/tumor_output'# '../test/output' # Output directory

print('Run this command in the main directory:\n')
print('python scripts/score_var.py', in_file,
      '--file', file, # File name prefix for outputs
      '--dir', directory, # Path to save output in 
      '--augment', # Get the average augmented scores
      '--get_scores')


Run this command in the main directory:

python scripts/score_var.py test/tumor.illumina.manta.EA_T_1.vcf --file tumor --dir test/tumor_output --augment --get_scores


# Prioritize most disruptive variants

In [None]:
# Read output

In [None]:


top_variants = []

for SVTYPE in [x for x in scores.SVTYPE.unique() if x is not np.nan]:
    
    top_var = list(scores[scores.SVTYPE == SVTYPE]
                        .sort_values('mse_mean', ascending = False)
                        .head(3)
                        .index
                        .values)
    
    for var in top_var:
        top_variants.append(var)
        
top_variants

# Get maps for top scoring variants using tool

# Plot maps for top scoring variants