In [1]:
from utils import home, templates, read_metadata_with_fields, join, create_cols_in_df
import pandas as pd

In [12]:
import requests

# Function to map chromosomal location to gene symbol using Ensembl API (GRCh37)
def map_location_to_gene(chrom, start, end=None):
    # Ensembl API endpoint for GRCh37
    server = "https://grch37.rest.ensembl.org"
    ext = f"/overlap/region/human/{chrom}:{start-100}"
    
    # Add the end position to the query if provided
    if end:
        ext += f"-{end}"
    else:
        ext += f"-{start+100}"
    
    # Add type of feature to return (gene)
    ext += "?feature=gene"
    
    # Set headers to accept JSON format
    headers = {"Content-Type": "application/json"}
    
    # Make the request to Ensembl API
    response = requests.get(server + ext, headers=headers)
    
    # Check for a successful response
    if not response.ok:
        return f"Error {response.status_code}: {response.text}"
    
    # Parse the JSON response
    decoded = response.json()
    
    # Check if any genes are found
    if decoded:
        # Return the first gene symbol found
        return decoded[0].get('gene_symbol', 'No gene symbol found')
    else:
        return "No gene found for this location"

# Example usage:
chromosome = "12"
start_position = 84374201

gene_symbol = map_location_to_gene(chromosome, start_position)
print(f"Gene symbol for chr{chromosome}:{start_position} (hg19) is: {gene_symbol}")


Gene symbol for chr12:84374201 (hg19) is: No gene found for this location


In [36]:
mut = read_metadata_with_fields(join(templates, 'mut/mutation_template_internal-Sheet1.tsv'))
raw = pd.read_csv('whims_mutation.csv', sep=',').rename({'Sample': 'sample_id', 'Chr': 'chromosome', 'Pos': 'seq_start_position', 'Ref': 'ref_allele', 'Var': 'alt_allele', 'Gene': 'symbol', 'Xeno VAF': 'allele_frequency'}, axis=1)
temp = create_cols_in_df(raw, mut.columns)[mut.columns]
temp['platform_id'] = "mut_wgs_illumina"
temp.to_csv('WUST_mut.tsv', sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['platform_id'] = "mut_wgs_illumina"


In [36]:
logfile = pd.read_csv(join(home, 'WUSTL/mut/annotations/annotater.log'), header=None)
vepIn = logfile[0].str.split(':', expand=True)[1].str.split('(', expand=True)[1].str.split(' ', expand=True).iloc[:, 0:5].drop_duplicates()
vepIn = vepIn.reset_index(drop=True)
vepIn['vepInput'] = vepIn[0].str.replace('chr', '') + ' ' + vepIn[1] + ' ' + vepIn[1] + ' ' + vepIn[3] + '/' + vepIn[4]
vepIn.dropna(subset=['vepInput'], inplace=True)
vepIn['chr'] = vepIn[0].str.replace('chr', '')
vepIn[1] = vepIn[1].astype(int)
vepIn.sort_values(by=['chr', 1], inplace=True)
vepIn.to_csv('vepIn.tsv', sep='\t', index=False)
vepIn

Unnamed: 0,0,1,2,3,4,vepInput,chr
8,chr1,768629,chr1_768629_G_A,G,A,1 768629 768629 G/A,1
9,chr1,864981,chr1_864981_G_C,G,C,1 864981 864981 G/C,1
10,chr1,1001212,chr1_1001212_A_G,A,G,1 1001212 1001212 A/G,1
16,chr1,1662614,chr1_1662614_G_A,G,A,1 1662614 1662614 G/A,1
26,chr1,1673928,chr1_1673928_G_T,G,T,1 1673928 1673928 G/T,1
...,...,...,...,...,...,...,...
39519,chrX,155251249,chrX_155251249_C_A,C,A,X 155251249 155251249 C/A,X
39520,chrX,155267632,chrX_155267632_A_G,A,G,X 155267632 155267632 A/G,X
39521,chrX,155428030,chrX_155428030_C_G,C,G,X 155428030 155428030 C/G,X
39522,chrX,155452207,chrX_155452207_G_C,G,C,X 155452207 155452207 G/C,X


In [29]:

vepIn.sort_values(by=[0, 1], inplace=True)
vepIn.to_csv('vepIn.tsv', sep='\t', index=False)

8        100039881
9          1001212
10       100296079
11       100667108
12       100752587
           ...    
39519     99832273
39520     99870774
39521     99915253
39522     99956121
39523     99958169
Name: 1, Length: 39516, dtype: int64

In [37]:
mut = read_metadata_with_fields(join(home, 'WUSTL/mut/WUSTL_mut.tsv'))

In [39]:
mut.sample_id.unique()

array(['WHIM5', 'WHIM6', 'WHIM8', 'WHIM9', 'WHIM11', 'WHIM12', 'WHIM13',
       'WHIM14', 'WHIM16', 'WHIM18', 'WHIM20', 'WHIM21'], dtype=object)