In [1]:
import pandas as pd
import requests

# Notebook-wide pandas setting: disable the chained-assignment
# (SettingWithCopy) warning instead of letting pandas emit it.
pd.options.mode.chained_assignment = None  # Suppress the warning

In [2]:
def get_sequence_from_ucsc(df, genome="hg19", timeout=30):
    """
    Query the UCSC Genome Browser API once per row and attach the results to ``df``.

    For every row a request is sent to the ``getData/sequence`` endpoint using the
    row's 'chr', 'start' and 'end' values exactly as stored. Seven fields of the JSON
    answer are copied into new columns suffixed with ``_new``: downloadTime,
    downloadTimeStamp, genome, chrom, start, end and dna.

    A row whose lookup fails (non-200 status, network error or timeout, or a body
    that is not a JSON object) gets missing values in all seven columns, so one bad
    request does not abort a long run.

    Args:
        df (pandas.DataFrame): DataFrame containing 'chr', 'start', and 'end' columns.
            It is modified in place.
        genome (str): Assembly name sent as the ``genome`` query parameter.
        timeout (float): Seconds to wait for each HTTP request before giving up.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the ``*_new`` columns added
        (also when ``df`` has no rows).
    """
    fields = (
        'downloadTime',
        'downloadTimeStamp',
        'genome',
        'chrom',
        'start',
        'end',
        'dna',
    )

    records = []
    # One session for the whole run so the HTTP connection can be reused.
    with requests.Session() as session:
        # Iterate over the columns rather than iterrows(): iterrows() builds one
        # Series per row and may upcast integers to float, which would put
        # values such as "10.0" into the URL.
        for chrom, start, end in zip(df['chr'], df['start'], df['end']):
            url = (
                "https://api.genome.ucsc.edu/getData/sequence"
                f"?genome={genome};chrom={chrom};start={start};end={end}"
            )
            records.append(_fetch_ucsc_record(session, url, timeout))

    # Assign whole columns positionally: this is correct for non-unique index
    # labels and avoids one .loc write per cell.
    for field in fields:
        df[f"{field}_new"] = [record.get(field) for record in records]

    return df


def _fetch_ucsc_record(session, url, timeout):
    """Return the decoded JSON object for ``url``, or ``{}`` if the lookup failed."""
    try:
        response = session.get(url, timeout=timeout)
        if response.status_code != 200:
            return {}
        payload = response.json()
    except (requests.RequestException, ValueError):
        # Network failure, timeout, or a body that is not valid JSON.
        return {}
    return payload if isinstance(payload, dict) else {}

In [3]:
# Load the variant table and split "allele" (e.g. "C/T") into its two parts.
df = pd.read_csv("sana_second_dataset.csv")
allele_parts = df["allele"].str.split("/", expand=True)
df[["ref", "alt"]] = allele_parts

# Shift "end" one base to the right so the API request covers a non-empty range.
# NOTE(review): in the rows shown below start == end before this shift. UCSC
# documents `start` as 0-based and `end` as 1-based, so if these positions are
# 1-based this requests the base *after* the variant (start - 1 would be the
# alternative) — TODO confirm the dataset's coordinate convention.
df["end"] += 1

df.head()

Unnamed: 0,chr,start,end,allele,driver,ref,alt
0,16,339439,339440,C/T,1,C,T
1,16,339608,339609,C/T,1,C,T
2,16,347721,347722,C/T,1,C,T
3,16,360070,360071,C/A,1,C,A
4,16,396146,396147,A/T,1,A,T


Each UCSC request takes about 1–1.5 seconds per row, so expect a full run to take around 15 minutes.

In [8]:
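# I ran the following code once to get the sequences from UCSC and save them to CSV.
# It is left commented out because the download is slow; uncomment it to regenerate the file.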

# df = get_sequence_from_ucsc(df)
# df.to_csv("sana_second_dataset_with_results_from_ucsc.csv", index=False)

In [4]:
# Replace `df` with the table saved by the (commented-out) UCSC fetch cell,
# so the analysis below does not need to query the API again.
df = pd.read_csv("sana_second_dataset_with_results_from_ucsc.csv")

# Preview the first rows, including the added *_new columns.
df.head()

Unnamed: 0.1,Unnamed: 0,chr,start,end,allele,driver,ref,alt,downloadTime_new,downloadTimeStamp_new,genome_new,chrom_new,start_new,end_new,dna_new
0,0,16,339439,339440,C/T,1,C,T,2023:07:20T07:04:26Z,1689837000.0,hg19,chr16,339439.0,339440.0,C
1,1,16,339608,339609,C/T,1,C,T,2023:07:20T07:04:27Z,1689837000.0,hg19,chr16,339608.0,339609.0,T
2,2,16,347721,347722,C/T,1,C,T,2023:07:20T07:04:28Z,1689837000.0,hg19,chr16,347721.0,347722.0,C
3,3,16,360070,360071,C/A,1,C,A,2023:07:20T07:04:29Z,1689837000.0,hg19,chr16,360070.0,360071.0,T
4,4,16,396146,396147,A/T,1,A,T,2023:07:20T07:04:30Z,1689837000.0,hg19,chr16,396146.0,396147.0,C


In [5]:
# Count the rows where the dataset's reference allele equals the base returned by UCSC.
# NOTE(review): the comparison is case-sensitive; if dna_new can contain lower-case
# bases those rows are counted as mismatches — TODO confirm against the API output.
df["ref"].eq(df["dna_new"]).value_counts()


False    931
True     224
Name: count, dtype: int64

In [6]:
# Number of rows with no start coordinate recorded from the API.
df["start_new"].isna().sum()

0

In [7]:
# Number of rows with no end coordinate recorded from the API.
df["end_new"].isna().sum()


0