In [1]:
cd ..

/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
import pandas as pd
import requests

pd.options.mode.chained_assignment = None  # Suppress the warning


In [None]:
def get_sequence_from_ucsc(df):
    """
    Retrieves sequence information from UCSC Genome Browser API for each row in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame containing "chr", "start", and "end" columns.

    Returns:
        pandas.DataFrame: DataFrame with additional columns containing sequence information.
    """
    for index, row in df.iterrows():
        # Extract chromosome, start, and end positions from the DataFrame
        chrom = row["chr"]
        start = row["start"]
        end = row["end"]

        # Build the URL for the UCSC Genome Browser API
        url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg19;chrom={chrom};start={start};end={end}"

        # Send a GET request to the API
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the JSON response
            gene_info = response.json()

            df.loc[index, "start_from_ucsc"] = gene_info.get("start")
            df.loc[index, "end_from_ucsc"] = gene_info.get("end")
            df.loc[index, "nucleotide_from_ucsc"] = gene_info.get("dna")

        else:
            # If the API request fails, set the information to None
            df.loc[index, "start_from_ucsc"] = None
            df.loc[index, "end_from_ucsc"] = None
            df.loc[index, "nucleotide_from_ucsc"] = None

    return df

In [41]:
def get_sequence_from_ucsc(df):
    """
    Retrieves sequence information from UCSC Genome Browser API for each row in the DataFrame.

    Args:
        df (pandas.DataFrame): DataFrame containing 'chr', 'start', and 'end' columns.

    Returns:
        pandas.DataFrame: DataFrame with additional columns containing sequence information.
    """
    for index, row in df.iterrows():
        # Extract chromosome, start, and end positions from the DataFrame
        chrom = row['chr']
        start = row['start']
        end = row['end']

        # Build the URL for the UCSC Genome Browser API
        url = f"https://api.genome.ucsc.edu/getData/sequence?genome=hg19;chrom={chrom};start={start};end={end}"

        # Send a GET request to the API
        response = requests.get(url)

        if response.status_code == 200:
            # Parse the JSON response
            gene_info = response.json()

            # Store the sequence information in the DataFrame
            df.loc[index, 'downloadTime_new'] = gene_info.get('downloadTime')
            df.loc[index, 'downloadTimeStamp_new'] = gene_info.get('downloadTimeStamp')
            df.loc[index, 'genome_new'] = gene_info.get('genome')
            df.loc[index, 'chrom_new'] = gene_info.get('chrom')
            df.loc[index, 'start_new'] = gene_info.get('start')
            df.loc[index, 'end_new'] = gene_info.get('end')
            df.loc[index, 'dna_new'] = gene_info.get('dna')
        else:
            # If the API request fails, set the sequence information to None
            df.loc[index, 'downloadTime_new'] = None
            df.loc[index, 'downloadTimeStamp_new'] = None
            df.loc[index, 'genome_new'] = None
            df.loc[index, 'chrom_new'] = None
            df.loc[index, 'start_new'] = None
            df.loc[index, 'end_new'] = None
            df.loc[index, 'dna_new'] = None

    return df


In [27]:
df = pd.read_csv("data/external/sana/final_dataset.csv", usecols=["chr", "start", "end", "ref", "alt"])
df["end"] = df["end"] + 1
df.head()

Unnamed: 0,chr,start,end,ref,alt
0,10,35318568,35318569,T,G
1,10,3827102,3827103,C,T
2,10,63661915,63661916,G,C
3,10,88681451,88681452,G,A
4,10,89653866,89653867,T,C


In [8]:
df = get_sequence_from_ucsc(df)

Unnamed: 0,chr,start,end,allele,driver,ref,alt
0,16,339439,339439,C/T,1,C,T
1,16,339608,339608,C/T,1,C,T
2,16,347721,347721,C/T,1,C,T
3,16,360070,360070,C/A,1,C,A
4,16,396146,396146,A/T,1,A,T


In [30]:
df.to_csv("data/external/sana/data_with_results_from_ucsc.csv")

In [6]:
df2 = pd.read_csv("data/external/sana/data_with_results_from_ucsc.csv")

df2.head()

Unnamed: 0.1,Unnamed: 0,chr,start,end,ref,alt,downloadTime_new,downloadTimeStamp_new,genome_new,chrom_new,start_new,end_new,dna_new
0,0,10,35318568,35318569,T,G,2023:07:18T18:55:37Z,1689707000.0,hg19,chr10,35318568.0,35318569.0,C
1,1,10,3827102,3827103,C,T,2023:07:18T18:55:38Z,1689707000.0,hg19,chr10,3827102.0,3827103.0,A
2,2,10,63661915,63661916,G,C,2023:07:18T18:55:40Z,1689707000.0,hg19,chr10,63661915.0,63661916.0,A
3,3,10,88681451,88681452,G,A,2023:07:18T18:55:41Z,1689707000.0,hg19,chr10,88681451.0,88681452.0,G
4,4,10,89653866,89653867,T,C,2023:07:18T18:55:42Z,1689707000.0,hg19,chr10,89653866.0,89653867.0,G


In [8]:
(df2.ref == df2.dna_new).value_counts()

False    1000
True      187
Name: count, dtype: int64

# Sana data 2

In [32]:
df = pd.read_csv("data/external/sana/sana_second_dataset.csv")

df[["ref", "alt"]] = df["allele"].str.split("/", expand=True)

df["end"] = df["end"] + 1

df.head()

Unnamed: 0,chr,start,end,allele,driver,ref,alt
0,16,339439,339440,C/T,1,C,T
1,16,339608,339609,C/T,1,C,T
2,16,347721,347722,C/T,1,C,T
3,16,360070,360071,C/A,1,C,A
4,16,396146,396147,A/T,1,A,T


In [21]:
df = get_sequence_from_ucsc(df)

df.head()

df.to_csv("data/external/sana/sana_second_dataset_with_results_from_ucsc.csv")

In [22]:
df.head()

Unnamed: 0,chr,start,end,allele,driver,ref,alt,downloadTime_new,downloadTimeStamp_new,genome_new,chrom_new,start_new,end_new,dna_new
0,16,339439,339440,C/T,1,C,T,2023:07:20T07:04:26Z,1689837000.0,hg19,chr16,339439.0,339440.0,C
1,16,339608,339609,C/T,1,C,T,2023:07:20T07:04:27Z,1689837000.0,hg19,chr16,339608.0,339609.0,T
2,16,347721,347722,C/T,1,C,T,2023:07:20T07:04:28Z,1689837000.0,hg19,chr16,347721.0,347722.0,C
3,16,360070,360071,C/A,1,C,A,2023:07:20T07:04:29Z,1689837000.0,hg19,chr16,360070.0,360071.0,T
4,16,396146,396147,A/T,1,A,T,2023:07:20T07:04:30Z,1689837000.0,hg19,chr16,396146.0,396147.0,C


In [23]:
(df.ref == df.dna_new).value_counts()


False    931
True     224
Name: count, dtype: int64

In [25]:
len(df)

1155

In [24]:
(df.alt == df.dna_new).value_counts()


False    942
True     213
Name: count, dtype: int64