In [1]:
%cd ..
import pandas as pd


from scripts.utils_latest import *

pd.options.mode.chained_assignment = None  # Silence pandas' SettingWithCopyWarning for the whole kernel session


/run/media/nazif/2F946E411BA61D49/thesis


In [2]:
# Read `chr` as text: the column mixes numeric labels (e.g. "16") with contig
# names (e.g. "Un_KI270742v1"), and letting pandas infer the dtype can yield a
# column of mixed int/str values that breaks string concatenation later on.
df = pd.read_csv("sana/dataset.csv", dtype={"chr": str})

# "-" presumably marks an absent allele (insertion/deletion); store it as an
# empty string so that its length is 0.
df["ref"] = df["ref"].replace("-", "")
df["alt"] = df["alt"].replace("-", "")
df = df.rename(columns={"start": "pos"})

# Relabel the contig "Un_KI270742v1" as chromosome 1.
# NOTE(review): only the chromosome label changes here; `pos` is left as-is.
# Confirm that these positions are valid chromosome-1 coordinates.
df.loc[df["chr"] == "Un_KI270742v1", "chr"] = "1"

df.head()

Unnamed: 0,chr,pos,ref,alt,driver
0,16,339439,C,T,1
1,16,339608,C,T,1
2,16,347721,C,T,1
3,16,360070,C,A,1
4,16,396146,A,T,1


The ClinVar record https://www.ncbi.nlm.nih.gov/clinvar/RCV000140032/ shows that the variant whose chromosome is listed as "Un_KI270742v1" is located at Chr1: 142728921 - 142907112 (on assembly GRCh37),

so I added the line

```df.loc[df['chr'] == 'Un_KI270742v1', 'chr'] = '1'```

In [5]:
# Number of flanking bases fetched on each side of the reference allele.
FLANK_LEN = 30


def _fetch_reference_bases(row):
    """Return the reference-genome bases that the row's `ref` allele should match.

    Dispatches on `ref_len` so each row triggers exactly one lookup:
    an interval fetch for multi-base alleles, a single-position fetch for
    one-base alleles, and no lookup at all for zero-length alleles.
    Rows whose `ref_len` is missing yield NaN.
    """
    ref_len = row["ref_len"]
    if ref_len > 1:
        # Interval bounds are inclusive, hence the "- 1".
        return get_nucleotides_in_interval(row["chr"], row["pos"], row["pos"] + ref_len - 1)
    if ref_len == 1:
        return get_nucleotide_at_position(row["chr"], row["pos"])
    if ref_len == 0:
        return ""
    return float("nan")


# Variant identifier: "<chr>_<pos>_<ref>_<alt>".
df["id"] = df["chr"] + "_" + df["pos"].astype(str) + "_" + df["ref"] + "_" + df["alt"]

# Allele lengths.
df["ref_len"] = df["ref"].str.len()
df["alt_len"] = df["alt"].str.len()

# Fetch the reference bases from the FASTA files and check them against `ref`.
df["fetched_nucleotides"] = df.apply(_fetch_reference_bases, axis=1)
df["is_nucleotides_same"] = df["fetched_nucleotides"] == df["ref"]

# Reference span: `ref_start` is the first ref base, `ref_end` the first base after it.
df["ref_start"] = df["pos"]
df["ref_end"] = df["pos"] + df["ref_len"]

# Flanking context: FLANK_LEN bases immediately before and after the ref allele.
df["upstream_sequence"] = df.apply(
    lambda row: get_nucleotides_in_interval(
        row["chr"], row["ref_start"] - FLANK_LEN, row["ref_start"] - 1
    ),
    axis=1,
)
df["downstream_sequence"] = df.apply(
    lambda row: get_nucleotides_in_interval(
        row["chr"], row["ref_end"], row["ref_end"] + FLANK_LEN - 1
    ),
    axis=1,
)

# Wild-type and mutated sequences share the same flanks and differ only in the allele.
df["sequence"] = df["upstream_sequence"] + df["ref"] + df["downstream_sequence"]
df["mutated_sequence"] = df["upstream_sequence"] + df["alt"] + df["downstream_sequence"]


In [6]:
# Preview the first rows, now including the derived sequence columns.
df.head()

Unnamed: 0,chr,pos,ref,alt,driver,id,ref_len,alt_len,fetched_nucleotides,is_nucleotides_same,ref_start,ref_end,upstream_sequence,downstream_sequence,sequence,mutated_sequence
0,16,339439,C,T,1,16_339439_C_T,1,1,C,True,339439,339440,GGGGAGGGGGCACCCCAGCCCTCACACTCA,CTGTAGCTGCCCTTTTTGGTCAGCAGCTCC,GGGGAGGGGGCACCCCAGCCCTCACACTCACCTGTAGCTGCCCTTT...,GGGGAGGGGGCACCCCAGCCCTCACACTCATCTGTAGCTGCCCTTT...
1,16,339608,C,T,1,16_339608_C_T,1,1,C,True,339608,339609,CCGCCGCCCACCTTCCTCTGCGATCTTGTC,TGGGGAAAGAGATGCAGCGGTGGTACCTGG,CCGCCGCCCACCTTCCTCTGCGATCTTGTCCTGGGGAAAGAGATGC...,CCGCCGCCCACCTTCCTCTGCGATCTTGTCTTGGGGAAAGAGATGC...
2,16,347721,C,T,1,16_347721_C_T,1,1,C,True,347721,347722,GCCCCCTCCTCACTGACAGGCGCACGCTCA,CTGTGGGCGAGGCCATCACTGGCGTTGGGG,GCCCCCTCCTCACTGACAGGCGCACGCTCACCTGTGGGCGAGGCCA...,GCCCCCTCCTCACTGACAGGCGCACGCTCATCTGTGGGCGAGGCCA...
3,16,360070,C,A,1,16_360070_C_A,1,1,C,True,360070,360071,TTACGGATCCTGTATGGGGGGATCCCATCC,TGTCCAGGAGAAAGAGGCAGCCGTTAACTC,TTACGGATCCTGTATGGGGGGATCCCATCCCTGTCCAGGAGAAAGA...,TTACGGATCCTGTATGGGGGGATCCCATCCATGTCCAGGAGAAAGA...
4,16,396146,A,T,1,16_396146_A_T,1,1,A,True,396146,396147,TACAGAAAGTGGACGCCTGGCGTCGGACTC,CCTGAACTCTCTGCCTTCGCTGTACCGTCT,TACAGAAAGTGGACGCCTGGCGTCGGACTCACCTGAACTCTCTGCC...,TACAGAAAGTGGACGCCTGGCGTCGGACTCTCCTGAACTCTCTGCC...


In [None]:
# Tally how many `ref` alleles agree with the bases fetched from the reference.
df["is_nucleotides_same"].value_counts()

# adding is_mirna column

In [None]:
def generate_is_mirna_column(df, grch=38):
    """Flag mutations whose position falls inside a miRNA coordinate interval.

    Reads the miRNA coordinate table for the requested assembly and adds an
    integer `is_mirna` column to `df`: 1 when the row's `pos` lies within the
    inclusive [`start`, `end`] range of any coordinate row on the same `chr`,
    otherwise 0. Only `pos` is tested, not the full span of the ref allele.

    Chromosome labels are compared as text on both sides, so a numeric `chr`
    column in the coordinate file still matches string labels in `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutations with at least `chr` and `pos` columns. Modified in place.
    grch : int, default 38
        Assembly number used to pick the coordinate file.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the `is_mirna` column added.
    """
    coords = pd.read_csv(f"data/processed/mirbase/mirna_coordinates_grch{grch}/coordinates.csv")

    # Index the intervals by chromosome once instead of filtering the whole
    # coordinate table for every mutation.
    intervals_by_chr = {
        chrom: (group["start"].to_numpy(), group["end"].to_numpy())
        for chrom, group in coords.groupby(coords["chr"].astype(str))
    }

    def _hits_mirna(chrom, pos):
        """Return 1 if `pos` lies in any interval of `chrom`, else 0."""
        bounds = intervals_by_chr.get(str(chrom))
        if bounds is None:
            return 0
        starts, ends = bounds
        return int(((starts <= pos) & (ends >= pos)).any())

    flags = [_hits_mirna(chrom, pos) for chrom, pos in zip(df["chr"], df["pos"])]
    df["is_mirna"] = pd.Series(flags, index=df.index, dtype="int64")

    return df

# Flag mutations against the grch38 miRNA coordinate table.
# NOTE(review): the markdown note earlier in this notebook cites GRCh37
# coordinates; confirm that the dataset positions are on GRCh38.
df = generate_is_mirna_column(df, grch=38)


# finding matches

In [None]:
# Number of leading rows of `df` passed to the matcher; None processes every row.
# NOTE(review): only 2 variants are processed, yet the output is later saved
# as the results file. Confirm whether this is a leftover debug limit.
MAX_VARIANTS = 2

# Run the matcher on the wild-type sequences, then on the mutated ones.
results = find_matches_for_vcfs(df[:MAX_VARIANTS])
results_mutated = find_matches_for_vcfs(df[:MAX_VARIANTS], mutated=True)


In [None]:
# Tag each result set with its origin (0 = call without `mutated`, 1 = call
# with `mutated=True`), then stack both into a single frame.
for flag, frame in enumerate((results, results_mutated)):
    frame["is_mutated"] = flag
final = pd.concat([results, results_mutated])

final.head()

In [None]:
# Persist the combined matches; index=False keeps the DataFrame index out of the CSV.
final.to_csv("sana/initial_results.csv", index=False)