In [None]:
import pandas as pd
from scripts.utils import *
import concurrent.futures

In [None]:
# Load the raw variant table.
# 'chr' is read explicitly as text: later cells compare it with a contig name and
# concatenate it with other strings, so it must not be inferred as (partly) numeric.
df = pd.read_csv("data/raw_data_from_sana.csv", dtype={"chr": str})
df.head()

According to ClinVar (https://www.ncbi.nlm.nih.gov/clinvar/RCV000140032/), the variant whose chromosome is listed as "Un_KI270742v1" is located at Chr1: 142728921 - 142907112 (on assembly GRCh37).

So I added the following line, which relabels that chromosome value as "1":

```df.loc[df['chr'] == 'Un_KI270742v1', 'chr'] = '1'```

# processing df

In [None]:
# Relabel rows whose chromosome is "Un_KI270742v1" as chromosome "1"; other rows are untouched.
# NOTE(review): only the chromosome label is changed here — confirm that 'pos' for
# these rows is already expressed in chromosome 1 coordinates.
is_unplaced_contig = df["chr"].eq("Un_KI270742v1")
df["chr"] = df["chr"].mask(is_unplaced_contig, "1")

In [None]:
# Rename the "start" column to "pos" (the name used by every later cell)
df = df.rename(columns={"start": "pos"})

In [None]:
# Turn "-" allele placeholders into empty strings so they contribute zero length
for allele_col in ("ref", "alt"):
    df[allele_col] = df[allele_col].replace("-", "")

In [None]:
# Build a variant identifier of the form <chr>_<pos>_<ref>_<alt>
variant_id = df["chr"]
for field in (df["pos"].astype(str), df["ref"], df["alt"]):
    variant_id = variant_id + "_" + field
df["id"] = variant_id

In [None]:
# Record the length (in characters) of the reference and alternate alleles
df = df.assign(
    ref_len=df["ref"].str.len(),
    alt_len=df["alt"].str.len(),
)

In [None]:
def _fetch_reference_bases(chrom, pos, ref_len):
    """Return the genome sequence that the 'ref' allele of one variant should match.

    Only the helper that fits the allele length is called, so each variant
    triggers at most one genome lookup:

    * ref_len > 1  -> get_nucleotides_in_interval(chrom, pos, pos + ref_len - 1)
    * ref_len == 1 -> get_nucleotide_at_position(chrom, pos)
    * ref_len == 0 -> "" (nothing to fetch)

    Any other value (e.g. a missing length) yields NaN, i.e. the cell stays empty.
    """
    if ref_len > 1:
        return get_nucleotides_in_interval(chrom, pos, pos + ref_len - 1)
    if ref_len == 1:
        return get_nucleotide_at_position(chrom, pos)
    if ref_len == 0:
        return ""
    return float("nan")


# One pass over the rows. Previously both helpers ran on every row and most of the
# results were discarded; zero-length refs were also queried with end < start.
df["fetched_nucleotides"] = [
    _fetch_reference_bases(chrom, pos, ref_len)
    for chrom, pos, ref_len in zip(df["chr"], df["pos"], df["ref_len"])
]

In [None]:
# Flag, per variant, whether the fetched genome sequence equals the 'ref' allele,
# then show how many rows match / do not match
df["is_nucleotides_same"] = df["fetched_nucleotides"].eq(df["ref"])
df["is_nucleotides_same"].value_counts()


In [None]:
# Reference span: starts at 'pos'; 'ref_end' is the first position after the ref allele
df = df.assign(
    ref_start=df["pos"],
    ref_end=df["pos"] + df["ref_len"],
)


In [None]:
def _upstream_flank(row):
    """Fetch positions ref_start-30 .. ref_start-1 on the row's chromosome."""
    return get_nucleotides_in_interval(row["chr"], row["ref_start"] - 30, row["ref_start"] - 1)


def _downstream_flank(row):
    """Fetch positions ref_end .. ref_end+29 on the row's chromosome."""
    return get_nucleotides_in_interval(row["chr"], row["ref_end"], row["ref_end"] + 29)


# Flanking context on either side of the reference allele
df["upstream_sequence"] = df.apply(_upstream_flank, axis=1)
df["downstream_sequence"] = df.apply(_downstream_flank, axis=1)

In [None]:
left_flank = df["upstream_sequence"]
right_flank = df["downstream_sequence"]

# Wild-type sequence: upstream flank + reference allele + downstream flank
df["sequence"] = left_flank + df["ref"] + right_flank

# Mutated sequence: same flanks around the alternate allele
df["mutated_sequence"] = left_flank + df["alt"] + right_flank

In [None]:
# Run the two annotation helpers from scripts.utils in sequence, both with grch=37.
# Presumably they add 'is_mirna' and 'transcript_id' / gene-name columns (later cells
# read 'is_mirna' and 'transcript_id') — confirm against scripts.utils.
df_with_mirna_flag = generate_is_mirna_column(df, grch=37)
df = generate_transcript_id_and_gene_name_columns(df_with_mirna_flag, grch=37)


In [None]:
# Preview the first rows of df (DataFrame.head defaults to five rows)
df.head()

In [None]:
# Preview the rows whose 'is_mirna' flag equals 1
df.loc[df["is_mirna"] == 1].head()

In [None]:
def get_biotype(coord):
    """Look up the biotype of the first transcript overlapping a locus.

    ``coord`` is unpacked into ``grch37.transcripts_at_locus`` (here it is a
    ``[chr, pos]`` pair). Returns ``(tuple(coord), biotype)``, where biotype is
    ``"not_found"`` when no transcript is returned for the locus.
    """
    hits = grch37.transcripts_at_locus(*coord)
    label = hits[0].biotype if hits else "not_found"
    return tuple(coord), label
    
# Genome annotation handle used by get_biotype (read as a module-level global there).
grch37 = import_pyensembl(grch=37)

# Query each distinct (chr, pos) locus only once; repeated loci give the same answer.
coords = df[["chr", "pos"]].drop_duplicates().values.tolist()

# Lookups are spread over worker processes.
# NOTE(review): get_biotype is defined in the notebook itself; worker processes
# started with the "spawn" method may be unable to import it — confirm on the target platform.
with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(get_biotype, coords)
    # get_biotype returns ((chr, pos), biotype) pairs, so they collect directly into a dict.
    biotypes = dict(results)

# Map every row back to its locus' biotype without the per-row overhead of iterrows().
df["biotype"] = [biotypes.get(locus, "") for locus in zip(df["chr"], df["pos"])]


In [None]:
# Number of variants per biotype
df["biotype"].value_counts()

In [None]:
# Among variants whose biotype is "miRNA", count the values of the 'is_mirna' flag
df.loc[df["biotype"] == "miRNA", "is_mirna"].value_counts()

In [None]:
# Preview the first rows of df again, now that the 'biotype' column exists
df.head()

In [None]:
# Transcript ids of variants whose biotype is "miRNA" although 'is_mirna' is 0
biotype_only_mirna = (df["biotype"] == "miRNA") & (df["is_mirna"] == 0)
df.loc[biotype_only_mirna, "transcript_id"].values

In [None]:
# Split by the 'is_mirna' flag: case 1 = flag 0, case 2 = flag 1
case_1 = df.loc[df["is_mirna"] == 0]
case_2 = df.loc[df["is_mirna"] == 1]

# Write each subset to its own CSV, without the index column
for subset, out_path in (
    (case_1, "data/case_1_processed.csv"),
    (case_2, "data/case_2_processed.csv"),
):
    subset.to_csv(out_path, index=False)