In [1]:
import pandas as pd
pd.set_option('display.max_columns', 500)
import os
from pyensembl import EnsemblRelease
import concurrent.futures

# set pyensembl cache location
os.environ['PYENSEMBL_CACHE_DIR'] = "../data"

# init database
grch37 = EnsemblRelease(75)

# download and index db
grch37.download()
grch37.index()

INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.cdna.all.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.ncrna.fa.gz.pickle
INFO:pyensembl.sequence_data:Loaded sequence dictionary from /run/media/nazif/2F946E411BA61D49/data/pyensembl/GRCh37/ensembl75/Homo_sapiens.GRCh37.75.pep.all.fa.gz.pickle


In [2]:
relevant_cols = ["id", "chr", "start_coordinate", "end_coordinate", "mirna_accession", "is_mutated", "prediction", "binary_prediction", "pred_difference", "pred_difference_binary"]
df = pd.read_csv("results/sana_results_0_1500_with_prediction_only_meaningful_results.csv", low_memory=False, usecols=relevant_cols)
len(df) / df.id.nunique()

2.0

### explanation
there are 2 entries for every id. One for wild type and one for mutated. Which is encoded in "is_mutated" column.


In [3]:
# /2 is there because there are 2 entries for every id.
df.pred_difference_binary.value_counts() / 2

pred_difference_binary
-1    116868.0
 1    106160.0
Name: count, dtype: float64

# pyensembl

In [4]:
def get_transcript_id(coord):
    if transcript_id := grch37.transcript_ids_at_locus(*coord):
        return tuple(coord), transcript_id
    else:
        return tuple(coord), "not_found"



coords = df[['chr', 'start_coordinate']].values.tolist()

with concurrent.futures.ProcessPoolExecutor() as executor:
    results = executor.map(get_transcript_id, coords)

ensts = dict(results)
df["ENST"] = [ensts.get((row["chr"], row["start_coordinate"]), "") for _, row in df.iterrows()]


In [5]:
print(f"{len(df[df.ENST == 'not_found']) / len(df):.3%} of the coordinates have no transcripts")

26.131% of the coordinates have no transcripts


In [6]:
df[df.is_mutated == 0]

Unnamed: 0,mirna_accession,is_mutated,id,chr,start_coordinate,end_coordinate,prediction,binary_prediction,pred_difference,pred_difference_binary,ENST
0,MIMAT0000070,0,10_102293465_C_T_MIMAT0000070,10,102293455,102293471,0.117245,0,0.532147,1,[ENST00000533589]
2,MIMAT0000075,0,10_102293465_C_T_MIMAT0000075,10,102293462,102293484,0.452544,0,0.418784,1,[ENST00000533589]
4,MIMAT0000077,0,10_102293465_C_T_MIMAT0000077,10,102293483,102293494,0.492932,0,0.091038,1,[ENST00000533589]
6,MIMAT0000078,0,10_102293465_C_T_MIMAT0000078,10,102293452,102293464,0.483818,0,0.161300,1,[ENST00000533589]
8,MIMAT0000088,0,10_102293465_C_T_MIMAT0000088,10,102293460,102293482,0.488911,0,0.066889,1,[ENST00000533589]
...,...,...,...,...,...,...,...,...,...,...,...
446046,MIMAT0049016,0,X_88181906_A_G_MIMAT0049016,X,88181900,88181933,0.021817,0,0.838433,1,not_found
446048,MIMAT0049020,0,X_88181906_A_G_MIMAT0049020,X,88181919,88181932,0.325099,0,0.575925,1,not_found
446050,MIMAT0049021,0,X_88181906_A_G_MIMAT0049021,X,88181900,88181932,0.719502,1,-0.251468,-1,not_found
446052,MIMAT0049023,0,X_88181906_A_G_MIMAT0049023,X,88181922,88181933,0.079222,0,0.495642,1,not_found


In [7]:
cols = ["id", "mirna_accession", "ENST", "is_mutated", "pred_difference_binary"]

lite_df = df[cols]
lite_df.head()


Unnamed: 0,id,mirna_accession,ENST,is_mutated,pred_difference_binary
0,10_102293465_C_T_MIMAT0000070,MIMAT0000070,[ENST00000533589],0,1
1,10_102293465_C_T_MIMAT0000070,MIMAT0000070,[ENST00000533589],1,1
2,10_102293465_C_T_MIMAT0000075,MIMAT0000075,[ENST00000533589],0,1
3,10_102293465_C_T_MIMAT0000075,MIMAT0000075,[ENST00000533589],1,1
4,10_102293465_C_T_MIMAT0000077,MIMAT0000077,[ENST00000533589],0,1


In [8]:
gain_df = lite_df[(lite_df.is_mutated == 0) & (lite_df.pred_difference_binary == 1)] 
loss_df = lite_df[(lite_df.is_mutated == 0) & (lite_df.pred_difference_binary == -1)] 

gain_df.drop(columns=["is_mutated", "pred_difference_binary"], inplace=True)
loss_df.drop(columns=["is_mutated", "pred_difference_binary"], inplace=True)

# remove the last 13 characters (MIMAT0000070 like identifiers)
gain_df['id'] = gain_df['id'].str[:-13]
loss_df['id'] = loss_df['id'].str[:-13]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gain_df.drop(columns=["is_mutated", "pred_difference_binary"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  loss_df.drop(columns=["is_mutated", "pred_difference_binary"], inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gain_df['id'] = gain_df['id'].str[:-13]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

S

In [10]:
gain_df.to_csv("results/gain_pairs.csv", index=False)

In [11]:
loss_df.to_csv("results/loss_pairs.csv", index=False)
