TO DO:
1.  Access to the number of transposons (Insertions) in the upstream and downstream 3KB regions of every gene.
    - Count how many insertions are 300bp after the End location and before it.
2.  Compute the average transposon density per library.
    - Compute the total number of transposons in the genome over the length of the genome.
3.  Compare if the number of transposons in the upstream and downstream 3KB regions of every gene is less than the average transposon density of the library * 3KB , then discard the gene for analysis of essentiality.
    - if the number of insertions of step1 < average transposon density * 3KB , discard the gene for analysis of essentiality.

- import dataset : postproccessed_data_all_backgrounds.xlsx
- compute transposon density : total bp/total insertions
- compute number of insertions in the upstream and downstream 3KB regions of every gene , Feature_type=Gene; Dubious

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os,sys
from collections import defaultdict


In [2]:
data_all=pd.read_excel('../postprocessed-data/postprocessed_data_all_backgrounds.xlsx',engine="openpyxl")

In [3]:
## Refine data 
data_all.index=np.arange(0,len(data_all))
data_all.drop(columns=['Unnamed: 1'],inplace=True)
data_all.fillna(0,inplace=True)
data_all.rename(columns={'Unnamed: 0':'background'},inplace=True)

keys=['dbem3_b',
 'dnrp1_b',
 'bem1-aid_a',
 'dnrp1_a',
 'dbem1dbem3_b',
 'wt_merged',
 'dbem1dbem3_a',
 'bem1-aid-dbem3_a',
 'bem1-aid-dbem3_b',
 'wt_b',
 'wt_a',
 'dnrp1_merged',
 'bem1-aid_b',
 'dbem3_merged',
 'dbem3_a']

In [4]:
indexes_back=[] # indexes for the start of each background

for i in keys:
    indexes_back.append(np.where(data_all.loc[:,"background"]==i)[0])


In [5]:
# Filling the name of the background according to the index
for k in np.arange(0,len(indexes_back)-1):
    
    data_all.loc[np.arange(indexes_back[k][0],indexes_back[k+1][0]),"background"]=keys[k]

data_all.loc[np.arange(indexes_back[-1][0],len(data_all)),"background"]=keys[-1] # for the last key

In [6]:
# Check
data_all.groupby(['background']).size()

background
bem1-aid-dbem3_a    12760
bem1-aid-dbem3_b    12760
bem1-aid_a          12745
bem1-aid_b          12760
dbem1dbem3_a        12760
dbem1dbem3_b        12760
dbem3_a             12760
dbem3_b             12760
dbem3_merged        12760
dnrp1_a             12745
dnrp1_b             12760
dnrp1_merged        12760
wt_a                12760
wt_b                12760
wt_merged           12745
dtype: int64

In [7]:
# Compute average transposon density per background 
data_all_processed=data_all.copy()
for key in keys:
    sum_tr=data_all_processed[data_all_processed.loc[:,"background"]==key]["Ninsertions"].sum()
    genome=data_all_processed[data_all_processed.loc[:,"background"]==key]["Nbasepairs"].sum()
    data_all_processed.loc[data_all_processed.loc[:,"background"]==key,"transposon_density"]=sum_tr/genome

In [8]:
# Threshold for the number of transposons in 3kB = 3000bp according the transposon density per background

threshold_tr=defaultdict(dict)
windows_size=3000
for key in keys:
    threshold_tr[key]["threshold"]=np.round(data_all_processed[data_all_processed.loc[:,"background"]==key]["transposon_density"].unique()[0]*windows_size,decimals=0)

In [9]:
threshold_tr_pd=pd.DataFrame.from_dict(threshold_tr,orient='index')
threshold_tr_pd

Unnamed: 0,threshold
bem1-aid-dbem3_a,82.0
bem1-aid-dbem3_b,122.0
bem1-aid_a,30.0
bem1-aid_b,61.0
dbem1dbem3_a,123.0
dbem1dbem3_b,116.0
dbem3_a,148.0
dbem3_b,159.0
dbem3_merged,248.0
dnrp1_a,115.0


In [10]:
len(data_all_processed[(data_all_processed.loc[:,"background"]== keys[1]) & (data_all_processed.loc[:,"Feature_type"]== "Gene; Dubious") | (data_all_processed.loc[:,"Feature_type"]== "Gene; Verified")])

75152

In [11]:
## Importing pergene files , to know the name of all genes to analyze 

pergene_files=[]
#data_dir= "../satay/data_files/data_unmerged/"
#data_dir="../transposonmapper/data_files/files4test/"
data_dir="../postprocessed-data/"
#data_dir="../transposonmapper/data_files/"
for root, dirs, files in os.walk(data_dir):
    for file in files:
        if file.endswith("pergene_insertions.xlsx"):
            pergene_files.append(os.path.join(root, file))

list_data=[]
for i in pergene_files:
    list_data.append(pd.read_excel(i,engine='openpyxl',index_col="Unnamed: 0"))

keys=[]
for i in np.arange(0,len(pergene_files)):
    keys.append(pergene_files[i].split("/")[-1].split("_")[0]+"_"+pergene_files[i].split("/")[-1].split("_")[1])

list_data_pd=pd.concat(list_data,axis=0,keys=keys)

In [12]:
list_data_pd.head(2)


Unnamed: 0,Unnamed: 1,Gene name,Chromosome,Start location,End location,Insertion locations,Reads per insertion location,Reads,Insertions
dbem3_b,0,YAL069W,I,335,649,"[336, 340, 349, 372, 377, 382, 386, 386, 387, ...","[3, 327, 87, 1, 3, 12, 39, 1, 43, 6, 3, 45, 1,...",2353,55
dbem3_b,1,YAL068W-A,I,538,792,"[559, 561, 564, 565, 569, 570, 570, 571, 573, ...","[6, 1, 2, 2, 147, 9, 154, 17, 177, 1, 132, 5, ...",694,25


## Procedure :

- Take from the processed data only the elements that fall under the category of 
"Feature_type=Gene; Dubious" or "Feature_type=Gene; Verified" or "Feature_type=Gene; Uncharacterized" or "Feature_type=Gene; Verified|silenced_gene"

In [13]:
genes_names=data_all_processed[data_all_processed.loc[:,"Feature_type"]=="Gene; Verified"]["Standard_name"].unique()
np.append(genes_names,data_all_processed[data_all_processed.loc[:,"Feature_type"]=="Gene; Dubious"]["Standard_name"].unique())
np.append(genes_names,data_all_processed[data_all_processed.loc[:,"Feature_type"]=="Gene; Uncharacterized"]["Standard_name"].unique())
np.append(genes_names,data_all_processed[data_all_processed.loc[:,"Feature_type"]=="Gene; Verified|silenced_gene"]["Standard_name"].unique())

array(['PAU8', 'SEO1', 'FLO9', ..., 'HMLALPHA1', 'HMRA2', 'HMRA1'],
      dtype=object)

In [14]:
data_all_processed.index=data_all_processed.loc[:,"Standard_name"]

In [100]:
## Steps:
# 1. Convert all positions to floats
# 2. Find all the positions that fall such that the start and the end are within 300bp more than 
#    the end of the gene (downstream search)
# 3. Find all the positions that fall such that the start and the end are within 300bp less than
#    the start of the gene (upstream search)

# Convert all positions to float
from from_excel_to_list import from_excel_to_list
n=0
background=keys[n]
data=data_all_processed[data_all_processed.loc[:,"background"]==background]

positions_float=defaultdict(dict)
for i in data.index:
    if type(data.loc[i,"Position"])==pd.core.series.Series:
        tmp=[]
        for j in np.arange(0,len(data.loc[i,"Position"])):
            tmp.append(from_excel_to_list(data.loc[i,"Position"][j]))
            positions_float[i]["Positions_float"]=tmp
    else:
        positions_float[i]["Positions_float"]=(from_excel_to_list(data.loc[i,"Position"]))
    


In [86]:
positions_float_pd=pd.DataFrame.from_dict(positions_float,orient='index')
positions_float_pd.head(2)

Unnamed: 0,Positions_float
AAC1,"[387316.0, 388245.0]"
AAC3,"[415984.0, 416907.0]"


In [87]:
#2. Find all the positions that fall such that the start and the end are within 300bp more than 
#    the end of the gene (downstream search)

pos_target=0
target=positions_float_pd.loc[genes_names[pos_target],"Positions_float"]

for i in genes_names:
    tmp=positions_float_pd.loc[i,"Positions_float"]
    if tmp[1]<target[1]+300 :
        positions_float_pd.loc[0,"Genes downstream"]=i

In [98]:
positions_float_pd.loc[:,"Positions_float"][0]

<function list.index(value, start=0, stop=9223372036854775807, /)>