Author: Ronny F. Pacheco Date: Sept 2024
Copyright: © 2024 Ronny Pacheco License: MIT License

---

MIT License

Copyright (c) 2024 Ronny Pacheco

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Save data

In [34]:
# =============================================================================
# Save the variables
# =============================================================================
# variables_dict = {
#     "sider_df" : sider_df,  # Data Frame with SIDER data
#     "gtf_df" : gtf_df,  # Data Frame with GTF data
# }

# =============================================================================
# main function
# =============================================================================
# import os
# import pickle

# def data_save_load(option, dict_variables=None):
#     """
#     This function is used to save or load data for the jupyter notebook
#     """
#     path_folder = "ipynb_db"  # Folder to save variables
#     os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
#     path_file = os.path.join(path_folder, "variables.pkl") # Path to save the variables

#     if option == "save":
#         with open(path_file, "wb") as f:
#             pickle.dump(dict_variables, f)
#     elif option == "load":
#         with open(path_file, "rb") as f:
#             variables = pickle.load(f)
#         # Now load the variables
#         for key, value in variables.items():
#             variables[key] = value

# =============================================================================
# Call the function
# =============================================================================
# data_save_load(option="save",
#                dict_variables=variables_dict)

# Code

## 1. Load data

In [35]:
import pandas as pd # type: ignore
import numpy as np # type: ignore
import json

### 1.1. SIDER data

In [36]:
sider_df = pd.read_csv("data/SIDERs.gff", sep="\t", header=None)
print(sider_df.shape)
print(sider_df.dtypes)
sider_df.head()

(2117, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,SIDER,1,173,.,.,.,ID=sre_c01.10
1,LinJ.01,CBM,SIDER,24093,24758,.,.,.,ID=sre_c01.20A
2,LinJ.01,CBM,SIDER,35371,35956,.,.,.,ID=sre_c01.30A
3,LinJ.01,CBM,SIDER,39790,40595,.,.,.,ID=sre_c01.40
4,LinJ.01,CBM,SIDER,54983,55547,.,.,.,ID=sre_c01.50A


In the `sider_df` I only need the column 0, 3, 4, 6 and 8 columns

In [37]:
# Extracting from `sider_df`the needed columns [0, 3, 4, 6, 8]
sider_df = sider_df[[0, 3, 4, 6, 8]]
sider_df.columns = ["chrom", "start", "end", "strand", "sider_name"]
print(sider_df.shape)
print(sider_df.dtypes)
sider_df.head()

(2117, 5)
chrom         object
start          int64
end            int64
strand        object
sider_name    object
dtype: object


Unnamed: 0,chrom,start,end,strand,sider_name
0,LinJ.01,1,173,.,ID=sre_c01.10
1,LinJ.01,24093,24758,.,ID=sre_c01.20A
2,LinJ.01,35371,35956,.,ID=sre_c01.30A
3,LinJ.01,39790,40595,.,ID=sre_c01.40
4,LinJ.01,54983,55547,.,ID=sre_c01.50A


Now we need to get the **sider_name** without the 'ID=' element

In [38]:
# Extracting the sider_name from the column `sider_name`
sider_df["sider_name"] = sider_df["sider_name"].str.extract(r'ID=(.+)')
sider_df.head()

Unnamed: 0,chrom,start,end,strand,sider_name
0,LinJ.01,1,173,.,sre_c01.10
1,LinJ.01,24093,24758,.,sre_c01.20A
2,LinJ.01,35371,35956,.,sre_c01.30A
3,LinJ.01,39790,40595,.,sre_c01.40
4,LinJ.01,54983,55547,.,sre_c01.50A


### 1.2 GTF data
This one wil be **harder** to prepare

In [39]:
# Load data
gtf_df = pd.read_csv("./data/20240703111001_LINF-Tabla_maestra_v3-20244_RP_v0.8.gtf", sep="\t", header=None)
print(gtf_df.shape)
print(gtf_df.dtypes)
gtf_df.head()

(45179, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,gene,1520,5066,.,-,.,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,CBM,transcript,1520,5066,.,-,.,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CBM,CDS,3710,4711,.,-,.,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,CBM,3utr,1520,3709,.,-,.,"parent_id ""LINF_01T0005000""; notes ""Protein_of..."
4,LinJ.01,CBM,5utr,4712,5066,.,-,.,"parent_id ""LINF_01T0005000""; notes ""Protein_of..."


From `gtf_df`I only need columns 0, 2, 3, 4, 6 and 8

In [40]:
# Get from `gtf_df` the needed columns [0, 3, 4, 6, 8]
gtf_df = gtf_df[[0, 2, 3, 4, 6, 8]]
gtf_df.columns = ["chrom", "feature", "start", "end", "strand", "attributes"]
print(gtf_df.shape)
print(gtf_df.dtypes)
gtf_df.head()

(45179, 6)
chrom         object
feature       object
start          int64
end            int64
strand        object
attributes    object
dtype: object


Unnamed: 0,chrom,feature,start,end,strand,attributes
0,LinJ.01,gene,1520,5066,-,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,transcript,1520,5066,-,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CDS,3710,4711,-,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,3utr,1520,3709,-,"parent_id ""LINF_01T0005000""; notes ""Protein_of..."
4,LinJ.01,5utr,4712,5066,-,"parent_id ""LINF_01T0005000""; notes ""Protein_of..."


Now the field `attributes` it's separated by ";" and the header its in a format like `header "data"`. We are going to transform the "attributes" column in multiple columns

#### 1.2.1 Transforming colums

Get first all the elements that appear in the attributes columns

In [41]:
# Let's count first the number of elements in the `attributes` column
atr_dict = {}
for index, row in gtf_df.iterrows():
    # print(index, ":", sep="")
    for atr in row["attributes"].split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if len(atr.strip()) == 0:  # Skip empty attribute ""
            continue
        # print(f"\t{'-'*50}")
        # print(f"\tatribute: {atr.strip()}")  
        key = atr.split(" ")[0] 
        if key not in atr_dict:
            atr_dict[key] = 1

        else:
            atr_dict[key] += 1
        # print(f"\t{atr_dict}")
print(atr_dict)

{'gene_id': 9861, 'gene_name': 9861, 'biotype': 17106, 'notes': 42974, 'parent_id': 35318, 'transcript_id': 18215, 'transcript_name': 9660, 'pseudogen': 49}


In [42]:
# get a list with the keys of atr_dict
atr_keys = list(atr_dict.keys())
print(atr_keys)

['gene_id', 'gene_name', 'biotype', 'notes', 'parent_id', 'transcript_id', 'transcript_name', 'pseudogen']


Now we'll have a list with all the elements. When indexing each row in the next steps, we can check if one of this items appear, and if not, we can add a "None" value to the attribute

In [43]:
# Now that we have the attributes count, let's create a dict for each element in "test_df" with the attributes separated
new_col_df = []
for index, row in gtf_df.iterrows():
    # print(index, ":", sep="")
    pre_data = []
    for atr in row["attributes"].split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if len(atr.strip()) == 0:  # Skip empty attribute ""
            continue
        key = atr.split(" ")[0]
        value = atr.split(" ")[1].replace('"', "")
        pre_data.append({key: value})
    
    for elem in atr_keys: # type: ignore  # Checking if the elements from atr_keys
        if elem not in [list(elem.keys())[0] for elem in pre_data]:  # If the element is not in pre_data, add it with value None
            pre_data.append({elem: None})

    flattenend_data = {key: value for sublist in pre_data for key, value in sublist.items()}
    new_col_df.append(flattenend_data)

In [44]:
# Checking how it worked
new_col_df  

[{'gene_id': 'LINF_010005000',
  'gene_name': 'LINF_010005000',
  'biotype': 'protein_coding',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'parent_id': None,
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_010005000',
  'transcript_id': 'LINF_01T0005000',
  'transcript_name': 'LINF_01T0005000',
  'biotype': 'protein_coding',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'gene_id': None,
  'gene_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'transcript_id': 'LINF_01T0005000',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'gene_id': None,
  'gene_name': None,
  'biotype': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'gene_id': None,
  'gene_name': None,
  'biotype': None,
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'notes': 'Prote

In [45]:
# Transforming the list of dicts into a DataFrame
new_col_df = pd.DataFrame(new_col_df)
new_col_df

Unnamed: 0,gene_id,gene_name,biotype,notes,parent_id,transcript_id,transcript_name,pseudogen
0,LINF_010005000,LINF_010005000,protein_coding,Protein_of_unknown_function_(DUF2946),,,,
1,,,protein_coding,Protein_of_unknown_function_(DUF2946),LINF_010005000,LINF_01T0005000,LINF_01T0005000,
2,,,,Protein_of_unknown_function_(DUF2946),LINF_01T0005000,LINF_01T0005000,,
3,,,,Protein_of_unknown_function_(DUF2946),LINF_01T0005000,,,
4,,,,Protein_of_unknown_function_(DUF2946),LINF_01T0005000,,,
...,...,...,...,...,...,...,...,...
45174,,,,Hypothetical_protein_-_conserved,LINF_36T0082400,LINF_36T0082400,,
45175,,,,Hypothetical_protein_-_conserved,LINF_36T0082400,,,
45176,,,,Hypothetical_protein_-_conserved,LINF_36T0082400,,,
45177,LINF_360082500,LINF_360082500,,,,,,


In [46]:
# Le'ts reorder the columns
new_col_df = new_col_df[["gene_id", "gene_name", "transcript_id", "transcript_name", "biotype", "parent_id", "pseudogen", "notes"]]
new_col_df

Unnamed: 0,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946)
1,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946)
2,,,LINF_01T0005000,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
3,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
4,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
...,...,...,...,...,...,...,...,...
45174,,,LINF_36T0082400,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45175,,,,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45176,,,,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45177,LINF_360082500,LINF_360082500,,,,,,


In [47]:
# Concatenating the new DataFrame with the original `gtf_df` and dropping the `attributes` column
gtf_df = pd.concat([gtf_df, new_col_df], axis=1)
gtf_df.drop(columns="attributes", inplace=True)
gtf_df

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LinJ.01,gene,1520,5066,-,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946)
1,LinJ.01,transcript,1520,5066,-,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946)
2,LinJ.01,CDS,3710,4711,-,,,LINF_01T0005000,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
3,LinJ.01,3utr,1520,3709,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
4,LinJ.01,5utr,4712,5066,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45174,LinJ.36,CDS,2739458,2740183,-,,,LINF_36T0082400,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45175,LinJ.36,3utr,2738595,2739457,-,,,,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45176,LinJ.36,5utr,2740184,2740374,-,,,,,,LINF_36T0082400,,Hypothetical_protein_-_conserved
45177,LinJ.36,gene,2740760,2742268,-,LINF_360082500,LINF_360082500,,,,,,


## 2. Compare coordinates

In this next part we are going to check for coordinates. To search which elements in the **sider_df** is inside which element in the **gtf_df**.

### 2.1 Fail proof the data

In [48]:
sider_df_test = sider_df.copy()
gtf_df_test = gtf_df.copy()

In [49]:
# Check the number of elements 
gtf_df_test.shape

(45179, 13)

In [50]:
# Elements where the start is minor than the end
(gtf_df_test['start'] < gtf_df_test['end']).sum()

np.int64(45177)

In [51]:
# Elements where the start is major than the end
(gtf_df_test['start'] > gtf_df_test['end']).sum()

np.int64(0)

In [52]:
# CHeck the feature elements
condition = (gtf_df_test['start'] < gtf_df_test['end'])
gtf_df_test[condition]['feature'].value_counts()

feature
gene          9861
transcript    9660
CDS           8555
3utr          8554
5utr          8547
Name: count, dtype: int64

Interesting, there should be the same numbers of 5utr as 3utr

In [53]:
# Checking without condition
gtf_df_test['feature'].value_counts()

feature
gene          9861
transcript    9660
CDS           8555
3utr          8554
5utr          8549
Name: count, dtype: int64

In [54]:
# Checking where those elements where there is not a 3utr or 5utr
parent_feature_dict = gtf_df_test.groupby('parent_id')['feature'].apply(list).to_dict()
filtered_dict = {k: v for k, v in parent_feature_dict.items() if v not in (['transcript'], 
                                                                           ['CDS'], 
                                                                           ['CDS', '5utr', '3utr'], 
                                                                           ['CDS', '3utr', '5utr'],
                                                                           ['CDS', '3utr', '5utr', 'CDS', '3utr', '5utr'],
                                                                           ['transcript', 'transcript'],
                                                                           ['CDS', '5utr', '3utr', 'CDS', '5utr', '3utr'])}
filtered_dict

{'LINF_27T0013600': ['CDS', '5utr'],
 'LINF_30T0006850': ['CDS', '3utr'],
 'LINF_31T0037100': ['CDS', '3utr'],
 'LINF_31T0039200': ['CDS', '3utr'],
 'LINF_36T0017400': ['CDS', '3utr'],
 'LINF_36T0036000': ['CDS', '3utr'],
 'LINF_36T0071100': ['CDS', '3utr']}

<span style="color:red">These are the elements without a 3utr or 5utr</span>

Should be careful with LINF_270013600

In [55]:
gtf_df[
    ((gtf_df[['gene_id', 'transcript_id', 'parent_id']].isin(filtered_dict.keys()).any(axis=1)) | (gtf_df['gene_id'].isin([elem.replace("T","") for elem in list(filtered_dict.keys())])))
                            ]

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
23035,LinJ.27,gene,327990,328645,+,LINF_270013600,LINF_270013600,,,protein_coding,,unknown,Stress_responsive_A/B_Barrel_domain-containing...
23036,LinJ.27,transcript,327990,328645,+,,,LINF_27T0013600,LINF_27T0013600,protein_coding,LINF_270013600,,Stress_responsive_A/B_Barrel_domain-containing...
23037,LinJ.27,CDS,328114,328645,+,,,LINF_27T0013600,,,LINF_27T0013600,,Stress_responsive_A/B_Barrel_domain-containing...
23038,LinJ.27,5utr,327990,328113,+,,,,,,LINF_27T0013600,,Stress_responsive_A/B_Barrel_domain-containing...
27672,LinJ.30,gene,56144,57262,-,LINF_300006850,LINF_300006850,,,protein_coding,,unknown,polynucleotide_kinase_3'-phosphatase-_putative...
27673,LinJ.30,transcript,56144,57262,-,,,LINF_30T0006850,LINF_30T0006850,protein_coding,LINF_300006850,,polynucleotide_kinase_3'-phosphatase-_putative...
27674,LinJ.30,CDS,56787,57262,-,,,LINF_30T0006850,,,LINF_30T0006850,,polynucleotide_kinase_3'-phosphatase-_putative...
27675,LinJ.30,3utr,56144,56786,-,,,,,,LINF_30T0006850,,polynucleotide_kinase_3'-phosphatase-_putative...
31361,LinJ.31,gene,1404369,1405546,-,LINF_310037100,LINF_310037100,,,protein_coding,,,Protein_of_unknown_function_-_conserved
31362,LinJ.31,transcript,1404369,1405546,-,,,LINF_31T0037100,LINF_31T0037100,protein_coding,LINF_310037100,,Protein_of_unknown_function_-_conserved


### 2.1 SIDER inside GTF elements

Let's check how is the data in the dictionary

So whe can see that inside every SIDER element that is inside each GTF element we add the information about the own SIDER element and the elements like "transcript", "gene"; and some "identifiers". Let's check again the `gtf_df`

In [56]:
# Checking gtf_df
gtf_df.head()

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LinJ.01,gene,1520,5066,-,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946)
1,LinJ.01,transcript,1520,5066,-,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946)
2,LinJ.01,CDS,3710,4711,-,,,LINF_01T0005000,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
3,LinJ.01,3utr,1520,3709,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)
4,LinJ.01,5utr,4712,5066,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946)


From `gtf_df` we need "gene_id", "transcript_id", "parent_id", "notes" and "feature". It would be great to divide "feature" into multiples columns. Let's check how many values are in feature

In [57]:
gtf_df['feature'].value_counts()

feature
gene          9861
transcript    9660
CDS           8555
3utr          8554
5utr          8549
Name: count, dtype: int64

There are 5 "features", so let's add 5 columns for each one

In [58]:
# Create the boolean columns for each category in "feature"
boolean_df = pd.get_dummies(gtf_df['feature'], prefix='', prefix_sep='').astype(bool)

gtf_df = pd.concat([gtf_df, boolean_df], axis=1)
gtf_df.head()

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes,3utr,5utr,CDS,gene,transcript
0,LinJ.01,gene,1520,5066,-,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946),False,False,False,True,False
1,LinJ.01,transcript,1520,5066,-,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946),False,False,False,False,True
2,LinJ.01,CDS,3710,4711,-,,,LINF_01T0005000,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946),False,False,True,False,False
3,LinJ.01,3utr,1520,3709,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946),True,False,False,False,False
4,LinJ.01,5utr,4712,5066,-,,,,,,LINF_01T0005000,,Protein_of_unknown_function_(DUF2946),False,True,False,False,False


In [59]:
# Let's drop the original "feature" column and reorder the columns
gtf_df.drop(columns="feature", inplace=True)
gtf_df = gtf_df[["chrom", "start", "end", "strand", "gene_id", "transcript_id", "parent_id", "gene", "transcript", "CDS", "3utr", "5utr", "pseudogen", "notes"]]
gtf_df.head()

Unnamed: 0,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes
0,LinJ.01,1520,5066,-,LINF_010005000,,,True,False,False,False,False,,Protein_of_unknown_function_(DUF2946)
1,LinJ.01,1520,5066,-,,LINF_01T0005000,LINF_010005000,False,True,False,False,False,,Protein_of_unknown_function_(DUF2946)
2,LinJ.01,3710,4711,-,,LINF_01T0005000,LINF_01T0005000,False,False,True,False,False,,Protein_of_unknown_function_(DUF2946)
3,LinJ.01,1520,3709,-,,,LINF_01T0005000,False,False,False,True,False,,Protein_of_unknown_function_(DUF2946)
4,LinJ.01,4712,5066,-,,,LINF_01T0005000,False,False,False,False,True,,Protein_of_unknown_function_(DUF2946)


Now let's reapeat the dictionary process again:

The next dictionary will have elements only COMPLETLY inside GTF elements

In [60]:
# # Create interval columns
sider_df["interval"] = pd.IntervalIndex.from_arrays(sider_df["start"], sider_df["end"], closed="both")
gtf_df["interval"] = pd.IntervalIndex.from_arrays(gtf_df["start"], gtf_df["end"], closed="both")

# initialize dict
sider_gtf_dict = {sider_name: [] for sider_name in sider_df["sider_name"].unique()}

# Find elements in sider_df that are inside gtf_df
# Find contains using boolean indexing
for i, sider_row in sider_df.iterrows():
    # Boolean mask for intervals that contain the SIDER interval
    print(f"Analyzing elem {i+1}/{sider_df.shape[0]}")


    mask = (gtf_df['chrom'] == sider_row['chrom']) & \
           (gtf_df['start'] <= sider_row['start']) & \
           (gtf_df['end'] >= sider_row['end'])
    
    contains = gtf_df[mask]
    
    for j, gtf_row in contains.iterrows():
        sider_gtf_dict[sider_row['sider_name']].append(gtf_row.to_dict())

Analyzing elem 1/2117
Analyzing elem 2/2117
Analyzing elem 3/2117
Analyzing elem 4/2117
Analyzing elem 5/2117
Analyzing elem 6/2117
Analyzing elem 7/2117
Analyzing elem 8/2117
Analyzing elem 9/2117
Analyzing elem 10/2117
Analyzing elem 11/2117
Analyzing elem 12/2117
Analyzing elem 13/2117
Analyzing elem 14/2117
Analyzing elem 15/2117
Analyzing elem 16/2117
Analyzing elem 17/2117
Analyzing elem 18/2117
Analyzing elem 19/2117
Analyzing elem 20/2117
Analyzing elem 21/2117
Analyzing elem 22/2117
Analyzing elem 23/2117
Analyzing elem 24/2117
Analyzing elem 25/2117
Analyzing elem 26/2117
Analyzing elem 27/2117
Analyzing elem 28/2117
Analyzing elem 29/2117
Analyzing elem 30/2117
Analyzing elem 31/2117
Analyzing elem 32/2117
Analyzing elem 33/2117
Analyzing elem 34/2117
Analyzing elem 35/2117
Analyzing elem 36/2117
Analyzing elem 37/2117
Analyzing elem 38/2117
Analyzing elem 39/2117
Analyzing elem 40/2117
Analyzing elem 41/2117
Analyzing elem 42/2117
Analyzing elem 43/2117
Analyzing elem 44/21

In [61]:
# Prepare a pre JSON dict to not alter the original dict
sider_gtf_relation_pre_json = sider_gtf_dict.copy()

# Let's check the first element
test = sider_gtf_relation_pre_json.get("sre_c01.20A")

# Check the data types of the values
for elem in test:
    for key, value in elem.items():
        print(key, ":", value, type(value))
    break

chrom : LinJ.01 <class 'str'>
start : 23375 <class 'int'>
end : 25667 <class 'int'>
strand : - <class 'str'>
gene_id : LINF_010005800 <class 'str'>
transcript_id : None <class 'NoneType'>
parent_id : None <class 'NoneType'>
gene : True <class 'bool'>
transcript : False <class 'bool'>
CDS : False <class 'bool'>
3utr : False <class 'bool'>
5utr : False <class 'bool'>
pseudogen : None <class 'NoneType'>
notes : mitochondrial_UMP-CMP_kinase <class 'str'>
interval : [23375, 25667] <class 'pandas._libs.interval.Interval'>


In [62]:
# Let's count the data
counter_siders_inside = 0
counter_sider_not_inside = 0
for key, value in sider_gtf_dict.items():
    print(f"{'='*50}")
    print(f"{key}:")
    if len(value) > 0:
        counter_siders_inside += 1
        for elem in value:
            print(f"\t{elem}")
    else:
        counter_sider_not_inside += 1

sre_c01.10:
sre_c01.20A:
	{'chrom': 'LinJ.01', 'start': 23375, 'end': 25667, 'strand': '-', 'gene_id': 'LINF_010005800', 'transcript_id': None, 'parent_id': None, 'gene': True, 'transcript': False, 'CDS': False, '3utr': False, '5utr': False, 'pseudogen': None, 'notes': 'mitochondrial_UMP-CMP_kinase', 'interval': Interval(23375, 25667, closed='both')}
	{'chrom': 'LinJ.01', 'start': 23375, 'end': 25667, 'strand': '-', 'gene_id': None, 'transcript_id': 'LINF_01T0005800', 'parent_id': 'LINF_010005800', 'gene': False, 'transcript': True, 'CDS': False, '3utr': False, '5utr': False, 'pseudogen': None, 'notes': 'mitochondrial_UMP-CMP_kinase', 'interval': Interval(23375, 25667, closed='both')}
	{'chrom': 'LinJ.01', 'start': 23375, 'end': 24798, 'strand': '-', 'gene_id': None, 'transcript_id': None, 'parent_id': 'LINF_01T0005800', 'gene': False, 'transcript': False, 'CDS': False, '3utr': True, '5utr': False, 'pseudogen': None, 'notes': 'mitochondrial_UMP-CMP_kinase', 'interval': Interval(23375, 

In [63]:
print(f"From the total of {sider_df.shape[0]} SIDERs, {counter_siders_inside} are inside the GTF elements and {counter_sider_not_inside} are not exactly inside the GTF elements")

From the total of 2117 SIDERs, 1685 are inside the GTF elements and 432 are not exactly inside the GTF elements


Let's get the elements in different dictionaries depending if the length of "values" is > 0 or not:

In [64]:
# Get the elements which value is > 0
sider_inside_gtf_dict = {key: value for key, value in sider_gtf_dict.items() if len(value) > 0}
print(len(sider_inside_gtf_dict))

# Get the elements which value is == 0
sider_not_inside_gtf_dict = {key: value for key, value in sider_gtf_dict.items() if len(value) == 0}
print(len(sider_not_inside_gtf_dict))

1685
432


In [65]:
sider_inside_gtf_dict

{'sre_c01.20A': [{'chrom': 'LinJ.01',
   'start': 23375,
   'end': 25667,
   'strand': '-',
   'gene_id': 'LINF_010005800',
   'transcript_id': None,
   'parent_id': None,
   'gene': True,
   'transcript': False,
   'CDS': False,
   '3utr': False,
   '5utr': False,
   'pseudogen': None,
   'notes': 'mitochondrial_UMP-CMP_kinase',
   'interval': Interval(23375, 25667, closed='both')},
  {'chrom': 'LinJ.01',
   'start': 23375,
   'end': 25667,
   'strand': '-',
   'gene_id': None,
   'transcript_id': 'LINF_01T0005800',
   'parent_id': 'LINF_010005800',
   'gene': False,
   'transcript': True,
   'CDS': False,
   '3utr': False,
   '5utr': False,
   'pseudogen': None,
   'notes': 'mitochondrial_UMP-CMP_kinase',
   'interval': Interval(23375, 25667, closed='both')},
  {'chrom': 'LinJ.01',
   'start': 23375,
   'end': 24798,
   'strand': '-',
   'gene_id': None,
   'transcript_id': None,
   'parent_id': 'LINF_01T0005800',
   'gene': False,
   'transcript': False,
   'CDS': False,
   '3utr': 

let's transform it in a data frame

In [66]:
sider_inside_gtf_list = []
for key, value in sider_inside_gtf_dict.items():
    for elem in value:
        new_record = {'sider_name': key}  # Create dict of 1 element
        new_record.update(elem)  # Update the dict with the values from elem, this way "sider_name" goes first
        sider_inside_gtf_list.append(new_record)

sider_inside_gtf_df = pd.DataFrame(sider_inside_gtf_list)

In [67]:
print(sider_inside_gtf_df.shape)
print(sider_inside_gtf_df.dtypes)
print(sider_inside_gtf_df['sider_name'].nunique())
sider_inside_gtf_df.head()

(4650, 16)
sider_name                      object
chrom                           object
start                            int64
end                              int64
strand                          object
gene_id                         object
transcript_id                   object
parent_id                       object
gene                              bool
transcript                        bool
CDS                               bool
3utr                              bool
5utr                              bool
pseudogen                       object
notes                           object
interval         interval[int64, both]
dtype: object
1685


Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
0,sre_c01.20A,LinJ.01,23375,25667,-,LINF_010005800,,,True,False,False,False,False,,mitochondrial_UMP-CMP_kinase,"[23375, 25667]"
1,sre_c01.20A,LinJ.01,23375,25667,-,,LINF_01T0005800,LINF_010005800,False,True,False,False,False,,mitochondrial_UMP-CMP_kinase,"[23375, 25667]"
2,sre_c01.20A,LinJ.01,23375,24798,-,,,LINF_01T0005800,False,False,False,True,False,,mitochondrial_UMP-CMP_kinase,"[23375, 24798]"
3,sre_c01.30A,LinJ.01,34736,37218,-,LINF_010006300,,,True,False,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
4,sre_c01.30A,LinJ.01,34736,37218,-,,LINF_01T0006300,LINF_010006300,False,True,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"


Now with that data we can analyze a lot of things

### 2.2 SIDER intergenic

Now we will check the SIDER elements that are not overlaping any GTF element using `sider_not_inside_gtf_dict`

In [68]:
# Let's check again the dict:
print(len(sider_not_inside_gtf_dict))
sider_not_inside_gtf_dict

432


{'sre_c01.10': [],
 'sre_c01.60A': [],
 'sre_c01.110': [],
 'sre_c01.130': [],
 'sre_c02.10': [],
 'sre_c02.150D': [],
 'sre_c02.160': [],
 'sre_c02.170D': [],
 'sre_c03.30': [],
 'sre_c03.50A': [],
 'sre_c03.60': [],
 'sre_c03.70': [],
 'sre_c04.10': [],
 'sre_c04.20A': [],
 'sre_c04.50A': [],
 'sre_c04.70': [],
 'sre_c04.100B': [],
 'sre_c04.110B': [],
 'sre_c04.130C': [],
 'sre_c05.10': [],
 'sre_c05.120': [],
 'sre_c05.140B': [],
 'sre_c05.150': [],
 'sre_c05.160': [],
 'sre_c06.10': [],
 'sre_c06.120B': [],
 'sre_c06.190': [],
 'sre_c06.270': [],
 'sre_c06.280': [],
 'sre_c07.10A': [],
 'sre_c07.20B': [],
 'sre_c07.50C': [],
 'sre_c07.120D': [],
 'sre_c07.190B': [],
 'sre_c07.230E': [],
 'sre_c07.240': [],
 'sre_c07.270F': [],
 'sre_c07.320A': [],
 'sre_c08.10': [],
 'sre_c08.30': [],
 'sre_c08.50A': [],
 'sre_c08.80B': [],
 'sre_c08.130D': [],
 'sre_c08.150D': [],
 'sre_c08.160D': [],
 'sre_c08.170D': [],
 'sre_c08.180D': [],
 'sre_c08.190D': [],
 'sre_c08.210E': [],
 'sre_c08.22

In [69]:
# Let's take the keys from the dict as a list:
sider_not_inside_gtf_keys_list = list(sider_not_inside_gtf_dict.keys())
print(sider_not_inside_gtf_keys_list)

['sre_c01.10', 'sre_c01.60A', 'sre_c01.110', 'sre_c01.130', 'sre_c02.10', 'sre_c02.150D', 'sre_c02.160', 'sre_c02.170D', 'sre_c03.30', 'sre_c03.50A', 'sre_c03.60', 'sre_c03.70', 'sre_c04.10', 'sre_c04.20A', 'sre_c04.50A', 'sre_c04.70', 'sre_c04.100B', 'sre_c04.110B', 'sre_c04.130C', 'sre_c05.10', 'sre_c05.120', 'sre_c05.140B', 'sre_c05.150', 'sre_c05.160', 'sre_c06.10', 'sre_c06.120B', 'sre_c06.190', 'sre_c06.270', 'sre_c06.280', 'sre_c07.10A', 'sre_c07.20B', 'sre_c07.50C', 'sre_c07.120D', 'sre_c07.190B', 'sre_c07.230E', 'sre_c07.240', 'sre_c07.270F', 'sre_c07.320A', 'sre_c08.10', 'sre_c08.30', 'sre_c08.50A', 'sre_c08.80B', 'sre_c08.130D', 'sre_c08.150D', 'sre_c08.160D', 'sre_c08.170D', 'sre_c08.180D', 'sre_c08.190D', 'sre_c08.210E', 'sre_c08.220E', 'sre_c08.230E', 'sre_c08.240E', 'sre_c08.280G', 'sre_c08.290', 'sre_c08.320F', 'sre_c08.330F', 'sre_c08.350F', 'sre_c09.10A', 'sre_c09.40D', 'sre_c09.50B', 'sre_c09.90', 'sre_c09.100', 'sre_c09.110', 'sre_c09.130C', 'sre_c09.140', 'sre_c09.

Finding overlapping elements:

In [70]:
# Call the old code but only using the keys from the list
# initialize dict
sider_gtf_dict_2 = {sider_name: [] for sider_name in sider_not_inside_gtf_keys_list}

# Find elements in sider_df that overlap with a  gtf_df
# Find overlaps using boolean indexing
for i, sider_row in sider_df.iterrows():
    # Boolean mask for intervals that overlaps the SIDER interval
    print(f"Analyzing elem {i+1}/{sider_df.shape[0]}")

    if sider_row['sider_name'] not in sider_not_inside_gtf_keys_list:
        continue

    mask = (gtf_df['chrom'] == sider_row['chrom']) & \
            (
                ((sider_row['start'] >= gtf_df['start']) & (sider_row['start'] <= gtf_df['end'])) | \
                ((sider_row['end'] >= gtf_df['start']) & (sider_row['end'] <= gtf_df['end'])) | \
                ((gtf_df['start'] >= sider_row['start']) & (gtf_df['start'] <= sider_row['end'])) | \
                ((gtf_df['end'] >= sider_row['start']) & (gtf_df['end'] <= sider_row['end']))
                )
    
    overlaps = gtf_df[mask]
    
    for j, gtf_row in overlaps.iterrows():
        sider_gtf_dict_2[sider_row['sider_name']].append(gtf_row.to_dict())

Analyzing elem 1/2117
Analyzing elem 2/2117
Analyzing elem 3/2117
Analyzing elem 4/2117
Analyzing elem 5/2117
Analyzing elem 6/2117
Analyzing elem 7/2117
Analyzing elem 8/2117
Analyzing elem 9/2117
Analyzing elem 10/2117
Analyzing elem 11/2117
Analyzing elem 12/2117
Analyzing elem 13/2117
Analyzing elem 14/2117
Analyzing elem 15/2117
Analyzing elem 16/2117
Analyzing elem 17/2117
Analyzing elem 18/2117
Analyzing elem 19/2117
Analyzing elem 20/2117
Analyzing elem 21/2117
Analyzing elem 22/2117
Analyzing elem 23/2117
Analyzing elem 24/2117
Analyzing elem 25/2117
Analyzing elem 26/2117
Analyzing elem 27/2117
Analyzing elem 28/2117
Analyzing elem 29/2117
Analyzing elem 30/2117
Analyzing elem 31/2117
Analyzing elem 32/2117
Analyzing elem 33/2117
Analyzing elem 34/2117
Analyzing elem 35/2117
Analyzing elem 36/2117
Analyzing elem 37/2117
Analyzing elem 38/2117
Analyzing elem 39/2117
Analyzing elem 40/2117
Analyzing elem 41/2117
Analyzing elem 42/2117
Analyzing elem 43/2117
Analyzing elem 44/21

In [71]:
counter_sider_overlaps = 0
counter_sider_not_overlaps = 0
for key, value in sider_gtf_dict_2.items():
    print(f"{'='*50}")
    print(f"{key}:")
    if len(value) > 0:
        counter_sider_overlaps += 1
        for elem in value:
            print(f"\t{elem}")
    else:
        counter_sider_not_overlaps += 1

sre_c01.10:
sre_c01.60A:
sre_c01.110:
	{'chrom': 'LinJ.01', 'start': 145613, 'end': 146868, 'strand': '+', 'gene_id': 'LINF_010010350', 'transcript_id': None, 'parent_id': None, 'gene': True, 'transcript': False, 'CDS': False, '3utr': False, '5utr': False, 'pseudogen': None, 'notes': None, 'interval': Interval(145613, 146868, closed='both')}
	{'chrom': 'LinJ.01', 'start': 145613, 'end': 146868, 'strand': '+', 'gene_id': None, 'transcript_id': 'LINF_01T0010350', 'parent_id': 'LINF_010010350', 'gene': False, 'transcript': True, 'CDS': False, '3utr': False, '5utr': False, 'pseudogen': None, 'notes': None, 'interval': Interval(145613, 146868, closed='both')}
sre_c01.130:
sre_c02.10:
	{'chrom': 'LinJ.02', 'start': 779, 'end': 1740, 'strand': '-', 'gene_id': 'LINF_020004900', 'transcript_id': None, 'parent_id': None, 'gene': True, 'transcript': False, 'CDS': False, '3utr': False, '5utr': False, 'pseudogen': None, 'notes': None, 'interval': Interval(779, 1740, closed='both')}
	{'chrom': 'LinJ

In [72]:
print(f"From the total of {len(sider_not_inside_gtf_keys_list)} SIDERs, {counter_sider_overlaps} are overlapping the GTF elements and {counter_sider_not_overlaps} are not overlapping the GTF elements")

From the total of 432 SIDERs, 332 are overlapping the GTF elements and 100 are not overlapping the GTF elements


Let's join the two "sider_gtf_dict" dictionaries to save them as a JSON file

In [73]:
dict_sider_full_inside = sider_gtf_dict.copy()
dict_sider_overlap = sider_gtf_dict_2.copy()

# Let's join them
for key in dict_sider_overlap.keys():
    if key in dict_sider_full_inside.keys():
        if len(dict_sider_overlap[key]) > 0:
            dict_sider_full_inside[key].extend(dict_sider_overlap[key])

In these data types there will be a problem since the JSON package can't read through *Interval* pandas data

In [74]:
# Save the data to a json file
path_gtf_sider_relation_json = "./data/sider_gtf_relation.json" # Path to save the json file

# The prolem will be the pandas Interval type and the JSON package. We need to create a custom serializer
def custom_serializer(obj):
    if isinstance(obj, pd.Interval):
        return {
            'left': int(obj.left) if isinstance(obj.left, np.integer) else obj.left,  # JSOn package can't process int64; transformint it
            'right': int(obj.right) if isinstance(obj.right, np.integer) else obj.right,  # JSOn package can't process int64; transformint it
            'closed': obj.closed
        }
    elif isinstance(obj, np.integer):  # Check for numpy integer types
        return int(obj)  # Convert to a standard Python int
    raise TypeError(f"Object of type {type(obj).__name__} is not JSON serializable")


# Save the data to a json file
with open(path_gtf_sider_relation_json, "w") as f:
    json.dump(dict_sider_full_inside, f, default=custom_serializer)

In [75]:
# Let's get the elements in different dictionaries
sider_overlaps_gtf_dict = {key: value for key, value in sider_gtf_dict_2.items() if len(value) > 0}

# And now for the INTERGENIC elements
sider_intergenic_gtf_dict = {key: value for key, value in sider_gtf_dict_2.items() if len(value) == 0}

In [76]:
list(sider_intergenic_gtf_dict.keys())

['sre_c01.10',
 'sre_c01.60A',
 'sre_c01.130',
 'sre_c03.30',
 'sre_c04.10',
 'sre_c04.70',
 'sre_c04.100B',
 'sre_c04.110B',
 'sre_c04.130C',
 'sre_c06.120B',
 'sre_c07.270F',
 'sre_c08.30',
 'sre_c08.80B',
 'sre_c08.280G',
 'sre_c08.290',
 'sre_c09.50B',
 'sre_c09.140',
 'sre_c09.310A',
 'sre_c11.400I',
 'sre_c12.190E',
 'sre_c13.160C',
 'sre_c13.320A',
 'sre_c14.10',
 'sre_c16.60B',
 'sre_c16.130B',
 'sre_c16.230D',
 'sre_c16.240D',
 'sre_c16.350F',
 'sre_c17.100B',
 'sre_c17.130B',
 'sre_c17.210C',
 'sre_c17.310B',
 'sre_c18.100C',
 'sre_c18.170E',
 'sre_c18.300',
 'sre_c18.470K',
 'sre_c19.220D',
 'sre_c19.490K',
 'sre_c20.180D',
 'sre_c20.190C',
 'sre_c22.140D',
 'sre_c22.330F',
 'sre_c22.530J',
 'sre_c22.540J',
 'sre_c23.160E',
 'sre_c23.230E',
 'sre_c24.10',
 'sre_c24.130B',
 'sre_c24.470',
 'sre_c25.320I',
 'sre_c25.370I',
 'sre_c26.10A',
 'sre_c26.130C',
 'sre_c26.140C',
 'sre_c26.270D',
 'sre_c26.310F',
 'sre_c27.240',
 'sre_c28.130A',
 'sre_c28.150B',
 'sre_c28.430H',
 'sre

### 2.3 SIDER overlapping

In [77]:
# Check the Dict
print(len(sider_overlaps_gtf_dict))
sider_overlaps_gtf_dict

332


{'sre_c01.110': [{'chrom': 'LinJ.01',
   'start': 145613,
   'end': 146868,
   'strand': '+',
   'gene_id': 'LINF_010010350',
   'transcript_id': None,
   'parent_id': None,
   'gene': True,
   'transcript': False,
   'CDS': False,
   '3utr': False,
   '5utr': False,
   'pseudogen': None,
   'notes': None,
   'interval': Interval(145613, 146868, closed='both')},
  {'chrom': 'LinJ.01',
   'start': 145613,
   'end': 146868,
   'strand': '+',
   'gene_id': None,
   'transcript_id': 'LINF_01T0010350',
   'parent_id': 'LINF_010010350',
   'gene': False,
   'transcript': True,
   'CDS': False,
   '3utr': False,
   '5utr': False,
   'pseudogen': None,
   'notes': None,
   'interval': Interval(145613, 146868, closed='both')}],
 'sre_c02.10': [{'chrom': 'LinJ.02',
   'start': 779,
   'end': 1740,
   'strand': '-',
   'gene_id': 'LINF_020004900',
   'transcript_id': None,
   'parent_id': None,
   'gene': True,
   'transcript': False,
   'CDS': False,
   '3utr': False,
   '5utr': False,
   'pseud

In [78]:
# Transform it into a DataFrame
sider_overlaps_gtf_list = []
for key, value in sider_overlaps_gtf_dict.items():
    for elem in value:
        new_record = {'sider_name': key}  # Create dict of 1 element
        new_record.update(elem)  # Update the dict with the values from elem, this way "sider_name" goes first
        sider_overlaps_gtf_list.append(new_record)

sider_overlaps_gtf_df = pd.DataFrame(sider_overlaps_gtf_list)

In [79]:
print(sider_overlaps_gtf_df.shape)
print(sider_overlaps_gtf_df.dtypes)
print(sider_overlaps_gtf_df['sider_name'].nunique())
sider_overlaps_gtf_df.head()

(1176, 16)
sider_name                      object
chrom                           object
start                            int64
end                              int64
strand                          object
gene_id                         object
transcript_id                   object
parent_id                       object
gene                              bool
transcript                        bool
CDS                               bool
3utr                              bool
5utr                              bool
pseudogen                       object
notes                           object
interval         interval[int64, both]
dtype: object
332


Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
0,sre_c01.110,LinJ.01,145613,146868,+,LINF_010010350,,,True,False,False,False,False,,,"[145613, 146868]"
1,sre_c01.110,LinJ.01,145613,146868,+,,LINF_01T0010350,LINF_010010350,False,True,False,False,False,,,"[145613, 146868]"
2,sre_c02.10,LinJ.02,779,1740,-,LINF_020004900,,,True,False,False,False,False,,,"[779, 1740]"
3,sre_c02.10,LinJ.02,779,1740,-,,LINF_02T0004900,LINF_020004900,False,True,False,False,False,,,"[779, 1740]"
4,sre_c02.150D,LinJ.02,312894,314773,+,LINF_020011300,,,True,False,False,False,False,,Arp2/3_complex_subunit_4|ARPC4,"[312894, 314773]"


#### 2.3.1 Divide in "overlapping" and "overextend" elements

In the `sider_overlaps_gtf_df` elements.There are some that touch more than one element in the GTF such as:
* 3'UTR, CDS
* 3'UTR transcript_1, 5'UTR trasnscript_2
* etc.

On the other part, there are some elements that don't touch more that one element. Instead they touch one element and overextend to an intergenic zone

In [80]:
sider_overlaps_gtf_df[sider_overlaps_gtf_df["sider_name"] == "src_c09.130C"]

Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval


In [81]:
# Let's group the elements by "sider_name"
groupy_sider_overlaps_gtf_df = sider_overlaps_gtf_df.groupby('sider_name')

In [82]:
# Create the pre-list to save the elements
true_overlaps_gtf = []
overextend_elements_gtf = []

# Iterate over the groupy object
for name, group in groupy_sider_overlaps_gtf_df:
    location = group[["gene", "transcript", "CDS", "3utr", "5utr"]].sum()
    
    # Get the elements that extend by 3'utr or 5'utr
    if (location["gene"] == 1) & (location["3utr"] == 1 or location["5utr"] == 1):
        [overextend_elements_gtf.append(elem) for i, elem in group.iterrows()]
    elif (location["gene"] == 1) & (location["3utr"] == 0 and location["5utr"] == 0):  # Elements that overextend, but there are non-coding gentes.
        [overextend_elements_gtf.append(elem) for i, elem in group.iterrows()]
    else:  # The rest will be the elements that are truly overlapping more than one GTF element
        [true_overlaps_gtf.append(elem) for i, elem in group.iterrows()]

# Let's create the DataFrames
true_overlaps_gtf_df = pd.DataFrame(true_overlaps_gtf, columns=sider_overlaps_gtf_df.columns)
overextend_elements_gtf_df = pd.DataFrame(overextend_elements_gtf, columns=sider_overlaps_gtf_df.columns)

In [83]:
print(f"True Overlaps: {true_overlaps_gtf_df.shape}"
      f"\n\tUnique SIDERs: {true_overlaps_gtf_df['sider_name'].nunique()}")
print(f"Overextend Elements: {overextend_elements_gtf_df.shape}"
      f"\n\tUnique SIDERs: {overextend_elements_gtf_df['sider_name'].nunique()}")

True Overlaps: (593, 16)
	Unique SIDERs: 110
Overextend Elements: (583, 16)
	Unique SIDERs: 222


## 3. Analyze results

Now we have 2 Data Frames and one list:
* `sider_inside_gtf_df` ==> Data frames of the 1685 SIDERs that are inside the coordinates of a GTF element.
* `sider_overlaps_gtf_df`==> Data frame of the 333 elements that overlap GTF elements.
  * `true_overlaps_gtf_df`==> 113 elements
  * `overextended elements`==> 220 elements
* `sider_intergenic_gtf_dict`==> Dictionary with the 100 INTERGENIC elements

### 3.1 sider_inside_gtf_df

In [84]:
print(sider_inside_gtf_df.shape)
print(sider_inside_gtf_df["sider_name"].nunique())
sider_inside_gtf_df.head()

(4650, 16)
1685


Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
0,sre_c01.20A,LinJ.01,23375,25667,-,LINF_010005800,,,True,False,False,False,False,,mitochondrial_UMP-CMP_kinase,"[23375, 25667]"
1,sre_c01.20A,LinJ.01,23375,25667,-,,LINF_01T0005800,LINF_010005800,False,True,False,False,False,,mitochondrial_UMP-CMP_kinase,"[23375, 25667]"
2,sre_c01.20A,LinJ.01,23375,24798,-,,,LINF_01T0005800,False,False,False,True,False,,mitochondrial_UMP-CMP_kinase,"[23375, 24798]"
3,sre_c01.30A,LinJ.01,34736,37218,-,LINF_010006300,,,True,False,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
4,sre_c01.30A,LinJ.01,34736,37218,-,,LINF_01T0006300,LINF_010006300,False,True,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"


In [85]:
group_sider_inside_gtf_df = sider_inside_gtf_df.groupby("sider_name")

In [86]:
# First let's make a Global counter of the "notes" rows
notes_counter_global = {}
for group, data in group_sider_inside_gtf_df:
    # notes_counter[group] = data["notes"].unique()
    notes = data['notes'].unique()
    for elem in notes:
        if elem != None:
            if elem not in notes_counter_global:
                notes_counter_global[elem] = 1
            else:
                notes_counter_global[elem] += 1

In [87]:
# Let's sort it
notes_counter_global_sorted = dict(sorted(notes_counter_global.items(), key=lambda x: x[1], reverse=True))  # Sorted dict
notes_counter_global_sorted_df = pd.DataFrame(notes_counter_global_sorted.items(), columns=["notes", "count"])  # Transform it into a DataFrame
notes_counter_global_sorted_df

Unnamed: 0,notes,count
0,hypothetical_protein_-_conserved,240
1,Protein_of_unknown_function_-_conserved,75
2,hypothetical_protein,47
3,protein_of_unknown_function_-_conserved,35
4,Hypothetical_protein_-_conserved,29
...,...,...
634,Snf7_family_member,1
635,signal_recognition_particle_receptor_like_protein,1
636,Glycine_cleavage_complex_(GCC)_T-protein|GCVT-...,1
637,mitoribosomal_protein_mS70,1


In [88]:
# Let's do the same but with the columns "gene", "transcript", "CDS", "3utr", "5utr"
localization_counter_global = {}
for group, data in group_sider_inside_gtf_df:
    localization = data[["gene", "transcript", "CDS", "3utr", "5utr"]].sum()

    for elem in localization.index:
        if localization[elem] > 0:
            if elem not in localization_counter_global:
                localization_counter_global[elem] = 1
            else:
                localization_counter_global[elem] += 1

In [89]:
localization_counter_global

{'gene': 1685, 'transcript': 1685, '3utr': 1238, 'CDS': 7, '5utr': 24}

#### 3.1.1. GTF elements with more than 1 SIDER element

Let's check the elements '5utr', 'CDS' or '3utr' that have  more than one SIDER element

In [90]:
# Let's check the rows in "sider_inside_gtf_df" with CDS=True, 3utr=True, 5utr=True
utr3_inside_df = sider_inside_gtf_df[sider_inside_gtf_df["3utr"] == True]
utr5_inside_df = sider_inside_gtf_df[sider_inside_gtf_df["5utr"] == True]
cds_inside_df = sider_inside_gtf_df[sider_inside_gtf_df["CDS"] == True]

print(f"3utr: {utr3_inside_df.shape[0]}")
print(f"5utr: {utr5_inside_df.shape[0]}")
print(f"CDS: {cds_inside_df.shape[0]}")

3utr: 1241
5utr: 24
CDS: 7


Check the GTF elements in 5utr, CDS and 3utr that have momre than 1 sider element

##### 3.1.1.1 3'UTR

In [91]:
# Check the GTF elements in 5utr, CDS and 3utr that have more than 1 sider element
group_utr3_inside_df = utr3_inside_df.groupby("parent_id")
group_utr5_inside_df = utr5_inside_df.groupby("parent_id")
group_cds_inside_df = cds_inside_df.groupby("parent_id")

In [92]:
# Get a dict with the elements elements and its sider 
utr3_inside_dict = {}
for group, data in group_utr3_inside_df:
    for i, row in data.iterrows():
        if group not in utr3_inside_dict:
            utr3_inside_dict[group] = [row["sider_name"]]
        else:
            utr3_inside_dict[group].append(row["sider_name"])

# Get the elements with more than 1 sider element
utr3_inside_dict_more_than_1 = {key: value for key, value in utr3_inside_dict.items() if len(value) > 1}
print(f"Elements with more than 1 sider element: {len(utr3_inside_dict_more_than_1)}")

# Get a list with the keys
utr3_inside_dict_more_than_1_keys = list(utr3_inside_dict_more_than_1.keys())

# From "gtf_df" get the "note" column where the elemens in "transcript_id" or "gene_id" are in "utr3_inside_dict_more_than_1_keys"
mask = gtf_df["transcript_id"].isin(utr3_inside_dict_more_than_1_keys) | gtf_df["gene_id"].isin(utr3_inside_dict_more_than_1_keys)
utr3_more_siders_df = gtf_df[mask]

# Group the elements by "transcript_id" for the "function"
dict_utr3_more_siders = {}
group_utr3_more_siders_df = utr3_more_siders_df.groupby("transcript_id")
for name, group in group_utr3_more_siders_df:
    notes = group["notes"].unique()
    for elem in notes:
        if elem != None:
            if elem not in dict_utr3_more_siders:
                dict_utr3_more_siders[elem] = 1
            else:
                dict_utr3_more_siders[elem] += 1

# Sort the dict
dict_utr3_more_siders_sorted = dict(sorted(dict_utr3_more_siders.items(), key=lambda x: x[1], reverse=True))
dict_utr3_more_siders_sorted_df = pd.DataFrame(dict_utr3_more_siders_sorted.items(), columns=["notes", "count"])
dict_utr3_more_siders_sorted_df

Elements with more than 1 sider element: 182


Unnamed: 0,notes,count
0,hypothetical_protein_-_conserved,31
1,Protein_of_unknown_function_-_conserved,12
2,protein_of_unknown_function_-_conserved,6
3,hypothetical_protein,6
4,Hypothetical_protein_-_conserved,5
...,...,...
114,C-4_sterol_methyl_oxidase_-_putative,1
115,elongation_factor-2_kinase-like_protein,1
116,VHS_domain_containing_protein,1
117,tartrate-sensitive_acid_phosphatase,1


##### 3.1.1.2 5'UTR

In [93]:
# Same with 5utr
utr5_inside_dict = {}
for group, data in group_utr5_inside_df:
    for i, row in data.iterrows():
        if group not in utr5_inside_dict:
            utr5_inside_dict[group] = [row["sider_name"]]
        else:
            utr5_inside_dict[group].append(row["sider_name"])

# Get the elements with more than 1 sider element
utr5_inside_dict_more_than_1 = {key: value for key, value in utr5_inside_dict.items() if len(value) > 1}
print(f"Elements with more than 1 sider element: {len(utr5_inside_dict_more_than_1)}")

# Get a list with the keys
utr5_inside_dict_more_than_1_keys = list(utr5_inside_dict_more_than_1.keys())

# From "gtf_df" get the "note" column where the elemens in "transcript_id" or "gene_id" are in "utr5_inside_dict_more_than_1_keys"
mask = gtf_df["transcript_id"].isin(utr5_inside_dict_more_than_1_keys) | gtf_df["gene_id"].isin(utr5_inside_dict_more_than_1_keys)
utr5_more_siders_df = gtf_df[mask]

# Group the elements by "transcript_id" for the "function"
dict_utr5_more_siders = {}
group_utr5_more_siders_df = utr5_more_siders_df.groupby("transcript_id")
for name, group in group_utr5_more_siders_df:
    notes = group["notes"].unique()
    for elem in notes:
        if elem != None:
            if elem not in dict_utr5_more_siders:
                dict_utr5_more_siders[elem] = 1
            else:
                dict_utr5_more_siders[elem] += 1

# Sort the dict
dict_utr5_more_siders_sorted = dict(sorted(dict_utr5_more_siders.items(), key=lambda x: x[1], reverse=True))
dict_utr5_more_siders_sorted_df = pd.DataFrame(dict_utr5_more_siders_sorted.items(), columns=["notes", "count"])
dict_utr5_more_siders_sorted_df

Elements with more than 1 sider element: 1


Unnamed: 0,notes,count
0,tuzin-like_protein,1


##### 3.1.1.3 CDS

In [94]:
# Same with CDS
cds_inside_dict = {}
for group, data in group_cds_inside_df:
    for i, row in data.iterrows():
        if group not in cds_inside_dict:
            cds_inside_dict[group] = [row["sider_name"]]
        else:
            cds_inside_dict[group].append(row["sider_name"])

# Get the elements with more than 1 sider element
cds_inside_dict_more_than_1 = {key: value for key, value in cds_inside_dict.items() if len(value) > 1}
print(f"Elements with more than 1 sider element: {len(cds_inside_dict_more_than_1)}")

# Get a list with the keys
cds_inside_dict_more_than_1_keys = list(cds_inside_dict_more_than_1.keys())

# From "gtf_df" get the "note" column where the elemens in "transcript_id" or "gene_id" are in "cds_inside_dict_more_than_1_keys"
mask = gtf_df["transcript_id"].isin(cds_inside_dict_more_than_1_keys) | gtf_df["gene_id"].isin(cds_inside_dict_more_than_1_keys)
cds_more_siders_df = gtf_df[mask]

# Group the elements by "transcript_id" for the "function"
dict_cds_more_siders = {}
group_cds_more_siders_df = cds_more_siders_df.groupby("transcript_id")
for name, group in group_cds_more_siders_df:
    notes = group.unique()
    for elem in notes:
        if elem != None:
            if elem not in dict_cds_more_siders:
                dict_cds_more_siders[elem] = 1
            else:
                dict_cds_more_siders[elem] += 1

# Sor the dict
dict_cds_more_siders_sorted = dict(sorted(dict_cds_more_siders.items(), key=lambda x: x[1], reverse=True))
dict_cds_more_siders_sorted_df = pd.DataFrame(dict_cds_more_siders_sorted.items(), columns=["notes", "count"])
dict_cds_more_siders_sorted_df

Elements with more than 1 sider element: 0


Unnamed: 0,notes,count


In [95]:
# Check all the SIDERs inside the GTF elements
cds_inside_df[["sider_name", "transcript_id", "notes"]]

Unnamed: 0,sider_name,transcript_id,notes
75,sre_c04.30,LINF_04T0005100,pteridine_transporter
154,sre_c06.70,LINF_06T0008400,pteridine_transporter_(truncated)_-_putative
375,sre_c10.20B,LINF_10T0005100,pteridine_transporter
397,sre_c10.110,LINF_10T0008900,folate/biopterin_transporter|AdoMetT1
405,sre_c10.140B,LINF_10T0009300,folate/biopterin_transporter|FT1
412,sre_c10.170,LINF_10T0009500,folate/biopterin_transporter|FT5
1252,sre_c19.300F,LINF_19T0016300,hypothetical_protein_-_conserved


#### 3.1.2 GOEA save data

In [96]:
# Create a GO term dictionary
go_sider_inside_gtf_df = []
for name, group in group_sider_inside_gtf_df:
    all_genes_id = group["gene_id"].unique()
    all_genes_id = [elem for elem in all_genes_id if pd.notnull(elem)]
    for gene_id in all_genes_id:
        go_sider_inside_gtf_df.append({"sider_name": name, "gene_name": gene_id})
    
# Create a DataFrame
go_sider_inside_gtf_df = pd.DataFrame(go_sider_inside_gtf_df)

### 3.2. Truly overlapping

In [97]:
print(true_overlaps_gtf_df.shape)
print(true_overlaps_gtf_df["sider_name"].nunique())

(593, 16)
110


In [98]:
true_overlaps_gtf_df

Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
11,sre_c02.170D,LinJ.02,340328,344214,+,LINF_020012300,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[340328, 344214]"
12,sre_c02.170D,LinJ.02,340328,344214,+,,LINF_02T0012300,LINF_020012300,False,True,False,False,False,,protein_of_unknown_function_-_conserved,"[340328, 344214]"
13,sre_c02.170D,LinJ.02,343269,344214,+,,,LINF_02T0012300,False,False,False,True,False,,protein_of_unknown_function_-_conserved,"[343269, 344214]"
14,sre_c02.170D,LinJ.02,344576,347925,+,LINF_020012400,,,True,False,False,False,False,,ATP-dependent_Clp_protease_subunit|heat_shock_...,"[344576, 347925]"
15,sre_c02.170D,LinJ.02,344576,347925,+,,LINF_02T0012400,LINF_020012400,False,True,False,False,False,,ATP-dependent_Clp_protease_subunit|heat_shock_...,"[344576, 347925]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1148,sre_c36.750T,LinJ.36,1063049,1064111,-,LINF_360033550,,,True,False,False,False,False,,,"[1063049, 1064111]"
1149,sre_c36.750T,LinJ.36,1063049,1064111,-,,LINF_36T0033550,LINF_360033550,False,True,False,False,False,,,"[1063049, 1064111]"
1150,sre_c36.750T,LinJ.36,1064112,1068685,-,LINF_360033600,,,True,False,False,False,False,,related_to_multifunctional_cyclin-dependent_ki...,"[1064112, 1068685]"
1151,sre_c36.750T,LinJ.36,1064112,1068685,-,,LINF_36T0033600,LINF_360033600,False,True,False,False,False,,related_to_multifunctional_cyclin-dependent_ki...,"[1064112, 1068685]"


In [99]:
true_overlaps_gtf_df[true_overlaps_gtf_df["sider_name"] == "src_c02.170D"]

Unnamed: 0,sider_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval


In [100]:
# Group by sider name
group_true_overlaps_gtf_df = true_overlaps_gtf_df.groupby("sider_name")

In [101]:
# Let's check the location of these elements
loc_coutner_global_truly_overlaps = {}
for group, data in group_true_overlaps_gtf_df:
    location = data[["gene", "transcript", "CDS", "3utr", "5utr"]].sum()
    for elem in location.index:
        if location[elem] > 0:
            if elem not in loc_coutner_global_truly_overlaps:
                loc_coutner_global_truly_overlaps[elem] = 1
            else:
                loc_coutner_global_truly_overlaps[elem] += 1
loc_coutner_global_truly_overlaps

{'gene': 110, 'transcript': 107, '3utr': 73, '5utr': 47, 'CDS': 14}

In [102]:
# Check functions of the elements
notes_counter_global_truly_overlaps = {}
for group, data in group_true_overlaps_gtf_df:
    notes = data['notes'].unique()
    for elem in notes:
        if elem != None:
            if elem not in notes_counter_global_truly_overlaps:
                notes_counter_global_truly_overlaps[elem] = 1
            else:
                notes_counter_global_truly_overlaps[elem] += 1

# Sort the dict
notes_counter_global_truly_overlaps_sorted = dict(sorted(notes_counter_global_truly_overlaps.items(), key=lambda x: x[1], reverse=True))
notes_counter_global_truly_overlaps_sorted_df = pd.DataFrame(notes_counter_global_truly_overlaps_sorted.items(), columns=["notes", "count"])
notes_counter_global_truly_overlaps_sorted_df

Unnamed: 0,notes,count
0,hypothetical_protein_-_conserved,17
1,Amastin_surface_glycoprotein_-_putative,13
2,5S_rRNA,6
3,hypothetical_protein,5
4,amastin-like_protein,4
...,...,...
62,Dolichyl-diphosphooligosaccharide--protein_gly...,1
63,poly(A)-specific_ribonuclease|PARN-3,1
64,Sugar_efflux_transporter_for_intercellular_exc...,1
65,translation_initiation_factor_2_subunit,1


In [103]:
 # Let's do the same but for the CDS matches one
notes_counter_global_truly_overlaps_cds = {}
for group, data in group_true_overlaps_gtf_df:
    if data["CDS"].sum() > 0:
        notes = data['notes'].unique()
        for elem in notes:
            if elem != None:
                if elem not in notes_counter_global_truly_overlaps_cds:
                    notes_counter_global_truly_overlaps_cds[elem] = 1
                else:
                    notes_counter_global_truly_overlaps_cds[elem] += 1

# Sort the dict
notes_counter_global_truly_overlaps_cds_sorted = dict(sorted(notes_counter_global_truly_overlaps_cds.items(), key=lambda x: x[1], reverse=True))
notes_counter_global_truly_overlaps_cds_sorted_df = pd.DataFrame(notes_counter_global_truly_overlaps_cds_sorted.items(), columns=["notes", "count"])
notes_counter_global_truly_overlaps_cds_sorted_df

Unnamed: 0,notes,count
0,hypothetical_protein_-_conserved,6
1,Amastin_surface_glycoprotein_-_putative,5
2,amastin-like_surface_protein_-_putative,2
3,CLN3_protein,1
4,Protein_of_unknown_function_-_conserved,1
5,HEAT_repeat-containing_protein,1
6,amastin-like_surface_protein,1
7,oligosaccharyl_transferase_subunit,1


#### 3.2.1 GOAE save data

In [104]:
# Create GO term dictionary
go_sider_true_overlaps_gtf_df = []
for name, group in group_true_overlaps_gtf_df:
    all_genes_id = group["gene_id"].unique()
    all_genes_id = [elem for elem in all_genes_id if pd.notnull(elem)]
    for gene_id in all_genes_id:
        go_sider_true_overlaps_gtf_df.append({"sider_name": name, "gene_name": gene_id})

# Create a DataFrame
go_sider_true_overlaps_gtf_df = pd.DataFrame(go_sider_true_overlaps_gtf_df)

### 3.3 Overextended elements

In [105]:
print(overextend_elements_gtf_df.shape)
print(overextend_elements_gtf_df["sider_name"].nunique())

(583, 16)
222


In [106]:
# Let's group the elements by SIDER name
group_overextend_elements_gtf_df = overextend_elements_gtf_df.groupby("sider_name")

In [107]:
# Let's check the location of these elements
loc_coutner_global_overextend = {}
for group, data in group_overextend_elements_gtf_df:
    location = data[["gene", "transcript", "CDS", "3utr", "5utr"]].sum()
    for elem in location.index:
        if location[elem] > 0:
            if elem not in loc_coutner_global_overextend:
                loc_coutner_global_overextend[elem] = 1
            else:
                loc_coutner_global_overextend[elem] += 1
loc_coutner_global_overextend

{'gene': 222, 'transcript': 221, '3utr': 125, 'CDS': 6, '5utr': 9}

In [108]:
# Check functions of the elements
notes_counter_global_overextend = {}
for group, data in group_overextend_elements_gtf_df:
    notes = data['notes'].unique()
    for elem in notes:
        if elem != None:
            if elem not in notes_counter_global_overextend:
                notes_counter_global_overextend[elem] = 1
            else:
                notes_counter_global_overextend[elem] += 1

# Sort the dict
notes_counter_global_overextend_sorted = dict(sorted(notes_counter_global_overextend.items(), key=lambda x: x[1], reverse=True))
notes_counter_global_overextend_sorted_df = pd.DataFrame(notes_counter_global_overextend_sorted.items(), columns=["notes", "count"])
notes_counter_global_overextend_sorted_df

Unnamed: 0,notes,count
0,hypothetical_protein_-_conserved,21
1,Protein_of_unknown_function_-_conserved,9
2,hypothetical_protein,7
3,amastin-like_protein,3
4,Amastin_surface_glycoprotein_-_putative,3
...,...,...
84,prolyl_oligopeptidase|POP,1
85,Memo-like_protein,1
86,Acid_phosphatase,1
87,Protein_of_unknown_function_(DUF1619)_-_putative,1


In [109]:
# Let's check the CDS only
notes_counter_global_overextend_cds = {}
for group, data in group_overextend_elements_gtf_df:
    if data["CDS"].sum() > 0:
        notes = data['notes'].unique()
        for elem in notes:
            if elem != None:
                if elem not in notes_counter_global_overextend_cds:
                    notes_counter_global_overextend_cds[elem] = 1
                else:
                    notes_counter_global_overextend_cds[elem] += 1

# Sort the dict
notes_counter_global_overextend_cds_sorted = dict(sorted(notes_counter_global_overextend_cds.items(), key=lambda x: x[1], reverse=True))
notes_counter_global_overextend_cds_sorted_df = pd.DataFrame(notes_counter_global_overextend_cds_sorted.items(), columns=["notes", "count"])
notes_counter_global_overextend_cds_sorted_df

Unnamed: 0,notes,count
0,hypothetical_protein,2
1,DNA-directed_RNA_polymerase_III_subunit_RPC3,1
2,ATG8/AUT7/APG8/PAZ2,1
3,ATG8/AUT7/APG8/PAZ2|Autophagy-related_protein,1
4,Rab-GTPase-TBC_domain_containing_protein,1


#### 3.3.1 GOEA save data

In [110]:
# Create GO term dictionary
go_sider_overextend_gtf_df = []
for name, group in group_overextend_elements_gtf_df:
    all_genes_id = group["gene_id"].unique()
    all_genes_id = [elem for elem in all_genes_id if pd.notnull(elem)]
    for gene_id in all_genes_id:
        go_sider_overextend_gtf_df.append({"sider_name": name, "gene_name": gene_id})

# Create a DataFrame
go_sider_overextend_gtf_df = pd.DataFrame(go_sider_overextend_gtf_df)

## 4. GO data

In [111]:
go_total = pd.concat([go_sider_inside_gtf_df, go_sider_true_overlaps_gtf_df, go_sider_overextend_gtf_df])
go_total.to_csv("data/go_total.csv", index=False, header=False)

In [112]:
go_total

Unnamed: 0,sider_name,gene_name
0,sre_c01.100A,LINF_010010100
1,sre_c01.120A,LINF_010011900
2,sre_c01.20A,LINF_010005800
3,sre_c01.30A,LINF_010006300
4,sre_c01.40,LINF_010006600
...,...,...
217,sre_c36.650Q,LINF_360030800
218,sre_c36.710R,LINF_360032100
219,sre_c36.800U,LINF_360035560
220,sre_c36.880V,LINF_360045300
