# Save data

In [143]:
# =============================================================================
# Save the variables
# =============================================================================
# variables_dict = {
#     "sider_df" : sider_df,  # Data Frame with SIDER data
#     "gtf_df" : gtf_df,  # Data Frame with GTF data
# }

# =============================================================================
# main function
# =============================================================================
# import os
# import pickle

# def data_save_load(option, dict_variables=None):
#     """
#     This function is used to save or load data for the jupyter notebook
#     """
#     path_folder = "ipynb_db"  # Folder to save variables
#     os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
#     path_file = os.path.join(path_folder, "variables.pkl") # Path to save the variables

#     if option == "save":
#         with open(path_file, "wb") as f:
#             pickle.dump(dict_variables, f)
#     elif option == "load":
#         with open(path_file, "rb") as f:
#             variables = pickle.load(f)
#         # Now load the variables
#         for key, value in variables.items():
#             variables[key] = value

# =============================================================================
# Call the function
# =============================================================================
# data_save_load(option="save",
#                dict_variables=variables_dict)

# Code

## 1. Load data

In [144]:
import pandas as pd # type: ignore

### 1.1. SIDER data

In [145]:
sider_df = pd.read_csv("data/SIDER_elements.gff", sep="\t", header=None)
print(sider_df.shape)
print(sider_df.dtypes)
sider_df.head()

(2132, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,SIDER,2,174,.,+,.,ID=src_c01.10
1,LinJ.01,CBM,SIDER,24094,24759,.,+,.,ID=src_c01.20A
2,LinJ.01,CBM,SIDER,35372,35957,.,+,.,ID=src_c01.30A
3,LinJ.01,CBM,SIDER,39791,40596,.,+,.,ID=src_c01.40
4,LinJ.01,CBM,SIDER,54984,55548,.,+,.,ID=src_c01.50A


In the `sider_df` I only need the column 0, 3, 4, 6 and 8 columns

In [146]:
# Extracting from `sider_df`the needed columns [0, 3, 4, 6, 8]
sider_df = sider_df[[0, 3, 4, 6, 8]]
sider_df.columns = ["chrom", "start", "end", "strand", "sider_name"]
print(sider_df.shape)
print(sider_df.dtypes)
sider_df.head()

(2132, 5)
chrom         object
start          int64
end            int64
strand        object
sider_name    object
dtype: object


Unnamed: 0,chrom,start,end,strand,sider_name
0,LinJ.01,2,174,+,ID=src_c01.10
1,LinJ.01,24094,24759,+,ID=src_c01.20A
2,LinJ.01,35372,35957,+,ID=src_c01.30A
3,LinJ.01,39791,40596,+,ID=src_c01.40
4,LinJ.01,54984,55548,+,ID=src_c01.50A


Now we need to get the **sider_name** without the 'ID=' element

In [147]:
# Extracting the sider_name from the column `sider_name`
sider_df["sider_name"] = sider_df["sider_name"].str.extract(r'ID=(.+)')
sider_df.head()

Unnamed: 0,chrom,start,end,strand,sider_name
0,LinJ.01,2,174,+,src_c01.10
1,LinJ.01,24094,24759,+,src_c01.20A
2,LinJ.01,35372,35957,+,src_c01.30A
3,LinJ.01,39791,40596,+,src_c01.40
4,LinJ.01,54984,55548,+,src_c01.50A


### 1.2 GTF data
This one wil be **harder** to prepare

In [148]:
# Load data
gtf_df = pd.read_csv("./data/20240703111001_LINF-Tabla_maestra_v3-20244_RP_v0.4.gtf", sep="\t", header=None)
print(gtf_df.shape)
print(gtf_df.dtypes)
gtf_df.head()

(45354, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,gene,1520,5066,.,-,.,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,CBM,transcript,1520,5066,.,-,.,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CBM,CDS,3710,4711,.,-,.,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,CBM,5utr,1520,3709,.,-,.,"parent_id ""LINF_01T0005000"";"
4,LinJ.01,CBM,3utr,4712,5066,.,-,.,"parent_id ""LINF_01T0005000"";"


From `gtf_df`I only need columns 0, 2, 3, 4, 6 and 8

In [149]:
# Get from `gtf_df` the needed columns [0, 3, 4, 6, 8]
gtf_df = gtf_df[[0, 2, 3, 4, 6, 8]]
gtf_df.columns = ["chrom", "feature", "start", "end", "strand", "attributes"]
print(gtf_df.shape)
print(gtf_df.dtypes)
gtf_df.head()

(45354, 6)
chrom         object
feature       object
start          int64
end            int64
strand        object
attributes    object
dtype: object


Unnamed: 0,chrom,feature,start,end,strand,attributes
0,LinJ.01,gene,1520,5066,-,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,transcript,1520,5066,-,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CDS,3710,4711,-,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,5utr,1520,3709,-,"parent_id ""LINF_01T0005000"";"
4,LinJ.01,3utr,4712,5066,-,"parent_id ""LINF_01T0005000"";"


Now the field `attributes` it's separated by ";" and the header its in a format like `header "data"`. We are going to transform the "attributes" column in multiple columns

#### 1.2.1 Transforming colums

Get first all the elements that appear in the attributes columns

In [150]:
# Let's count first the number of elements in the `attributes` column
atr_dict = {}
for index, row in gtf_df.iterrows():
    # print(index, ":", sep="")
    for atr in row["attributes"].split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if len(atr.strip()) == 0:  # Skip empty attribute ""
            continue
        # print(f"\t{'-'*50}")
        # print(f"\tatribute: {atr.strip()}")  
        key = atr.split(" ")[0] 
        if key not in atr_dict:
            atr_dict[key] = 1

        else:
            atr_dict[key] += 1
        # print(f"\t{atr_dict}")
print(atr_dict)

{'gene_id': 9853, 'gene_name': 9853, 'biotype': 17298, 'notes': 17283, 'parent_id': 35501, 'transcript_id': 18198, 'transcript_name': 9647, 'pseudogen': 46}


In [151]:
# get a list with the keys of atr_dict
atr_keys = list(atr_dict.keys())
print(atr_keys)

['gene_id', 'gene_name', 'biotype', 'notes', 'parent_id', 'transcript_id', 'transcript_name', 'pseudogen']


Now we'll have a list with all the elements. When indexing each row in the next steps, we can check if one of this items appear, and if not, we can add a "None" value to the attribute

In [152]:
# Now that we have the attributes count, let's create a dict for each element in "test_df" with the attributes separated
new_col_df = []
for index, row in gtf_df.iterrows():
    # print(index, ":", sep="")
    pre_data = []
    for atr in row["attributes"].split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if len(atr.strip()) == 0:  # Skip empty attribute ""
            continue
        key = atr.split(" ")[0]
        value = atr.split(" ")[1].replace('"', "")
        pre_data.append({key: value})
    
    for elem in atr_keys: # type: ignore  # Checking if the elements from atr_keys
        if elem not in [list(elem.keys())[0] for elem in pre_data]:  # If the element is not in pre_data, add it with value None
            pre_data.append({elem: None})

    flattenend_data = {key: value for sublist in pre_data for key, value in sublist.items()}
    new_col_df.append(flattenend_data)

In [153]:
# Checking how it worked
new_col_df  

[{'gene_id': 'LINF_010005000',
  'gene_name': 'LINF_010005000',
  'biotype': 'protein_coding',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'parent_id': None,
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_010005000',
  'transcript_id': 'LINF_01T0005000',
  'transcript_name': 'LINF_01T0005000',
  'biotype': 'protein_coding',
  'notes': 'Protein_of_unknown_function_(DUF2946)',
  'gene_id': None,
  'gene_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'transcript_id': 'LINF_01T0005000',
  'gene_id': None,
  'gene_name': None,
  'biotype': None,
  'notes': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'gene_id': None,
  'gene_name': None,
  'biotype': None,
  'notes': None,
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None},
 {'parent_id': 'LINF_01T0005000',
  'gene_id': None,
  'gene_name': None,
  'biotype': None,
  'notes': None,
  'transcri

In [154]:
# Transforming the list of dicts into a DataFrame
new_col_df = pd.DataFrame(new_col_df)
new_col_df

Unnamed: 0,gene_id,gene_name,biotype,notes,parent_id,transcript_id,transcript_name,pseudogen
0,LINF_010005000,LINF_010005000,protein_coding,Protein_of_unknown_function_(DUF2946),,,,
1,,,protein_coding,Protein_of_unknown_function_(DUF2946),LINF_010005000,LINF_01T0005000,LINF_01T0005000,
2,,,,,LINF_01T0005000,LINF_01T0005000,,
3,,,,,LINF_01T0005000,,,
4,,,,,LINF_01T0005000,,,
...,...,...,...,...,...,...,...,...
45349,,,,,LINF_36T0082400,LINF_36T0082400,,
45350,,,,,LINF_36T0082400,,,
45351,,,,,LINF_36T0082400,,,
45352,LINF_360082500,LINF_360082500,,,,,,


In [155]:
# Le'ts reorder the columns
new_col_df = new_col_df[["gene_id", "gene_name", "transcript_id", "transcript_name", "biotype", "parent_id", "pseudogen", "notes"]]
new_col_df

Unnamed: 0,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946)
1,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946)
2,,,LINF_01T0005000,,,LINF_01T0005000,,
3,,,,,,LINF_01T0005000,,
4,,,,,,LINF_01T0005000,,
...,...,...,...,...,...,...,...,...
45349,,,LINF_36T0082400,,,LINF_36T0082400,,
45350,,,,,,LINF_36T0082400,,
45351,,,,,,LINF_36T0082400,,
45352,LINF_360082500,LINF_360082500,,,,,,


In [156]:
# Concatenating the new DataFrame with the original `gtf_df` and dropping the `attributes` column
gtf_df = pd.concat([gtf_df, new_col_df], axis=1)
gtf_df.drop(columns="attributes", inplace=True)
gtf_df

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LinJ.01,gene,1520,5066,-,LINF_010005000,LINF_010005000,,,protein_coding,,,Protein_of_unknown_function_(DUF2946)
1,LinJ.01,transcript,1520,5066,-,,,LINF_01T0005000,LINF_01T0005000,protein_coding,LINF_010005000,,Protein_of_unknown_function_(DUF2946)
2,LinJ.01,CDS,3710,4711,-,,,LINF_01T0005000,,,LINF_01T0005000,,
3,LinJ.01,5utr,1520,3709,-,,,,,,LINF_01T0005000,,
4,LinJ.01,3utr,4712,5066,-,,,,,,LINF_01T0005000,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45349,LinJ.36,CDS,2739458,2740183,-,,,LINF_36T0082400,,,LINF_36T0082400,,
45350,LinJ.36,5utr,2738595,2739457,-,,,,,,LINF_36T0082400,,
45351,LinJ.36,3utr,2740184,2740374,-,,,,,,LINF_36T0082400,,
45352,LinJ.36,gene,2740760,2742268,-,LINF_360082500,LINF_360082500,,,,,,


## 2. Compare coordinates

In this next part we are going to check for coordinates. To search which elements in the **sider_df** is inside which element in the **gtf_df**.

### 2.1 Fail proof the data

In [157]:
sider_df_test = sider_df.copy()
gtf_df_test = gtf_df.copy()

In [158]:
# Check the number of elements 
gtf_df_test.shape

(45354, 13)

In [159]:
# Elements where the start is minor than the end
(gtf_df_test['start'] < gtf_df_test['end']).sum()

np.int64(45352)

In [160]:
# Elements where the start is major than the end
(gtf_df_test['start'] > gtf_df_test['end']).sum()

np.int64(0)

In [161]:
# CHeck the feature elements
condition = (gtf_df_test['start'] < gtf_df_test['end'])
gtf_df_test[condition]['feature'].value_counts()

feature
gene          9853
transcript    9647
CDS           8760
5utr          8549
3utr          8543
Name: count, dtype: int64

Interesting, there should be the same numbers of 5utr as 3utr

In [162]:
# Checking without condition
gtf_df_test['feature'].value_counts()

feature
gene          9853
transcript    9647
CDS           8760
5utr          8549
3utr          8545
Name: count, dtype: int64

In [163]:
# Checking where those elements where there is not a 3utr
parent_feature_dict = gtf_df_test.groupby('parent_id')['feature'].apply(list).to_dict()
filtered_dict = {k: v for k, v in parent_feature_dict.items() if v not in (['transcript'], 
                                                                           ['CDS'], 
                                                                           ['CDS', '5utr', '3utr'], 
                                                                           ['CDS', '3utr', '5utr'],
                                                                           ['CDS', '3utr', '5utr', 'CDS', '3utr', '5utr'],
                                                                           ['transcript', 'transcript'],
                                                                           ['CDS', '5utr', '3utr', 'CDS', '5utr', '3utr'])}
filtered_dict

{'LINF_27T0013600': ['CDS', '3utr'],
 'LINF_27T0033600-700': ['CDS', '3utr', '5utr', 'CDS', '3utr'],
 'LINF_30T0006850': ['CDS', '5utr'],
 'LINF_31T0037100': ['CDS', '5utr'],
 'LINF_31T0039200': ['CDS', '5utr'],
 'LINF_36T0017400': ['CDS', '5utr'],
 'LINF_36T0036000': ['CDS', '5utr'],
 'LINF_36T0071100': ['CDS', '5utr']}

<span style="color:red">These are the elements without a 3utr</span>

In [171]:
gtf_df_test[(gtf_df_test['parent_id'].isin(list(filtered_dict.keys()))) |
            (gtf_df_test['gene_id'].isin([elem.replace("T","") for elem in list(filtered_dict.keys())]))]

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
23182,LinJ.27,gene,327990,328645,+,LINF_270013600,LINF_270013600,,,protein_coding,,unknown,Stress_responsive_A/B_Barrel_domain-containing...
23184,LinJ.27,CDS,328114,328645,+,,,LINF_27T0013600,,,LINF_27T0013600,,
23185,LinJ.27,3utr,327990,328113,+,,,,,,LINF_27T0013600,,
24262,LinJ.27,gene,1153377,1155197,+,LINF_270033600-700,LINF_270033600-700,,,protein_coding,,,hypothetical_protein_-_conserved
24264,LinJ.27,CDS,1153523,1154887,+,,,LINF_27T0033600-700,,,LINF_27T0033600-700,,
24265,LinJ.27,3utr,1153377,1153522,+,,,,,,LINF_27T0033600-700,,
24266,LinJ.27,5utr,1154888,1155197,+,,,,,,LINF_27T0033600-700,,
24267,LinJ.27,CDS,1153982,1154887,+,,,LINF_27T0033600-700,,,LINF_27T0033600-700,,
24268,LinJ.27,3utr,1153377,1153981,+,,,,,,LINF_27T0033600-700,,
27824,LinJ.30,gene,56144,57262,-,LINF_300006850,LINF_300006850,,,protein_coding,,unknown,polynucleotide_kinase_3'-phosphatase-_putative...


### 2.1 Check intervals

In [181]:
sider_df_test.columns

Index(['chrom', 'start', 'end', 'strand', 'sider_name', 'interval'], dtype='object')

In [183]:
gtf_df_test.columns

Index(['chrom', 'feature', 'start', 'end', 'strand', 'gene_id', 'gene_name',
       'transcript_id', 'transcript_name', 'biotype', 'parent_id', 'pseudogen',
       'notes', 'interval'],
      dtype='object')

In [188]:
# let's do the same using 'merge_asof'
sider_df_test = sider_df.copy()
gtf_df_test = gtf_df.copy()

# Create interval columns
sider_df_test["interval"] = pd.IntervalIndex.from_arrays(sider_df_test["start"], sider_df_test["end"], closed="both")
gtf_df_test["interval"] = pd.IntervalIndex.from_arrays(gtf_df_test["start"], gtf_df_test["end"], closed="both")

# initialize dict
sider_location_dict_test = {sider_name: [] for sider_name in sider_df_test["sider_name"].unique()}

# Find elements in sider_df_test that are inside gtf_df_test
# Find overlaps using boolean indexing
for i, sider_row in sider_df_test.iterrows():
    # Boolean mask for intervals that contain the SIDER interval
    print(f"Analyzing elem {i+1}/{sider_df_test.shape[0]}")

    sider_location_dict_test[sider_row['sider_name']].append([sider_row['chrom'], sider_row['start'], sider_row['end'], sider_row['strand'], sider_row['sider_name']])

    mask = (gtf_df_test['chrom'] == sider_row['chrom']) & \
           (gtf_df_test['start'] <= sider_row['start']) & \
           (gtf_df_test['end'] >= sider_row['end'])
    
    overlaps = gtf_df_test[mask]
    
    for j, gtf_row in overlaps.iterrows():
        new_row = [gtf_row['chrom'], gtf_row['feature'], gtf_row['start'], gtf_row['end'], gtf_row['strand'], gtf_row['gene_id'], gtf_row['transcript_id'], gtf_row['parent_id']]
        sider_location_dict_test[sider_row['sider_name']].append(new_row)

Analyzing elem 1/2132
Analyzing elem 2/2132
Analyzing elem 3/2132
Analyzing elem 4/2132
Analyzing elem 5/2132
Analyzing elem 6/2132
Analyzing elem 7/2132
Analyzing elem 8/2132
Analyzing elem 9/2132
Analyzing elem 10/2132
Analyzing elem 11/2132
Analyzing elem 12/2132
Analyzing elem 13/2132
Analyzing elem 14/2132
Analyzing elem 15/2132
Analyzing elem 16/2132
Analyzing elem 17/2132
Analyzing elem 18/2132
Analyzing elem 19/2132
Analyzing elem 20/2132
Analyzing elem 21/2132
Analyzing elem 22/2132
Analyzing elem 23/2132
Analyzing elem 24/2132
Analyzing elem 25/2132
Analyzing elem 26/2132
Analyzing elem 27/2132
Analyzing elem 28/2132
Analyzing elem 29/2132
Analyzing elem 30/2132
Analyzing elem 31/2132
Analyzing elem 32/2132
Analyzing elem 33/2132
Analyzing elem 34/2132
Analyzing elem 35/2132
Analyzing elem 36/2132
Analyzing elem 37/2132
Analyzing elem 38/2132
Analyzing elem 39/2132
Analyzing elem 40/2132
Analyzing elem 41/2132
Analyzing elem 42/2132
Analyzing elem 43/2132
Analyzing elem 44/21