# Save data

In [51]:
# =============================================================================
# Save the variables
# =============================================================================
# NOTE(review): every line of this cell is commented out, so nothing here runs.
# `sider_df` and `gtf_df` are created by cells further down the notebook, so if
# this cell is re-enabled it has to be executed after them.
# variables_dict = {
#     "sider_df" : sider_df,  # Data Frame with SIDER data
#     "gtf_df" : gtf_df,  # Data Frame with GTF data
# }

# =============================================================================
# main function
# =============================================================================
# import os
# import pickle

# def data_save_load(option, dict_variables=None):
#     """
#     This function is used to save or load data for the jupyter notebook
#     """
#     path_folder = "ipynb_db"  # Folder to save variables
#     os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
#     path_file = os.path.join(path_folder, "variables.pkl") # Path to save the variables

#     if option == "save":
#         with open(path_file, "wb") as f:
#             pickle.dump(dict_variables, f)
#     elif option == "load":
#         with open(path_file, "rb") as f:
#             variables = pickle.load(f)
#         # Now load the variables
#         # NOTE(review): the loop below only writes each value back into the same
#         # local dict and the function returns nothing, so "load" never hands the
#         # variables back to the notebook. Return `variables` if this is re-enabled.
#         for key, value in variables.items():
#             variables[key] = value

# =============================================================================
# Call the function
# =============================================================================
# data_save_load(option="save",
#                dict_variables=variables_dict)

# Code

## 1. Load data

In [23]:
import pandas as pd # type: ignore

### 1.1. SIDER data

In [24]:
# Read the SIDER annotation: tab-separated GFF file without a header row
sider_df = pd.read_csv("data/SIDER_elements.gff", header=None, sep="\t")
# Quick sanity check: dimensions first, then the dtype of every column
print(sider_df.shape, sider_df.dtypes, sep="\n")
sider_df.head()

(2132, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,SIDER,2,174,.,+,.,ID=src_c01.10
1,LinJ.01,CBM,SIDER,24094,24759,.,+,.,ID=src_c01.20A
2,LinJ.01,CBM,SIDER,35372,35957,.,+,.,ID=src_c01.30A
3,LinJ.01,CBM,SIDER,39791,40596,.,+,.,ID=src_c01.40
4,LinJ.01,CBM,SIDER,54984,55548,.,+,.,ID=src_c01.50A


From `sider_df` I only need columns 0, 3, 4, 6 and 8.

In [25]:
# Keep only the needed `sider_df` columns [0, 3, 4, 6, 8] and give them names.
# `.copy()` makes the subset an independent frame, so assigning to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
sider_df = sider_df[[0, 3, 4, 6, 8]].copy()
sider_df.columns = ["chrom", "start", "end", "strand", "sider_name"]
print(sider_df.shape)
print(sider_df.dtypes)
sider_df.head()

(2132, 5)
chrom         object
start          int64
end            int64
strand        object
sider_name    object
dtype: object


Unnamed: 0,chrom,start,end,strand,sider_name
0,LinJ.01,2,174,+,ID=src_c01.10
1,LinJ.01,24094,24759,+,ID=src_c01.20A
2,LinJ.01,35372,35957,+,ID=src_c01.30A
3,LinJ.01,39791,40596,+,ID=src_c01.40
4,LinJ.01,54984,55548,+,ID=src_c01.50A


### 1.2 GTF data
This one will be **harder** to prepare.

In [26]:
# Read the GTF annotation: tab-separated file without a header row
gtf_df = pd.read_csv(
    "./data/20240703111001_LINF-Tabla_maestra_v3-20244_RP_v0.4.gtf",
    header=None,
    sep="\t",
)
# Quick sanity check: dimensions first, then the dtype of every column
print(gtf_df.shape, gtf_df.dtypes, sep="\n")
gtf_df.head()

(45779, 9)
0    object
1    object
2    object
3     int64
4     int64
5    object
6    object
7    object
8    object
dtype: object


Unnamed: 0,0,1,2,3,4,5,6,7,8
0,LinJ.01,CBM,gene,1520,5066,.,-,.,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,CBM,transcript,1520,5066,.,-,.,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CBM,CDS,3710,4711,.,-,.,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,CBM,5utr,1520,3709,.,-,.,"parent_id ""LINF_01T0005000"";"
4,LinJ.01,CBM,3utr,4712,5066,.,-,.,"parent_id ""LINF_01T0005000"";"


From `gtf_df` I only need columns 0, 2, 3, 4, 6 and 8.

In [27]:
# Keep only the needed `gtf_df` columns [0, 2, 3, 4, 6, 8] and give them names.
# `.copy()` makes the subset an independent frame, so assigning to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
gtf_df = gtf_df[[0, 2, 3, 4, 6, 8]].copy()
gtf_df.columns = ["chrom", "feature", "start", "end", "strand", "attributes"]
print(gtf_df.shape)
print(gtf_df.dtypes)
gtf_df.head()

(45779, 6)
chrom         object
feature       object
start          int64
end            int64
strand        object
attributes    object
dtype: object


Unnamed: 0,chrom,feature,start,end,strand,attributes
0,LinJ.01,gene,1520,5066,-,"gene_id ""LINF_010005000""; gene_name ""LINF_0100..."
1,LinJ.01,transcript,1520,5066,-,"parent_id ""LINF_010005000""; transcript_id ""LIN..."
2,LinJ.01,CDS,3710,4711,-,"parent_id ""LINF_01T0005000""; transcript_id ""LI..."
3,LinJ.01,5utr,1520,3709,-,"parent_id ""LINF_01T0005000"";"
4,LinJ.01,3utr,4712,5066,-,"parent_id ""LINF_01T0005000"";"


The `attributes` field is a list of entries separated by ";", and each entry has the format `key "value"`. We are going to split the `attributes` column into multiple columns, one per key.

#### 1.2.1 Transforming columns

First, get all the keys that appear in the `attributes` column.

In [28]:
# Count how many times each attribute key appears in the `attributes` column.
# The keys are collected in a plain dict so they keep first-seen order.
atr_dict = {}
# Iterate over the column itself: only `attributes` is needed, and `iterrows`
# would build a full Series for every row just to read one field.
for attributes in gtf_df["attributes"]:
    for atr in attributes.split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if not atr:  # Skip empty attribute "" (e.g. after a trailing ";")
            continue
        key = atr.split(" ")[0]  # Entries look like `key "value"`
        atr_dict[key] = atr_dict.get(key, 0) + 1
print(atr_dict)

{'gene_id': 9853, 'gene_name': 9853, 'biotype': 17298, 'notes': 17283, 'parent_id': 35926, 'transcript_id': 18198, 'transcript_name': 9647, 'pseudogen': 46}


In [29]:
# Attribute names, in the order they were first inserted into `atr_dict`
atr_keys = [*atr_dict]
print(atr_keys)

['gene_id', 'gene_name', 'biotype', 'notes', 'parent_id', 'transcript_id', 'transcript_name', 'pseudogen']


Now we have a list with all the attribute keys. When processing each row in the next steps, we can check whether each of these keys appears, and if it does not, we add it with a `None` value.

In [43]:
# Now that we have the attribute keys, build one dict per row of `gtf_df` with
# the attributes separated (keys missing in a row are filled with None)
def _parse_attributes(attributes, keys):
    """Split one `attributes` string into a ``{key: value}`` dict.

    Parameters
    ----------
    attributes : str
        Entries separated by ";", each one written as ``key "value"``.
    keys : iterable of str
        Every key the result must contain; keys absent from `attributes`
        are added with value ``None``.

    Returns
    -------
    dict
        Keys found in `attributes` come first, in order of appearance,
        followed by the missing ones in the order given by `keys`. Values
        keep their surrounding double quotes. If a key is repeated, the last
        value wins.
    """
    parsed = {}
    for atr in attributes.split(";"):
        atr = atr.strip()  # Remove leading and trailing whitespaces
        if not atr:  # Skip empty attribute "" (e.g. after a trailing ";")
            continue
        # Split on the first space only, so a value that itself contains
        # spaces is kept whole instead of being cut at its first word.
        key, _, value = atr.partition(" ")
        # A key with no value becomes None instead of raising IndexError.
        parsed[key] = value.strip() or None
    for key in keys:
        parsed.setdefault(key, None)  # Fill in the attributes this row lacks
    return parsed


new_col_df = [_parse_attributes(attributes, atr_keys) for attributes in gtf_df["attributes"]]

In [44]:
# Checking how it worked: show only the first records, since the list holds one
# dict per `gtf_df` row and dumping all of it floods the notebook output
new_col_df[:5]

[{'gene_id': '"LINF_010005000"',
  'gene_name': '"LINF_010005000"',
  'biotype': '"protein_coding"',
  'notes': '"Protein_of_unknown_function_(DUF2946)"',
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None,
  'parent_id': None},
 {'parent_id': '"LINF_010005000"',
  'transcript_id': '"LINF_01T0005000"',
  'transcript_name': '"LINF_01T0005000"',
  'biotype': '"protein_coding"',
  'notes': '"Protein_of_unknown_function_(DUF2946)"',
  'gene_id': None,
  'gene_name': None,
  'pseudogen': None},
 {'parent_id': '"LINF_01T0005000"',
  'transcript_id': '"LINF_01T0005000"',
  'gene_id': None,
  'gene_name': None,
  'transcript_name': None,
  'pseudogen': None,
  'biotype': None,
  'notes': None},
 {'parent_id': '"LINF_01T0005000"',
  'gene_id': None,
  'gene_name': None,
  'transcript_id': None,
  'transcript_name': None,
  'pseudogen': None,
  'biotype': None,
  'notes': None},
 {'parent_id': '"LINF_01T0005000"',
  'gene_id': None,
  'gene_name': None,
  'transcript_id': No

In [45]:
# Transforming the list of dicts into a DataFrame
# NOTE: this rebinds `new_col_df` from a list to a DataFrame; each dict becomes
# one row and each attribute key becomes one column.
new_col_df = pd.DataFrame(new_col_df)
new_col_df

Unnamed: 0,gene_id,gene_name,biotype,notes,transcript_id,transcript_name,pseudogen,parent_id
0,"""LINF_010005000""","""LINF_010005000""","""protein_coding""","""Protein_of_unknown_function_(DUF2946)""",,,,
1,,,"""protein_coding""","""Protein_of_unknown_function_(DUF2946)""","""LINF_01T0005000""","""LINF_01T0005000""",,"""LINF_010005000"""
2,,,,,"""LINF_01T0005000""",,,"""LINF_01T0005000"""
3,,,,,,,,"""LINF_01T0005000"""
4,,,,,,,,"""LINF_01T0005000"""
...,...,...,...,...,...,...,...,...
45774,,,,,"""LINF_36T0082400""",,,"""LINF_36T0082400"""
45775,,,,,,,,"""LINF_36T0082400"""
45776,,,,,,,,"""LINF_36T0082400"""
45777,"""LINF_360082500""","""LINF_360082500""",,,,,,


In [46]:
# Let's reorder the columns: identifiers first, descriptive fields last
new_col_df = new_col_df[
    [
        "gene_id",
        "gene_name",
        "transcript_id",
        "transcript_name",
        "biotype",
        "parent_id",
        "pseudogen",
        "notes",
    ]
]
new_col_df

Unnamed: 0,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,"""LINF_010005000""","""LINF_010005000""",,,"""protein_coding""",,,"""Protein_of_unknown_function_(DUF2946)"""
1,,,"""LINF_01T0005000""","""LINF_01T0005000""","""protein_coding""","""LINF_010005000""",,"""Protein_of_unknown_function_(DUF2946)"""
2,,,"""LINF_01T0005000""",,,"""LINF_01T0005000""",,
3,,,,,,"""LINF_01T0005000""",,
4,,,,,,"""LINF_01T0005000""",,
...,...,...,...,...,...,...,...,...
45774,,,"""LINF_36T0082400""",,,"""LINF_36T0082400""",,
45775,,,,,,"""LINF_36T0082400""",,
45776,,,,,,"""LINF_36T0082400""",,
45777,"""LINF_360082500""","""LINF_360082500""",,,,,,


In [47]:
# Replace the raw `attributes` column of `gtf_df` with the parsed attribute
# columns from `new_col_df` (rows are matched by index).
# The drop happens before the concat and without `inplace=True`, so `gtf_df` is
# rebound exactly once: if the cell is re-run after `attributes` is already gone,
# `drop` raises KeyError before `gtf_df` is touched, instead of leaving it with
# duplicated columns.
gtf_df = pd.concat([gtf_df.drop(columns="attributes"), new_col_df], axis=1)
gtf_df

Unnamed: 0,chrom,feature,start,end,strand,gene_id,gene_name,transcript_id,transcript_name,biotype,parent_id,pseudogen,notes
0,LinJ.01,gene,1520,5066,-,"""LINF_010005000""","""LINF_010005000""",,,"""protein_coding""",,,"""Protein_of_unknown_function_(DUF2946)"""
1,LinJ.01,transcript,1520,5066,-,,,"""LINF_01T0005000""","""LINF_01T0005000""","""protein_coding""","""LINF_010005000""",,"""Protein_of_unknown_function_(DUF2946)"""
2,LinJ.01,CDS,3710,4711,-,,,"""LINF_01T0005000""",,,"""LINF_01T0005000""",,
3,LinJ.01,5utr,1520,3709,-,,,,,,"""LINF_01T0005000""",,
4,LinJ.01,3utr,4712,5066,-,,,,,,"""LINF_01T0005000""",,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
45774,LinJ.36,CDS,2739458,2740183,-,,,"""LINF_36T0082400""",,,"""LINF_36T0082400""",,
45775,LinJ.36,5utr,2738595,2739457,-,,,,,,"""LINF_36T0082400""",,
45776,LinJ.36,3utr,2740184,2740374,-,,,,,,"""LINF_36T0082400""",,
45777,LinJ.36,gene,2740760,2742268,-,"""LINF_360082500""","""LINF_360082500""",,,,,,


## 2. Compare coordinates

In this next part we are going to compare coordinates, to find which elements in **sider_df** lie inside which elements in **gtf_df**.

In [18]:
# for index, row in 