# Needed modules

In [16]:
# Load needed libraries
import pandas as pd
import os

In [17]:
# https://kioku-space.com/en/jupyter-skip-execution/
from IPython.core.magic import register_cell_magic # type: ignore

@register_cell_magic
def skip(line, cell):
    """Cell magic ``%%skip``: ignore both the magic line and the cell body and do nothing."""
    return None

# Pickle save

In [18]:
%%skip
# =============================================================================
# Save the variables
# =============================================================================
variables_dict = {}  # name -> object pairs to persist; currently empty

# =============================================================================
# main function
# =============================================================================
def data_save_load(option, dict_variables=None):
    """
    Save or load the notebook variables as a pickle file.

    The file is always ``ipynb_db/variables.pkl`` relative to the current
    working directory; the folder is created when it does not exist.

    Parameters
    ----------
    option : str
        ``"save"`` pickles ``dict_variables`` to disk; ``"load"`` reads the
        file back.
    dict_variables : dict, optional
        Mapping of variables to persist. Only used when ``option="save"``.

    Returns
    -------
    dict or None
        The unpickled content when ``option="load"``; ``None`` when
        ``option="save"``.

    Raises
    ------
    ValueError
        If ``option`` is neither ``"save"`` nor ``"load"``.
    FileNotFoundError
        If ``option="load"`` and nothing has been saved yet.
    """
    # Imported here because the notebook's import cell does not provide pickle
    import pickle

    path_folder = "ipynb_db"  # Folder to save variables
    os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
    path_file = os.path.join(path_folder, "variables.pkl")  # Path to save the variables

    if option == "save":
        with open(path_file, "wb") as f:
            pickle.dump(dict_variables, f)
        return None

    if option == "load":
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that were written by this notebook.
        with open(path_file, "rb") as f:
            variables = pickle.load(f)
        # Hand the loaded mapping back so the caller can actually use it
        return variables

    raise ValueError(f"option must be 'save' or 'load', got {option!r}")

# =============================================================================
# Call the function
# =============================================================================
# Persist the collected variables
data_save_load(option="save", dict_variables=variables_dict)

# 1. Load data

In [19]:
# Load new negative data:
neg_data = pd.read_csv("./2.Hallmarks_vs_NegData/negative_data_after_recaught.csv", sep=",", header=0)
# sort_values returns a new frame, so the result must be assigned for the
# ordering (chromosome, then start coordinate) to take effect.
neg_data = neg_data.sort_values(["sseqid", "sstart"], ignore_index=True)
print(neg_data.shape)
print(neg_data.dtypes)
neg_data.head()

(913, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,36103,36242,plus,AGACAGACCGACACACGCAGCCGTGTGATGCCGCCGCCGAGGGCAG...
1,LinJ.01,113760,114388,plus,CAGCGCCATGCACGACATGGCCGCTGACGTCCGTAGCCCTAACTCG...
2,LinJ.01,146412,146530,plus,GCGAATTGTGTTCTGCGCATGCCTCTTCTCTGCCGTGCAGCATGCG...
3,LinJ.01,261866,262439,plus,CGGACTTGGCAAGTGGCCGCCATCGATGAAAACGCACCATGCCTTT...
4,LinJ.01,271363,271650,plus,CGAACGCCGCCCTCAATCGCGCGCTGAACTTCACGCGGCGGTCGAC...


In [20]:
# Read the recaught negative hits, i.e. the rows that get added to the positive elements
neg_recaught_data = pd.read_csv(
    "./2.Hallmarks_vs_NegData/neg_data_recaught_hits.csv",
    sep=",",
    header=0,
)
# Quick look: dimensions first, then the dtype of every column
print(neg_recaught_data.shape, neg_recaught_data.dtypes, sep="\n")
neg_recaught_data.head()

(1, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.16,79840,80334,plus,CCCGTGCTGCGTCGCGCCCATCTAGCTGCCACGGCGGCATGGCTCC...


In [21]:
# Read the positive elements database
pos_data = pd.read_csv(
    "./1.new_data/positive_database.csv",
    sep=",",
    header=0,
)
# Quick look: dimensions first, then the dtype of every column
print(pos_data.shape, pos_data.dtypes, sep="\n")
pos_data.head()

(2116, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,24758,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,35956,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55547,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


# 2. Add recaught data to the positive data

In [22]:
# Append the recaught hits to the positive elements, then order the result by
# sseqid and sstart with a fresh 0..n-1 index.
# NOTE: `pos_data` is rebound here, so running this cell again appends the
# recaught rows a second time.
pos_data = (
    pd.concat([pos_data, neg_recaught_data], axis=0)
    .sort_values(by=["sseqid", "sstart"], ignore_index=True)
)

# Check data
print(pos_data.shape, pos_data.dtypes, sep="\n")
pos_data.head()

(2117, 5)
sseqid     object
sstart      int64
send        int64
sstrand    object
sseq       object
dtype: object


Unnamed: 0,sseqid,sstart,send,sstrand,sseq
0,LinJ.01,1,173,plus,ACACCAGTACACCAGTACACCAGTACACCAGTACACCAGTACACCA...
1,LinJ.01,24093,24758,plus,GGGGGAGGCGGGGGAGGCGGGGGGCACGCACCTCCATGCGTGGCAT...
2,LinJ.01,35371,35956,plus,ACTCCCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...
3,LinJ.01,39790,40595,plus,ATTCTACCGCGAGCAAGGCAGCACACAGACGCACGCACAGCCACAG...
4,LinJ.01,54983,55547,plus,ACTCTCATCGCCTGGTGCGAAGCAGCGCAAGACACACGCGCGCTGC...


# 3. Check whether negative and positive elements are identical, overlap, or are contained in one another

In [24]:
# Work on copies so the original frames are left untouched by the checks below
pos_data_check, neg_data_check = pos_data.copy(), neg_data.copy()

In [25]:
# Add an "interval" column holding the closed range [sstart, send] of every element
pos_data_check["interval"] = pos_data_check.apply(
    lambda row: pd.Interval(row["sstart"], row["send"], closed="both"),
    axis=1,
)
neg_data_check["interval"] = neg_data_check.apply(
    lambda row: pd.Interval(row["sstart"], row["send"], closed="both"),
    axis=1,
)

# Show the coordinates next to the interval built from them
print(pos_data_check[["sseqid", "sstart", "send", "interval"]].head())
print(neg_data_check[["sseqid", "sstart", "send", "interval"]].head())

    sseqid  sstart   send        interval
0  LinJ.01       1    173        [1, 173]
1  LinJ.01   24093  24758  [24093, 24758]
2  LinJ.01   35371  35956  [35371, 35956]
3  LinJ.01   39790  40595  [39790, 40595]
4  LinJ.01   54983  55547  [54983, 55547]
    sseqid  sstart    send          interval
0  LinJ.01   36103   36242    [36103, 36242]
1  LinJ.01  113760  114388  [113760, 114388]
2  LinJ.01  146412  146530  [146412, 146530]
3  LinJ.01  261866  262439  [261866, 262439]
4  LinJ.01  271363  271650  [271363, 271650]


In [26]:
# One group per sseqid value for each data set
pos_data_grouped = pos_data_check.groupby(by="sseqid")
neg_data_grouped = neg_data_check.groupby(by="sseqid")

In [27]:
# For every chromosome present in both data sets, compare each positive element
# with each negative element and file the pair under exactly one category:
#   same_elems       -> identical coordinates (the positive element is stored)
#   inside_elems     -> positive element contained in a negative one (positive stored)
#   inside_elems2    -> negative element contained in a positive one (negative stored)
#   overlaping_elems -> partial overlap only ([positive, negative] pair stored)
# Containment is tested BEFORE the generic overlap test because a contained
# interval always overlaps its container; testing overlap first would hide it.
same_elems = []
overlaping_elems = []
inside_elems = []
inside_elems2 = []
for name1, data1 in pos_data_grouped:
    # Chromosomes without negative elements cannot produce any match
    if name1 not in neg_data_grouped.groups:
        continue
    data2 = neg_data_grouped.get_group(name1)

    print("="*50)
    print(f"Checking chromosome {name1}")
    for _, elem1 in data1.iterrows():
        for _, elem2 in data2.iterrows():
            if elem1["interval"] == elem2["interval"]:
                print(f"Found same element in chromosome {name1}")
                same_elems.append(elem1)
            elif elem2["sstart"] <= elem1["sstart"] and elem1["send"] <= elem2["send"]:
                # Positive element lies completely within the negative element
                print(f"Found positive element inside a negative element in chromosome {name1}")
                inside_elems.append(elem1)
            elif elem1["sstart"] <= elem2["sstart"] and elem2["send"] <= elem1["send"]:
                # Negative element lies completely within the positive element
                print(f"Found negative element inside a positive element in chromosome {name1}")
                inside_elems2.append(elem2)
            elif elem1["interval"].overlaps(elem2["interval"]):
                # Intervals share positions but neither contains the other
                print(f"Found overlaping element in chromosome {name1}")
                overlaping_elems.append([elem1, elem2])

print("*"*50)
print(f"Found {len(same_elems)} elements that are the same")
print(f"Found {len(overlaping_elems)} elements that are overlaping")
print(f"Found {len(inside_elems)} positive elements that are inside a negative element")
print(f"Found {len(inside_elems2)} negative elements that are inside a positive element")


Checking chromosome LinJ.01
Checking chromosome LinJ.02


Checking chromosome LinJ.03
Checking chromosome LinJ.04
Checking chromosome LinJ.05
Checking chromosome LinJ.06
Checking chromosome LinJ.07
Checking chromosome LinJ.08
Checking chromosome LinJ.09
Checking chromosome LinJ.10
Checking chromosome LinJ.11
Checking chromosome LinJ.12
Checking chromosome LinJ.13
Checking chromosome LinJ.14
Checking chromosome LinJ.15
Checking chromosome LinJ.16
Checking chromosome LinJ.17
Checking chromosome LinJ.18
Checking chromosome LinJ.19
Checking chromosome LinJ.20
Checking chromosome LinJ.21
Checking chromosome LinJ.22
Checking chromosome LinJ.23
Checking chromosome LinJ.24
Checking chromosome LinJ.25
Checking chromosome LinJ.26
Checking chromosome LinJ.27
Checking chromosome LinJ.28
Checking chromosome LinJ.29
Checking chromosome LinJ.30
Checking chromosome LinJ.31
Checking chromosome LinJ.32
Checking chromosome LinJ.33
Checking chromosome LinJ.34
Checking chromosome LinJ.35
Checking chromosome LinJ.36
************************************************

# 4. Save data

In [28]:
# Output folder; created up front so both writes below have a destination
path_folder = "./3.new_data_v2"
os.makedirs(path_folder, exist_ok=True)

# Destination files inside that folder
path_pos_data = os.path.join(path_folder, "positive_database.csv")
path_neg_data = os.path.join(path_folder, "negative_database.csv")

# Write both tables as comma-separated files with a header row and no index column
pos_data.to_csv(path_pos_data, sep=",", header=True, index=False)
neg_data.to_csv(path_neg_data, sep=",", header=True, index=False)