Author: Ronny F. Pacheco Date: Sep 2024
Copyright: © 2024 Ronny Pacheco License: MIT License

---

MIT License

Copyright (c) 2024 Ronny Pacheco

Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

# Needed modules

In [1]:
# Load the needed libraries
import pickle
import os
from tabnanny import check
from tarfile import data_filter

import pandas as pd
import numpy as np
import json

In [2]:
# https://kioku-space.com/en/jupyter-skip-execution/
from IPython.core.magic import register_cell_magic # type: ignore


@register_cell_magic
def skip(line, cell):
    print('Skipping cell')
    if line and cell:
        pass
    return

# Pickle save

In [3]:
%%skip
# =============================================================================
# main function
# =============================================================================
def data_save_load(option):
    """
    This function is used to save or load data for the jupyter notebook
    """
    path_folder = "ipynb_db"  # Folder to save variables
    os.makedirs(path_folder, exist_ok=True)  # Create folder if not exist
    notebook_name = os.path.basename(os.path.abspath(''))
    path_file = os.path.join(path_folder, f"{notebook_name}.variables.pkl") # Path to save the variables

    if option == "save":
        with open(path_file, "wb") as pickle_file:
            dict_variables = {
                "neg_to_filter" : neg_to_filter
            }
            pickle.dump(dict_variables, pickle_file)
    elif option == "load":
        with open(path_file, "rb") as pickle_file:
            variables = pickle.load(pickle_file)
        # Now load the variables
        for pickle_key, pickle_value in variables.items():
            print(f"* Loading variable: {pickle_key}")
            globals()[pickle_key] = pickle_value
# =============================================================================
# Call the function
# =============================================================================
data_save_load(option="load")

Skipping cell


# Prepare Data

In [4]:
neg_to_filter = pd.read_csv("./data/neg_to_filter.csv", sep=",", header=0)
print(neg_to_filter.shape)
print(neg_to_filter.dtypes)
neg_to_filter.head()

(1851, 16)
neg_name         object
chrom            object
start             int64
end               int64
strand           object
gene_id          object
transcript_id    object
parent_id        object
gene               bool
transcript         bool
CDS                bool
3utr               bool
5utr               bool
pseudogen        object
notes            object
interval         object
dtype: object


Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
0,rejected_noCDS_c01.10,LinJ.01,34736,37218,-,LINF_010006300,,,True,False,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
1,rejected_noCDS_c01.10,LinJ.01,34736,37218,-,,LINF_01T0006300,LINF_010006300,False,True,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
2,rejected_noCDS_c01.10,LinJ.01,34736,36818,-,,,LINF_01T0006300,False,False,False,True,False,,,"[34736, 36818]"
3,rejected_noCDS_c01.20,LinJ.01,114146,116224,+,LINF_010009600,,,True,False,False,False,False,,nicotinamidase|PNC1,"[114146, 116224]"
4,rejected_noCDS_c01.20,LinJ.01,114146,116224,+,,LINF_01T0009600,LINF_010009600,False,True,False,False,False,,nicotinamidase|PNC1,"[114146, 116224]"


## Cleaning phase

### Defining functions

In [5]:
def search_string(data_frame, searching_string):
    """
    :param data_frame: The DataFrame to search within. Must contain a 'notes' column to perform string matching.
    :param searching_string: The string to search for within the 'notes' column of the DataFrame.
    :return: A filtered DataFrame that contains only rows where 'notes' contains the searching_string, ignoring case.
    """
    filtered_df = data_frame[data_frame['notes'].fillna('').str.contains(searching_string, case=False)]
    print(f"The of filtered data: {filtered_df.shape}")
    print(f"The unique values in column 'neg_name': {filtered_df['neg_name'].nunique()}")
    return filtered_df

def checking_data(data_frame):
    """
    :param data_frame: pandas DataFrame that is being checked
    :return: None
    """
    print(f"Shape of the data frame is: {data_frame.shape}")
    print(f"Number of unique values in column 'neg_name': {data_frame['neg_name'].nunique()} ")

def check_family(data_frame, family_name):
    """
    Filters the DataFrame for rows where 'neg_name' contains the specified family_name pattern and 'gene' is True.
    
    Args:
    data_frame (pd.DataFrame): The DataFrame to filter.
    family_name (str): The pattern to match in 'neg_name'.
    
    Returns:
    pd.DataFrame: The filtered DataFrame with selected columns.
    """
    pattern = fr'\d+{family_name}$'
    filtered_data = data_frame.loc[
        (data_frame['neg_name'].str.contains(pattern, na=False)) & (data_frame['gene'] == True),
        ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
    ]
    return filtered_data
 

def group_and_count(data_frame, group_column):
    """
    :param data_frame: The input DataFrame containing the data to be grouped and counted.
    :param group_column: The column name used to group the data_frame.
    :return: A sorted DataFrame with unique notes from the group_column and their associated counts.
    """
    grouped_df = data_frame.groupby(group_column)
    grouped_column_counter = {}
    loc_counter_global = {}
    for _, group_data in grouped_df:
        notes = group_data['notes'].unique()
        for element in notes:
            if element is not None:
                if element not in grouped_column_counter:
                    grouped_column_counter[element] = 1
                else:
                    grouped_column_counter[element] += 1
                    
        loc_data = group_data[["gene", "transcript", "CDS", "3utr", "5utr"]].sum()
        for loc_element in loc_data.index:
            if loc_data[loc_element] > 0:
                if loc_element not in loc_counter_global:
                    loc_counter_global[loc_element] = 1
                else:
                    loc_counter_global[loc_element] += 1
        
    notes_counter_global_sorted = dict(sorted(grouped_column_counter.items(), key=lambda x: x[1], reverse=True))
    notes_counter_global_sorted_df = pd.DataFrame(notes_counter_global_sorted.items(), columns=["notes", "count"])
    print(loc_counter_global)
    return notes_counter_global_sorted_df

### Check function and location


In [6]:
checking_data(neg_to_filter)
### Defining functions
neg_to_filter.head()

Shape of the data frame is: (1851, 16)
Number of unique values in column 'neg_name': 640 


Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
0,rejected_noCDS_c01.10,LinJ.01,34736,37218,-,LINF_010006300,,,True,False,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
1,rejected_noCDS_c01.10,LinJ.01,34736,37218,-,,LINF_01T0006300,LINF_010006300,False,True,False,False,False,,hypothetical_protein_-_conserved__,"[34736, 37218]"
2,rejected_noCDS_c01.10,LinJ.01,34736,36818,-,,,LINF_01T0006300,False,False,False,True,False,,,"[34736, 36818]"
3,rejected_noCDS_c01.20,LinJ.01,114146,116224,+,LINF_010009600,,,True,False,False,False,False,,nicotinamidase|PNC1,"[114146, 116224]"
4,rejected_noCDS_c01.20,LinJ.01,114146,116224,+,,LINF_01T0009600,LINF_010009600,False,True,False,False,False,,nicotinamidase|PNC1,"[114146, 116224]"


In [7]:
group_and_count(neg_to_filter, "neg_name")

{'gene': 640, 'transcript': 640, '3utr': 400, '5utr': 52, 'CDS': 4}


Unnamed: 0,notes,count
0,,638
1,hypothetical_protein_-_conserved,88
2,protein_of_unknown_function_-_conserved,31
3,amastin_surface_glycoprotein_-_putative,23
4,hypothetical_protein,12
...,...,...
210,Haloacid_dehalogenase-like_hydrolase,1
211,protein-l-isoaspartate_o-methyltransferase_-_p...,1
212,2′-O-ribose_methyltransferase|MTr1,1
213,mitochondrial_carrier_protein_-_putative,1


### Checking: Hypothetical protein

We'll see what to do with this kind of data. In this case probably we won't do anything because we don't know what the gene does.

In [8]:
# Let's search for "hypothetical_protein" and add them to a new data frame
filter_data = search_string(neg_to_filter, "hypothetical_protein")

The of filtered data: (208, 16)
The unique values in column 'neg_name': 104


In [9]:
# Now let's take from "neg_to_filter" all the rows that have the same "neg_name" as in "good_negatives"
good_negatives = neg_to_filter[neg_to_filter['neg_name'].isin(filter_data['neg_name'])]
checking_data(good_negatives)

Shape of the data frame is: (349, 16)
Number of unique values in column 'neg_name': 104 


In [10]:
# Remove them now:
neg_filtered = neg_to_filter[~neg_to_filter['neg_name'].isin(good_negatives['neg_name'])]
# Let's check the data frame now
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1502, 16)
Number of unique values in column 'neg_name': 536 
{'gene': 536, 'transcript': 536, '5utr': 34, '3utr': 307, 'CDS': 4}


Unnamed: 0,notes,count
0,,534
1,protein_of_unknown_function_-_conserved,31
2,amastin_surface_glycoprotein_-_putative,23
3,glucose_transporter,9
4,amastin-like_protein,8
...,...,...
200,Haloacid_dehalogenase-like_hydrolase,1
201,protein-l-isoaspartate_o-methyltransferase_-_p...,1
202,2′-O-ribose_methyltransferase|MTr1,1
203,mitochondrial_carrier_protein_-_putative,1


### Checking: Protein of unknown function

In [11]:
filter_data = search_string(neg_filtered, "protein_of_unknown_function")
filter_data['notes'].value_counts()

The of filtered data: (84, 16)
The unique values in column 'neg_name': 39


notes
protein_of_unknown_function_-_conserved                                                                                                 64
protein_of_unknown_function_(DUF3184)                                                                                                    4
Protein_of_unknown_function_(DUF962),_putative                                                                                           2
Protein_of_unknown_function_(DUF2946)_-_putative                                                                                         2
Protein_of_unknown_function_(DUF775)_-_putative                                                                                          2
Protein_of_unknown_function_-_conserved_(L1p/L10e_family)                                                                                2
Protein_of_unknown_function_N-terminal_domain_(DUF2450)/Sec8_exocyst_complex_component_specific_domain_containing_protein_-_putative     2
Protein_of_unknown_fu

In [12]:
filter_data[filter_data["gene"] == True]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
47,rejected_noCDS_c03.190,LinJ.03,235179,238900,+,LINF_030011300,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[235179, 238900]"
107,rejected_noCDS_c06.380,LinJ.06,62983,65883,-,LINF_060007000,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[62983, 65883]"
113,rejected_noCDS_c06.410,LinJ.06,402727,406088,+,LINF_060014900,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[402727, 406088]"
292,rejected_noCDS_c12.1180S,LinJ.12,372226,374050,+,LINF_120012350,,,True,False,False,False,False,,"Protein_of_unknown_function_(DUF962),_putative","[372226, 374050]"
295,rejected_noCDS_c12.1180S,LinJ.12,374051,377775,+,LINF_120012400,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[374051, 377775]"
298,rejected_noCDS_c12.1190,LinJ.12,374051,377775,+,LINF_120012400,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[374051, 377775]"
323,rejected_noCDS_c12.1240S,LinJ.12,429222,431627,+,LINF_120013800,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[429222, 431627]"
338,rejected_noCDS_c12.1290T,LinJ.12,528652,535332,+,LINF_120016200,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[528652, 535332]"
340,rejected_noCDS_c12.1300U,LinJ.12,528652,535332,+,LINF_120016200,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[528652, 535332]"
342,rejected_noCDS_c12.1310,LinJ.12,528652,535332,+,LINF_120016200,,,True,False,False,False,False,,protein_of_unknown_function_-_conserved,"[528652, 535332]"


In [13]:
filter_data[
    (filter_data["gene"] == True) & (filter_data["notes"] == "protein_of_unknown_function_(DUF3184)")
]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
463,rejected_noCDS_c16.1760Y,LinJ.16,371773,376703,+,LINF_160015800,,,True,False,False,False,False,,protein_of_unknown_function_(DUF3184),"[371773, 376703]"
466,rejected_noCDS_c16.1770Y,LinJ.16,377060,383329,+,LINF_160015820,,,True,False,False,False,False,,protein_of_unknown_function_(DUF3184),"[377060, 383329]"


In [14]:
# Now let's take the all the elements with the same "neg_name" as `filter_data`
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (136, 16)
Number of unique values in column 'neg_name': 39 


In [15]:
# Now let's add the lines in `filter_data` to `good_negatives`
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (485, 16)
Number of unique values in column 'neg_name': 143 


In [16]:
# Now let's remove the `filter_data` data from `neg_filtered`
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)


Shape of the data frame is: (1366, 16)
Number of unique values in column 'neg_name': 497 


In [17]:
# Check the global functions:
group_and_count(neg_filtered, "neg_name")

{'gene': 497, 'transcript': 497, '5utr': 26, '3utr': 271, 'CDS': 4}


Unnamed: 0,notes,count
0,,495
1,amastin_surface_glycoprotein_-_putative,23
2,glucose_transporter,9
3,amastin-like_protein,8
4,UDP-galactose_transporter|LPG5A,7
...,...,...
188,Haloacid_dehalogenase-like_hydrolase,1
189,protein-l-isoaspartate_o-methyltransferase_-_p...,1
190,2′-O-ribose_methyltransferase|MTr1,1
191,mitochondrial_carrier_protein_-_putative,1


### Checking: amastin

In [18]:
# Checking contents with amastin
filter_data = search_string(neg_filtered, "amastin")
filter_data['notes'].value_counts()

The of filtered data: (86, 16)
The unique values in column 'neg_name': 43


notes
amastin_surface_glycoprotein_-_putative    46
amastin-like_protein                       16
amastin-like_surface_protein_-_putative    12
amastin_surface_glycoprotein                8
amastin_surface_protein                     2
amastin-like_surface_protein                2
Name: count, dtype: int64

In [19]:
filter_data[filter_data["gene"] == True]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
197,rejected_noCDS_c08.770N,LinJ.08,337654,340312,+,LINF_080012900,,,True,False,False,False,False,,amastin-like_protein,"[337654, 340312]"
1050,rejected_noCDS_c30.3870,LinJ.30,274753,277962,+,LINF_300014200,,,True,False,False,False,False,,amastin_surface_protein,"[274753, 277962]"
1118,rejected_noCDS_c31.4130,LinJ.31,150123,152831,-,LINF_310009800,,,True,False,False,False,False,,amastin_surface_glycoprotein,"[150123, 152831]"
1121,rejected_noCDS_c31.4120,LinJ.31,150123,152831,-,LINF_310009800,,,True,False,False,False,False,,amastin_surface_glycoprotein,"[150123, 152831]"
1372,rejected_noCDS_c34.5090BR,LinJ.34,423724,426392,+,LINF_340015400,,,True,False,False,False,False,,amastin-like_surface_protein_-_putative,"[423724, 426392]"
1377,rejected_noCDS_c34.5110BR,LinJ.34,432504,435210,+,LINF_340015500,,,True,False,False,False,False,,amastin-like_protein,"[432504, 435210]"
1380,rejected_noCDS_c34.5120BR,LinJ.34,445685,448392,+,LINF_340015800,,,True,False,False,False,False,,amastin-like_protein,"[445685, 448392]"
1383,rejected_noCDS_c34.5130BR,LinJ.34,450104,453599,+,LINF_340015900,,,True,False,False,False,False,,amastin-like_protein,"[450104, 453599]"
1388,rejected_noCDS_c34.5140BS,LinJ.34,513172,516089,-,LINF_340017500,,,True,False,False,False,False,,amastin-like_protein,"[513172, 516089]"
1401,rejected_noCDS_c34.5190,LinJ.34,739819,743551,-,LINF_340022900,,,True,False,False,False,False,,amastin-like_protein,"[739819, 743551]"


Since whe checked that these elements al from the repetitive elements "amastin", let's delete them

In [20]:
# Now let's take the all the elements with the same "neg_name" as `filter_data`
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (141, 16)
Number of unique values in column 'neg_name': 43 


In [21]:
# Let's remove them from `neg_filtered`
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1225, 16)
Number of unique values in column 'neg_name': 454 
{'gene': 454, 'transcript': 454, '5utr': 25, '3utr': 229, 'CDS': 4}


Unnamed: 0,notes,count
0,,452
1,glucose_transporter,9
2,UDP-galactose_transporter|LPG5A,7
3,Tripartite_attachment_complex_40|TAC40,6
4,phosphoglycan_beta_1-3_galactosyltransferase,5
...,...,...
182,Haloacid_dehalogenase-like_hydrolase,1
183,protein-l-isoaspartate_o-methyltransferase_-_p...,1
184,2′-O-ribose_methyltransferase|MTr1,1
185,mitochondrial_carrier_protein_-_putative,1


### Checking: glucose

In [22]:
filter_data = search_string(neg_filtered, "glucose")
filter_data['notes'].value_counts()

The of filtered data: (18, 16)
The unique values in column 'neg_name': 9


notes
glucose_transporter    18
Name: count, dtype: int64

In [23]:
filter_data[filter_data["gene"] == True]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
1807,rejected_noCDS_c36.6650CS,LinJ.36,2456799,2460093,-,LINF_360072900,,,True,False,False,False,False,,glucose_transporter,"[2456799, 2460093]"
1810,rejected_noCDS_c36.6660CS,LinJ.36,2460371,2463651,-,LINF_360073000,,,True,False,False,False,False,,glucose_transporter,"[2460371, 2463651]"
1813,rejected_noCDS_c36.6670CS,LinJ.36,2463918,2467210,-,LINF_360073100,,,True,False,False,False,False,,glucose_transporter,"[2463918, 2467210]"
1816,rejected_noCDS_c36.6680CS,LinJ.36,2467477,2470767,-,LINF_360073200,,,True,False,False,False,False,,glucose_transporter,"[2467477, 2470767]"
1819,rejected_noCDS_c36.6690CS,LinJ.36,2471045,2474323,-,LINF_360073300,,,True,False,False,False,False,,glucose_transporter,"[2471045, 2474323]"
1822,rejected_noCDS_c36.6700CS,LinJ.36,2474601,2477880,-,LINF_360073400,,,True,False,False,False,False,,glucose_transporter,"[2474601, 2477880]"
1825,rejected_noCDS_c36.6710CS,LinJ.36,2478158,2481440,-,LINF_360073500,,,True,False,False,False,False,,glucose_transporter,"[2478158, 2481440]"
1828,rejected_noCDS_c36.6720CS,LinJ.36,2481878,2486596,-,LINF_360073600,,,True,False,False,False,False,,glucose_transporter,"[2481878, 2486596]"
1831,rejected_noCDS_c36.6730,LinJ.36,2481878,2486596,-,LINF_360073600,,,True,False,False,False,False,,glucose_transporter,"[2481878, 2486596]"


Here is all the CS family and 6730. Let's remove them

In [24]:
# Take all elements with those names
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (27, 16)
Number of unique values in column 'neg_name': 9 


In [25]:
# Let's remove them from `neg_filtered`
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1198, 16)
Number of unique values in column 'neg_name': 445 
{'gene': 445, 'transcript': 445, '5utr': 25, '3utr': 220, 'CDS': 4}


Unnamed: 0,notes,count
0,,443
1,UDP-galactose_transporter|LPG5A,7
2,Tripartite_attachment_complex_40|TAC40,6
3,phosphoglycan_beta_1-3_galactosyltransferase,5
4,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,5
...,...,...
181,Haloacid_dehalogenase-like_hydrolase,1
182,protein-l-isoaspartate_o-methyltransferase_-_p...,1
183,2′-O-ribose_methyltransferase|MTr1,1
184,mitochondrial_carrier_protein_-_putative,1


### Checking: galactose

In [26]:
filter_data = search_string(neg_filtered, "galactose")
filter_data['notes'].value_counts()

The of filtered data: (16, 16)
The unique values in column 'neg_name': 7


notes
UDP-galactose_transporter|LPG5A    16
Name: count, dtype: int64

In [27]:
filter_data[filter_data["gene"] == True]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
727,rejected_noCDS_c24.2720,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
729,rejected_noCDS_c24.2730,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
731,rejected_noCDS_c24.2740,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
733,rejected_noCDS_c24.2750,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
735,rejected_noCDS_c24.2760,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
737,rejected_noCDS_c24.2710AJ,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
740,rejected_noCDS_c24.2770AJ,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
748,rejected_noCDS_c24.2770AJ,LinJ.24,117721,122337,+,LINF_240008400,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[117721, 122337]"


In [28]:
# Check all elements in `neg_to_filter` that ends in "number + J" in "neg_name" column
neg_to_filter[neg_to_filter['neg_name'].str.contains(r'\d+AJ$', na=False)]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
725,rejected_noCDS_c24.2710AJ,LinJ.24,108132,110204,+,LINF_240008250,,,True,False,False,False,False,,,"[108132, 110204]"
726,rejected_noCDS_c24.2710AJ,LinJ.24,108132,110204,+,,LINF_24T0008250,LINF_240008250,False,True,False,False,False,,,"[108132, 110204]"
737,rejected_noCDS_c24.2710AJ,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
738,rejected_noCDS_c24.2710AJ,LinJ.24,110205,117720,+,,LINF_24T0008300,LINF_240008300,False,True,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
739,rejected_noCDS_c24.2710AJ,LinJ.24,110205,110267,+,,,LINF_24T0008300,False,False,False,False,True,,,"[110205, 110267]"
740,rejected_noCDS_c24.2770AJ,LinJ.24,110205,117720,+,LINF_240008300,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
741,rejected_noCDS_c24.2770AJ,LinJ.24,110205,117720,+,,LINF_24T0008300,LINF_240008300,False,True,False,False,False,,UDP-galactose_transporter|LPG5A,"[110205, 117720]"
747,rejected_noCDS_c24.2770AJ,LinJ.24,111675,117720,+,,,LINF_24T0008300,False,False,False,True,False,,,"[111675, 117720]"
748,rejected_noCDS_c24.2770AJ,LinJ.24,117721,122337,+,LINF_240008400,,,True,False,False,False,False,,UDP-galactose_transporter|LPG5A,"[117721, 122337]"
749,rejected_noCDS_c24.2770AJ,LinJ.24,117721,122337,+,,LINF_24T0008400,LINF_240008400,False,True,False,False,False,,UDP-galactose_transporter|LPG5A,"[117721, 122337]"


Interesting the members from the family. rejected_noCDS_c24.2790AJ is new here

In [29]:
# Check all rows with the "neg_names"
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (26, 16)
Number of unique values in column 'neg_name': 7 


In [30]:
# Let's save all of them for now
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (511, 16)
Number of unique values in column 'neg_name': 150 


In [31]:
# And let's remove them from `neg_filtered`
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1172, 16)
Number of unique values in column 'neg_name': 438 
{'gene': 438, 'transcript': 438, '5utr': 23, '3utr': 214, 'CDS': 4}


Unnamed: 0,notes,count
0,,436
1,Tripartite_attachment_complex_40|TAC40,6
2,phosphoglycan_beta_1-3_galactosyltransferase,5
3,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,5
4,alpha/beta_hydrolase,3
...,...,...
180,Haloacid_dehalogenase-like_hydrolase,1
181,protein-l-isoaspartate_o-methyltransferase_-_p...,1
182,2′-O-ribose_methyltransferase|MTr1,1
183,mitochondrial_carrier_protein_-_putative,1


### Checking: tripartite

In [32]:
filter_data = search_string(neg_filtered, "tripartite")
filter_data['notes'].value_counts()

The of filtered data: (12, 16)
The unique values in column 'neg_name': 6


notes
Tripartite_attachment_complex_40|TAC40    12
Name: count, dtype: int64

In [33]:
filter_data[filter_data["gene"] == True]

Unnamed: 0,neg_name,chrom,start,end,strand,gene_id,transcript_id,parent_id,gene,transcript,CDS,3utr,5utr,pseudogen,notes,interval
1604,rejected_noCDS_c34.5900CD,LinJ.34,1372364,1376363,+,LINF_340037600,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1372364, 1376363]"
1606,rejected_noCDS_c34.5910CE,LinJ.34,1372364,1376363,+,LINF_340037600,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1372364, 1376363]"
1608,rejected_noCDS_c34.5920CC,LinJ.34,1372364,1376363,+,LINF_340037600,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1372364, 1376363]"
1615,rejected_noCDS_c34.5940CD,LinJ.34,1391972,1395962,+,LINF_340038100,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1391972, 1395962]"
1617,rejected_noCDS_c34.5950CE,LinJ.34,1391972,1395962,+,LINF_340038100,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1391972, 1395962]"
1619,rejected_noCDS_c34.5960CC,LinJ.34,1391972,1395962,+,LINF_340038100,,,True,False,False,False,False,,Tripartite_attachment_complex_40|TAC40,"[1391972, 1395962]"


In [34]:
# Take all rows with that name
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (18, 16)
Number of unique values in column 'neg_name': 6 


In [35]:
# Let's check families CC
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+CC$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1598,rejected_noCDS_c34.5880CC,LinJ.34,LINF_340037300,mitochondrial_carrier_protein|MCP6,1354107,1357619
1601,rejected_noCDS_c34.5890CC,LinJ.34,LINF_340037350,hypothetical_protein_-_conserved,1358089,1365799
1608,rejected_noCDS_c34.5920CC,LinJ.34,LINF_340037600,Tripartite_attachment_complex_40|TAC40,1372364,1376363
1613,rejected_noCDS_c34.5930CC,LinJ.34,LINF_340037850,,1381967,1383514
1619,rejected_noCDS_c34.5960CC,LinJ.34,LINF_340038100,Tripartite_attachment_complex_40|TAC40,1391972,1395962
1624,rejected_noCDS_c34.5980CC,LinJ.34,LINF_340038800,NAD_dependent_epimerase/dehydratase_family,1420070,1428451


In [36]:
# Let's check family CD
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+CD$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1604,rejected_noCDS_c34.5900CD,LinJ.34,LINF_340037600,Tripartite_attachment_complex_40|TAC40,1372364,1376363
1615,rejected_noCDS_c34.5940CD,LinJ.34,LINF_340038100,Tripartite_attachment_complex_40|TAC40,1391972,1395962


In [37]:
# Let's check family CE
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+CE$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1606,rejected_noCDS_c34.5910CE,LinJ.34,LINF_340037600,Tripartite_attachment_complex_40|TAC40,1372364,1376363
1617,rejected_noCDS_c34.5950CE,LinJ.34,LINF_340038100,Tripartite_attachment_complex_40|TAC40,1391972,1395962


Ok now let's remove them from the data

In [38]:
# Remove the from `neg_filtered`
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1154, 16)
Number of unique values in column 'neg_name': 432 
{'gene': 432, 'transcript': 432, '5utr': 23, '3utr': 208, 'CDS': 4}


Unnamed: 0,notes,count
0,,430
1,phosphoglycan_beta_1-3_galactosyltransferase,5
2,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,5
3,alpha/beta_hydrolase,3
4,guanosine_monophosphate_reductase|GMPR,3
...,...,...
179,Haloacid_dehalogenase-like_hydrolase,1
180,protein-l-isoaspartate_o-methyltransferase_-_p...,1
181,2′-O-ribose_methyltransferase|MTr1,1
182,mitochondrial_carrier_protein_-_putative,1


### Checking: phosphoglycan

In [39]:
filter_data = search_string(neg_filtered, "phosphoglycan")
filter_data['notes'].value_counts()

The of filtered data: (10, 16)
The unique values in column 'neg_name': 5


notes
phosphoglycan_beta_1-3_galactosyltransferase    10
Name: count, dtype: int64

In [40]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
13,rejected_noCDS_c02.80,LinJ.02,LINF_020006700,phosphoglycan_beta_1-3_galactosyltransferase,73083,83537
16,rejected_noCDS_c02.90B,LinJ.02,LINF_020007000,phosphoglycan_beta_1-3_galactosyltransferase,93941,102762
19,rejected_noCDS_c02.100,LinJ.02,LINF_020007000,phosphoglycan_beta_1-3_galactosyltransferase,93941,102762
22,rejected_noCDS_c02.110,LinJ.02,LINF_020007000,phosphoglycan_beta_1-3_galactosyltransferase,93941,102762
25,rejected_noCDS_c02.120,LinJ.02,LINF_020007000,phosphoglycan_beta_1-3_galactosyltransferase,93941,102762


In [41]:
# Let's check family "B"
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+B$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
16,rejected_noCDS_c02.90B,LinJ.02,LINF_020007000,phosphoglycan_beta_1-3_galactosyltransferase,93941,102762
28,rejected_noCDS_c02.130B,LinJ.02,LINF_020007950,,137296,139716
41,rejected_noCDS_c02.160B,LinJ.02,LINF_020009400,Transmembrane_amino_acid_transporter_protein_-...,196594,201033


In [42]:
# Take all elements
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (15, 16)
Number of unique values in column 'neg_name': 5 


In [43]:
# Let's add them to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (526, 16)
Number of unique values in column 'neg_name': 155 


In [44]:
# Let's remove them from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1139, 16)
Number of unique values in column 'neg_name': 427 
{'gene': 427, 'transcript': 427, '5utr': 23, '3utr': 203, 'CDS': 4}


Unnamed: 0,notes,count
0,,425
1,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,5
2,alpha/beta_hydrolase,3
3,guanosine_monophosphate_reductase|GMPR,3
4,ATG8/AUT7/APG8/PAZ2,3
...,...,...
178,Haloacid_dehalogenase-like_hydrolase,1
179,protein-l-isoaspartate_o-methyltransferase_-_p...,1
180,2′-O-ribose_methyltransferase|MTr1,1
181,mitochondrial_carrier_protein_-_putative,1


### Checking: ketoacyl

In [45]:
filter_data = search_string(neg_filtered, "ketoacyl")
filter_data['notes'].value_counts()

The of filtered data: (12, 16)
The unique values in column 'neg_name': 6


notes
Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3    10
Ketoacyl-CoA_synthase|fatty_acid_elongase|ELO-3     2
Name: count, dtype: int64

In [46]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
374,rejected_noCDS_c14.1430V,LinJ.14,LINF_140012400,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,255152,267139
377,rejected_noCDS_c14.1440V,LinJ.14,LINF_140012400,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,255152,267139
380,rejected_noCDS_c14.1450V,LinJ.14,LINF_140012400,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,255152,267139
383,rejected_noCDS_c14.1460V,LinJ.14,LINF_140012400,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,255152,267139
386,rejected_noCDS_c14.1470V,LinJ.14,LINF_140012400,Ketoacyl-CoA_synthase|Fatty_acid_elongase|ELO-3,255152,267139
389,rejected_noCDS_c14.1480,LinJ.14,LINF_140012800,Ketoacyl-CoA_synthase|fatty_acid_elongase|ELO-3,280945,285205


In [47]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (18, 16)
Number of unique values in column 'neg_name': 6 


In [48]:
# Add to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (544, 16)
Number of unique values in column 'neg_name': 161 


In [49]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1121, 16)
Number of unique values in column 'neg_name': 421 
{'gene': 421, 'transcript': 421, '5utr': 23, '3utr': 197, 'CDS': 4}


Unnamed: 0,notes,count
0,,419
1,alpha/beta_hydrolase,3
2,guanosine_monophosphate_reductase|GMPR,3
3,ATG8/AUT7/APG8/PAZ2,3
4,methionine_aminopeptidase_2|MetAP2,3
...,...,...
176,Haloacid_dehalogenase-like_hydrolase,1
177,protein-l-isoaspartate_o-methyltransferase_-_p...,1
178,2′-O-ribose_methyltransferase|MTr1,1
179,mitochondrial_carrier_protein_-_putative,1


### Checking alpha/beta

In [50]:
filter_data = search_string(neg_filtered, "alpha/beta")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
alpha/beta_hydrolase    6
Name: count, dtype: int64

In [51]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
349,rejected_noCDS_c12.1330,LinJ.12,LINF_120017600,alpha/beta_hydrolase,581574,586501
351,rejected_noCDS_c12.1340T,LinJ.12,LINF_120017600,alpha/beta_hydrolase,581574,586501
353,rejected_noCDS_c12.1350U,LinJ.12,LINF_120017600,alpha/beta_hydrolase,581574,586501


In [52]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (9, 16)
Number of unique values in column 'neg_name': 3 


In [53]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (553, 16)
Number of unique values in column 'neg_name': 164 


In [54]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, 'neg_name')

Shape of the data frame is: (1112, 16)
Number of unique values in column 'neg_name': 418 
{'gene': 418, 'transcript': 418, '5utr': 23, '3utr': 194, 'CDS': 4}


Unnamed: 0,notes,count
0,,416
1,guanosine_monophosphate_reductase|GMPR,3
2,ATG8/AUT7/APG8/PAZ2,3
3,methionine_aminopeptidase_2|MetAP2,3
4,argininosuccinate_synthase|Citrulline-aspartat...,3
...,...,...
175,Haloacid_dehalogenase-like_hydrolase,1
176,protein-l-isoaspartate_o-methyltransferase_-_p...,1
177,2′-O-ribose_methyltransferase|MTr1,1
178,mitochondrial_carrier_protein_-_putative,1


### Checking: guanosine

In [55]:
filter_data = search_string(neg_filtered, "GMPR")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
guanosine_monophosphate_reductase|GMPR    6
Name: count, dtype: int64

In [56]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
491,rejected_noCDS_c17.1870,LinJ.17,LINF_170014800,guanosine_monophosphate_reductase|GMPR,389710,393557
494,rejected_noCDS_c17.1880,LinJ.17,LINF_170014800,guanosine_monophosphate_reductase|GMPR,389710,393557
497,rejected_noCDS_c17.1890,LinJ.17,LINF_170014800,guanosine_monophosphate_reductase|GMPR,389710,393557


In [57]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (9, 16)
Number of unique values in column 'neg_name': 3 


In [58]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (562, 16)
Number of unique values in column 'neg_name': 167 


In [59]:
# Let's remove them from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1103, 16)
Number of unique values in column 'neg_name': 415 
{'gene': 415, 'transcript': 415, '5utr': 23, '3utr': 191, 'CDS': 4}


Unnamed: 0,notes,count
0,,413
1,ATG8/AUT7/APG8/PAZ2,3
2,methionine_aminopeptidase_2|MetAP2,3
3,argininosuccinate_synthase|Citrulline-aspartat...,3
4,beta_galactofuranosyl_transferase|LPG1,3
...,...,...
174,Haloacid_dehalogenase-like_hydrolase,1
175,protein-l-isoaspartate_o-methyltransferase_-_p...,1
176,2′-O-ribose_methyltransferase|MTr1,1
177,mitochondrial_carrier_protein_-_putative,1


### Checking. ATG8

In [60]:
filter_data = search_string(neg_filtered, "ATG8")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
ATG8/AUT7/APG8/PAZ2    6
Name: count, dtype: int64

In [61]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
558,rejected_noCDS_c19.2110,LinJ.19,LINF_190013715,ATG8/AUT7/APG8/PAZ2,354993,355737
563,rejected_noCDS_c19.2120AD,LinJ.19,LINF_190013900,ATG8/AUT7/APG8/PAZ2,361081,362554
566,rejected_noCDS_c19.2130AD,LinJ.19,LINF_190014100,ATG8/AUT7/APG8/PAZ2,364663,366138


In [62]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (11, 16)
Number of unique values in column 'neg_name': 3 


In [63]:
# Remove "rejected_noCDS_c19.2110" since it's in CDS
filter_data = filter_data[filter_data['neg_name'] != 'rejected_noCDS_c19.2110']
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [64]:
# Add to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (568, 16)
Number of unique values in column 'neg_name': 169 


In [65]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1097, 16)
Number of unique values in column 'neg_name': 413 
{'gene': 413, 'transcript': 413, '5utr': 23, '3utr': 189, 'CDS': 4}


Unnamed: 0,notes,count
0,,411
1,methionine_aminopeptidase_2|MetAP2,3
2,argininosuccinate_synthase|Citrulline-aspartat...,3
3,beta_galactofuranosyl_transferase|LPG1,3
4,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,2
...,...,...
174,Haloacid_dehalogenase-like_hydrolase,1
175,protein-l-isoaspartate_o-methyltransferase_-_p...,1
176,2′-O-ribose_methyltransferase|MTr1,1
177,mitochondrial_carrier_protein_-_putative,1


### Checking: MetAP2

In [66]:
filter_data = search_string(neg_filtered, "MetAP2")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
methionine_aminopeptidase_2|MetAP2    6
Name: count, dtype: int64

In [67]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
627,rejected_noCDS_c21.2350,LinJ.21,LINF_210014700,methionine_aminopeptidase_2|MetAP2,306597,310529
629,rejected_noCDS_c21.2360,LinJ.21,LINF_210014700,methionine_aminopeptidase_2|MetAP2,306597,310529
631,rejected_noCDS_c21.2370,LinJ.21,LINF_210014700,methionine_aminopeptidase_2|MetAP2,306597,310529


In [68]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (9, 16)
Number of unique values in column 'neg_name': 3 


In [69]:
# Add to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (577, 16)
Number of unique values in column 'neg_name': 172 


In [70]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1088, 16)
Number of unique values in column 'neg_name': 410 
{'gene': 410, 'transcript': 410, '5utr': 23, '3utr': 186, 'CDS': 4}


Unnamed: 0,notes,count
0,,408
1,argininosuccinate_synthase|Citrulline-aspartat...,3
2,beta_galactofuranosyl_transferase|LPG1,3
3,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,2
4,protein_kinase,2
...,...,...
173,Haloacid_dehalogenase-like_hydrolase,1
174,protein-l-isoaspartate_o-methyltransferase_-_p...,1
175,2′-O-ribose_methyltransferase|MTr1,1
176,mitochondrial_carrier_protein_-_putative,1


### Checking arginino

In [71]:
filter_data = search_string(neg_filtered, "arginino")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
argininosuccinate_synthase|Citrulline-aspartate_ligase    6
Name: count, dtype: int64

In [72]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
693,rejected_noCDS_c23.2580AG,LinJ.23,LINF_230007900,argininosuccinate_synthase|Citrulline-aspartat...,93890,100632
695,rejected_noCDS_c23.2590AG,LinJ.23,LINF_230007900,argininosuccinate_synthase|Citrulline-aspartat...,93890,100632
697,rejected_noCDS_c23.2600AG,LinJ.23,LINF_230007900,argininosuccinate_synthase|Citrulline-aspartat...,93890,100632


In [73]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (9, 16)
Number of unique values in column 'neg_name': 3 


In [74]:
# Add to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (586, 16)
Number of unique values in column 'neg_name': 175 


In [75]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1079, 16)
Number of unique values in column 'neg_name': 407 
{'gene': 407, 'transcript': 407, '5utr': 23, '3utr': 183, 'CDS': 4}


Unnamed: 0,notes,count
0,,405
1,beta_galactofuranosyl_transferase|LPG1,3
2,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,2
3,protein_kinase,2
4,dual_specificity_phosphatase-like_protein,2
...,...,...
172,Haloacid_dehalogenase-like_hydrolase,1
173,protein-l-isoaspartate_o-methyltransferase_-_p...,1
174,2′-O-ribose_methyltransferase|MTr1,1
175,mitochondrial_carrier_protein_-_putative,1


### Checking: LPG1

In [76]:
filter_data = search_string(neg_filtered, "LPG1")
filter_data['notes'].value_counts()

The of filtered data: (6, 16)
The unique values in column 'neg_name': 3


notes
beta_galactofuranosyl_transferase|LPG1    6
Name: count, dtype: int64

In [77]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
770,rejected_noCDS_c25.2860AL,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
773,rejected_noCDS_c25.2870AM,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
776,rejected_noCDS_c25.2850AK,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363


In [78]:
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+AL$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
770,rejected_noCDS_c25.2860AL,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
779,rejected_noCDS_c25.2880AL,LinJ.25,LINF_250006300,electron_transfer_flavoprotein,29864,33171
785,rejected_noCDS_c25.2900AL,LinJ.25,LINF_250007950,,81625,83195


In [79]:
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+AM$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
773,rejected_noCDS_c25.2870AM,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
787,rejected_noCDS_c25.2910AM,LinJ.25,LINF_250007950,,81625,83195


In [80]:
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+AK$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
776,rejected_noCDS_c25.2850AK,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
1359,rejected_noCDS_c33.5040AK,LinJ.33,LINF_330042920,,1531735,1532106


In [81]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (9, 16)
Number of unique values in column 'neg_name': 3 


In [82]:
# Add to "good_negatives"
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (595, 16)
Number of unique values in column 'neg_name': 178 


In [83]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1070, 16)
Number of unique values in column 'neg_name': 404 
{'gene': 404, 'transcript': 404, '5utr': 23, '3utr': 180, 'CDS': 4}


Unnamed: 0,notes,count
0,,402
1,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,2
2,protein_kinase,2
3,dual_specificity_phosphatase-like_protein,2
4,vacuolar-type_Ca2+-ATPase_-_putative,2
...,...,...
171,Haloacid_dehalogenase-like_hydrolase,1
172,protein-l-isoaspartate_o-methyltransferase_-_p...,1
173,2′-O-ribose_methyltransferase|MTr1,1
174,mitochondrial_carrier_protein_-_putative,1


### Checking: cassette

In [84]:
filter_data = search_string(neg_filtered, "cassette")
filter_data['notes'].value_counts()

The of filtered data: (10, 16)
The unique values in column 'neg_name': 5


notes
ATP-binding_cassette_subfamily_A_-_member_1|ABCA1           4
ATP-binding_cassette_protein_subfamily_A,_member_7|ABCA7    4
ATP-binding_cassette_protein_subfamily_B,_member_2|ABCB2    2
Name: count, dtype: int64

In [85]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
32,rejected_noCDS_c02.140,LinJ.02,LINF_020008000,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,139717,150152
35,rejected_noCDS_c02.150,LinJ.02,LINF_020008000,ATP-binding_cassette_subfamily_A_-_member_1|ABCA1,139717,150152
430,rejected_noCDS_c15.1630,LinJ.15,LINF_150015400,"ATP-binding_cassette_protein_subfamily_A,_memb...",353060,362858
433,rejected_noCDS_c15.1640,LinJ.15,LINF_150015400,"ATP-binding_cassette_protein_subfamily_A,_memb...",353060,362858
860,rejected_noCDS_c26.3190,LinJ.26,LINF_260032600,"ATP-binding_cassette_protein_subfamily_B,_memb...",1034309,1041738


In [86]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (20, 16)
Number of unique values in column 'neg_name': 5 


In [87]:
# Remove from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1050, 16)
Number of unique values in column 'neg_name': 399 
{'gene': 399, 'transcript': 399, '5utr': 22, '3utr': 175, 'CDS': 4}


Unnamed: 0,notes,count
0,,397
1,protein_kinase,2
2,dual_specificity_phosphatase-like_protein,2
3,vacuolar-type_Ca2+-ATPase_-_putative,2
4,malonyl-coa_decarboxylase-like_protein,2
...,...,...
167,Haloacid_dehalogenase-like_hydrolase,1
168,protein-l-isoaspartate_o-methyltransferase_-_p...,1
169,2′-O-ribose_methyltransferase|MTr1,1
170,mitochondrial_carrier_protein_-_putative,1


### Checking: r"protein_kinase|kinase_protein"

In [88]:
filter_data = search_string(neg_filtered, r"protein_kinase|kinase_protein")
filter_data['notes'].value_counts()

The of filtered data: (20, 16)
The unique values in column 'neg_name': 10


notes
protein_kinase                                   4
mitogen-activated_protein_kinase_3|MPK3|MAPK3    4
serine/threonine_protein_kinase                  4
Protein_kinase_domain-containing_protein         4
Protein_kinase_(STE_family)                      2
Cdc2-related_protein_kinase_7|CRK7               2
Name: count, dtype: int64

In [89]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
75,rejected_noCDS_c05.290,LinJ.05,LINF_050006200,protein_kinase,36373,40873
259,rejected_noCDS_c10.1040,LinJ.10,LINF_100011600,mitogen-activated_protein_kinase_3|MPK3|MAPK3,266644,271932
261,rejected_noCDS_c10.1050,LinJ.10,LINF_100011600,mitogen-activated_protein_kinase_3|MPK3|MAPK3,266644,271932
397,rejected_noCDS_c14.1510,LinJ.14,LINF_140019400,Protein_kinase_(STE_family),575947,582294
624,rejected_noCDS_c21.2340,LinJ.21,LINF_210006700,serine/threonine_protein_kinase,41211,50310
810,rejected_noCDS_c26.3000,LinJ.26,LINF_260005400,Cdc2-related_protein_kinase_7|CRK7,9244,11310
1187,rejected_noCDS_c31.4400,LinJ.31,LINF_310025700,protein_kinase,910778,916576
1744,rejected_noCDS_c36.6410,LinJ.36,LINF_360021400,serine/threonine_protein_kinase,571329,578055
1785,rejected_noCDS_c36.6580CR,LinJ.36,LINF_360062500,Protein_kinase_domain-containing_protein,2087103,2094370
1788,rejected_noCDS_c36.6590,LinJ.36,LINF_360062500,Protein_kinase_domain-containing_protein,2087103,2094370


In [90]:
neg_to_filter.loc[
    (neg_to_filter['neg_name'].str.contains(r'\d+CR$', na=False)) & (neg_to_filter['gene'] == True),
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1774,rejected_noCDS_c36.6540CR,LinJ.36,LINF_360058750,,1944919,1946935
1779,rejected_noCDS_c36.6560CR,LinJ.36,LINF_360059300,hypothetical_protein_-_conserved,1963667,1973734
1785,rejected_noCDS_c36.6580CR,LinJ.36,LINF_360062500,Protein_kinase_domain-containing_protein,2087103,2094370
1791,rejected_noCDS_c36.6600CR,LinJ.36,LINF_360063300,hypothetical_protein,2117939,2120182


In [91]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (32, 16)
Number of unique values in column 'neg_name': 10 


In [92]:
# Keep only "rejected_noCDS_c36.6580CR"
filter_data = filter_data[filter_data['neg_name'] == 'rejected_noCDS_c36.6580CR']
checking_data(filter_data)

Shape of the data frame is: (3, 16)
Number of unique values in column 'neg_name': 1 


In [93]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (598, 16)
Number of unique values in column 'neg_name': 179 


In [94]:
# Let's remove them from "neg_filtered"
filter_data = search_string(neg_filtered, r"protein_kinase|kinase_protein")
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

The of filtered data: (20, 16)
The unique values in column 'neg_name': 10
Shape of the data frame is: (1018, 16)
Number of unique values in column 'neg_name': 389 
{'gene': 389, 'transcript': 389, '5utr': 22, '3utr': 165, 'CDS': 4}


Unnamed: 0,notes,count
0,,387
1,dual_specificity_phosphatase-like_protein,2
2,vacuolar-type_Ca2+-ATPase_-_putative,2
3,malonyl-coa_decarboxylase-like_protein,2
4,Methionine_aminopeptidase_1|MetAP1,2
...,...,...
161,Haloacid_dehalogenase-like_hydrolase,1
162,protein-l-isoaspartate_o-methyltransferase_-_p...,1
163,2′-O-ribose_methyltransferase|MTr1,1
164,mitochondrial_carrier_protein_-_putative,1


### Checking: dual_specificity

In [95]:
filter_data = search_string(neg_filtered, "dual_specificity")
filter_data['notes'].value_counts()

The of filtered data: (8, 16)
The unique values in column 'neg_name': 4


notes
dual_specificity_phosphatase-like_protein    4
Dual_specificity_protein_phosphatase         4
Name: count, dtype: int64

In [96]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
78,rejected_noCDS_c05.310,LinJ.05,LINF_050007100,dual_specificity_phosphatase-like_protein,66289,73912
80,rejected_noCDS_c05.300D,LinJ.05,LINF_050007100,dual_specificity_phosphatase-like_protein,66289,73912
1490,rejected_noCDS_c34.5490BV,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617
1493,rejected_noCDS_c34.5500BW,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617


In [97]:
check_family(neg_to_filter, 'D')

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
80,rejected_noCDS_c05.300D,LinJ.05,LINF_050007100,dual_specificity_phosphatase-like_protein,66289,73912
84,rejected_noCDS_c05.320D,LinJ.05,LINF_050008400,mitoribosomal_protein_uS11m,104200,106507
87,rejected_noCDS_c05.320D,LinJ.05,LINF_050008500,trypanothione_reductase_(TRYR),106508,108915


In [98]:
check_family(neg_to_filter, 'BV')

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1475,rejected_noCDS_c34.5440BV,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326
1481,rejected_noCDS_c34.5460BV,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956
1490,rejected_noCDS_c34.5490BV,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617


In [99]:
check_family(neg_to_filter, 'BW')

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1478,rejected_noCDS_c34.5450BW,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326
1484,rejected_noCDS_c34.5470BW,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956
1493,rejected_noCDS_c34.5500BW,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617


In [100]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (12, 16)
Number of unique values in column 'neg_name': 4 


In [101]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (610, 16)
Number of unique values in column 'neg_name': 183 


In [102]:
# Let's remove them from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1006, 16)
Number of unique values in column 'neg_name': 385 
{'gene': 385, 'transcript': 385, '5utr': 21, '3utr': 162, 'CDS': 4}


Unnamed: 0,notes,count
0,,383
1,vacuolar-type_Ca2+-ATPase_-_putative,2
2,malonyl-coa_decarboxylase-like_protein,2
3,Methionine_aminopeptidase_1|MetAP1,2
4,mitochondrial_carrier_protein|MCP18,2
...,...,...
159,Haloacid_dehalogenase-like_hydrolase,1
160,protein-l-isoaspartate_o-methyltransferase_-_p...,1
161,2′-O-ribose_methyltransferase|MTr1,1
162,mitochondrial_carrier_protein_-_putative,1


### Checking: vacuolar

In [103]:
filter_data = search_string(neg_filtered, "vacuolar")
filter_data['notes'].value_counts()

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2


notes
vacuolar-type_Ca2+-ATPase_-_putative    4
Name: count, dtype: int64

In [104]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
127,rejected_noCDS_c07.460G,LinJ.07,LINF_070012100,vacuolar-type_Ca2+-ATPase_-_putative,297019,304430
129,rejected_noCDS_c07.470H,LinJ.07,LINF_070012100,vacuolar-type_Ca2+-ATPase_-_putative,297019,304430


In [105]:
check_family(neg_to_filter, "G")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
119,rejected_noCDS_c07.430G,LinJ.07,LINF_070010140,,211168,211744
123,rejected_noCDS_c07.430G,LinJ.07,LINF_070010160,,211951,214227
125,rejected_noCDS_c07.450G,LinJ.07,LINF_070010950,,251481,252207
127,rejected_noCDS_c07.460G,LinJ.07,LINF_070012100,vacuolar-type_Ca2+-ATPase_-_putative,297019,304430


In [106]:
check_family(neg_to_filter, "H")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
121,rejected_noCDS_c07.440H,LinJ.07,LINF_070010160,,211951,214227
129,rejected_noCDS_c07.470H,LinJ.07,LINF_070012100,vacuolar-type_Ca2+-ATPase_-_putative,297019,304430


In [107]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [108]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (616, 16)
Number of unique values in column 'neg_name': 185 


In [109]:
# Let's remove them from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (1000, 16)
Number of unique values in column 'neg_name': 383 
{'gene': 383, 'transcript': 383, '5utr': 21, '3utr': 160, 'CDS': 4}


Unnamed: 0,notes,count
0,,381
1,malonyl-coa_decarboxylase-like_protein,2
2,Methionine_aminopeptidase_1|MetAP1,2
3,mitochondrial_carrier_protein|MCP18,2
4,electron_transfer_flavoprotein,2
...,...,...
158,Haloacid_dehalogenase-like_hydrolase,1
159,protein-l-isoaspartate_o-methyltransferase_-_p...,1
160,2′-O-ribose_methyltransferase|MTr1,1
161,mitochondrial_carrier_protein_-_putative,1


### Checking: adenosyl

In [110]:
filter_data = search_string(neg_filtered, "adenosyl")
filter_data['notes'].value_counts()

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2


notes
S-adenosylmethionine_decarboxylase_proenzyme|ADOMETDC_prozyme    4
Name: count, dtype: int64

In [111]:
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1099,rejected_noCDS_c30.4050,LinJ.30,LINF_300036800,S-adenosylmethionine_decarboxylase_proenzyme|A...,1132567,1136504
1102,rejected_noCDS_c30.4060,LinJ.30,LINF_300036800,S-adenosylmethionine_decarboxylase_proenzyme|A...,1132567,1136504


In [112]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [113]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (622, 16)
Number of unique values in column 'neg_name': 187 


In [114]:
# Let's remove them from "neg_filtered"
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (994, 16)
Number of unique values in column 'neg_name': 381 
{'gene': 381, 'transcript': 381, '5utr': 21, '3utr': 158, 'CDS': 4}


Unnamed: 0,notes,count
0,,379
1,malonyl-coa_decarboxylase-like_protein,2
2,Methionine_aminopeptidase_1|MetAP1,2
3,mitochondrial_carrier_protein|MCP18,2
4,electron_transfer_flavoprotein,2
...,...,...
157,Haloacid_dehalogenase-like_hydrolase,1
158,protein-l-isoaspartate_o-methyltransferase_-_p...,1
159,2′-O-ribose_methyltransferase|MTr1,1
160,mitochondrial_carrier_protein_-_putative,1


### Checking: malonyl

In [117]:
filter_data = search_string(neg_filtered, "malonyl")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
malonyl-coa_decarboxylase-like_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
135,rejected_noCDS_c07.490,LinJ.07,LINF_070013000,malonyl-coa_decarboxylase-like_protein,334632,339908
137,rejected_noCDS_c07.500,LinJ.07,LINF_070013000,malonyl-coa_decarboxylase-like_protein,334632,339908


In [118]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [119]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (628, 16)
Number of unique values in column 'neg_name': 189 


In [121]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (988, 16)
Number of unique values in column 'neg_name': 379 
{'gene': 379, 'transcript': 379, '5utr': 21, '3utr': 156, 'CDS': 4}


Unnamed: 0,notes,count
0,,377
1,Methionine_aminopeptidase_1|MetAP1,2
2,mitochondrial_carrier_protein|MCP18,2
3,electron_transfer_flavoprotein,2
4,sphingolipid_delta_4_desaturase,2
...,...,...
156,Haloacid_dehalogenase-like_hydrolase,1
157,protein-l-isoaspartate_o-methyltransferase_-_p...,1
158,2′-O-ribose_methyltransferase|MTr1,1
159,mitochondrial_carrier_protein_-_putative,1


### Checking: MetAP1

In [122]:
filter_data = search_string(neg_filtered, "MetAP1")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Methionine_aminopeptidase_1|MetAP1    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
547,rejected_noCDS_c19.2070AC,LinJ.19,LINF_190010300,Methionine_aminopeptidase_1|MetAP1,207856,211386
549,rejected_noCDS_c19.2080AB,LinJ.19,LINF_190010300,Methionine_aminopeptidase_1|MetAP1,207856,211386


In [123]:
check_family(neg_to_filter, "AC")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
547,rejected_noCDS_c19.2070AC,LinJ.19,LINF_190010300,Methionine_aminopeptidase_1|MetAP1,207856,211386
553,rejected_noCDS_c19.2090AC,LinJ.19,LINF_190010600,protein_of_unknown_function_-_conserved,224264,231735


In [124]:
check_family(neg_to_filter, "AB")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
544,rejected_noCDS_c19.2060AB,LinJ.19,LINF_190009700,hypothetical_protein_-_conserved,182279,187638
549,rejected_noCDS_c19.2080AB,LinJ.19,LINF_190010300,Methionine_aminopeptidase_1|MetAP1,207856,211386


In [125]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [126]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (634, 16)
Number of unique values in column 'neg_name': 191 


In [127]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (982, 16)
Number of unique values in column 'neg_name': 377 
{'gene': 377, 'transcript': 377, '5utr': 21, '3utr': 154, 'CDS': 4}


Unnamed: 0,notes,count
0,,375
1,mitochondrial_carrier_protein|MCP18,2
2,electron_transfer_flavoprotein,2
3,sphingolipid_delta_4_desaturase,2
4,Amino_acid_transporter|AAT24.1,2
...,...,...
155,Haloacid_dehalogenase-like_hydrolase,1
156,protein-l-isoaspartate_o-methyltransferase_-_p...,1
157,2′-O-ribose_methyltransferase|MTr1,1
158,mitochondrial_carrier_protein_-_putative,1


### Checking: mitochondrial carrier protein | MCP18

In [132]:
filter_data = search_string(neg_filtered, r"MCP18")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
mitochondrial_carrier_protein|MCP18    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
713,rejected_noCDS_c23.2670,LinJ.23,LINF_230023000,mitochondrial_carrier_protein|MCP18,673878,677745
715,rejected_noCDS_c23.2680,LinJ.23,LINF_230023000,mitochondrial_carrier_protein|MCP18,673878,677745


In [133]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [134]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (640, 16)
Number of unique values in column 'neg_name': 193 


In [135]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (976, 16)
Number of unique values in column 'neg_name': 375 
{'gene': 375, 'transcript': 375, '5utr': 21, '3utr': 152, 'CDS': 4}


Unnamed: 0,notes,count
0,,373
1,electron_transfer_flavoprotein,2
2,sphingolipid_delta_4_desaturase,2
3,Amino_acid_transporter|AAT24.1,2
4,Exonuclease_-_putative,2
...,...,...
154,Haloacid_dehalogenase-like_hydrolase,1
155,protein-l-isoaspartate_o-methyltransferase_-_p...,1
156,2′-O-ribose_methyltransferase|MTr1,1
157,mitochondrial_carrier_protein_-_putative,1


### Checking: electron transfer flavoprotein

In [136]:
filter_data = search_string(neg_filtered, "electron")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
electron_transfer_flavoprotein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
779,rejected_noCDS_c25.2880AL,LinJ.25,LINF_250006300,electron_transfer_flavoprotein,29864,33171
782,rejected_noCDS_c25.2890AN,LinJ.25,LINF_250006300,electron_transfer_flavoprotein,29864,33171


In [137]:
check_family(neg_to_filter, "AL")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
770,rejected_noCDS_c25.2860AL,LinJ.25,LINF_250005000,beta_galactofuranosyl_transferase|LPG1,799,5363
779,rejected_noCDS_c25.2880AL,LinJ.25,LINF_250006300,electron_transfer_flavoprotein,29864,33171
785,rejected_noCDS_c25.2900AL,LinJ.25,LINF_250007950,,81625,83195


In [138]:
check_family(neg_to_filter, "AN")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
782,rejected_noCDS_c25.2890AN,LinJ.25,LINF_250006300,electron_transfer_flavoprotein,29864,33171
789,rejected_noCDS_c25.2920AN,LinJ.25,LINF_250007950,,81625,83195


In [139]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [140]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (646, 16)
Number of unique values in column 'neg_name': 195 


In [141]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (970, 16)
Number of unique values in column 'neg_name': 373 
{'gene': 373, 'transcript': 373, '5utr': 21, '3utr': 150, 'CDS': 4}


Unnamed: 0,notes,count
0,,371
1,sphingolipid_delta_4_desaturase,2
2,Amino_acid_transporter|AAT24.1,2
3,Exonuclease_-_putative,2
4,Rab-GTPase-TBC_domain_containing_protein,2
...,...,...
153,Haloacid_dehalogenase-like_hydrolase,1
154,protein-l-isoaspartate_o-methyltransferase_-_p...,1
155,2′-O-ribose_methyltransferase|MTr1,1
156,mitochondrial_carrier_protein_-_putative,1


### Checking: sphingolipid delta 4 desaturase

In [143]:
filter_data = search_string(neg_filtered, "sphingolipid_delta")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
sphingolipid_delta_4_desaturase    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
836,rejected_noCDS_c26.3090AP,LinJ.26,LINF_260022000,sphingolipid_delta_4_desaturase,605032,607263
839,rejected_noCDS_c26.3100AP,LinJ.26,LINF_260022100,sphingolipid_delta_4_desaturase,607528,609700


In [144]:
check_family(neg_to_filter, "AP")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
832,rejected_noCDS_c26.3070AP,LinJ.26,LINF_260021850,,598501,599917
836,rejected_noCDS_c26.3090AP,LinJ.26,LINF_260022000,sphingolipid_delta_4_desaturase,605032,607263
839,rejected_noCDS_c26.3100AP,LinJ.26,LINF_260022100,sphingolipid_delta_4_desaturase,607528,609700


In [145]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [146]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (652, 16)
Number of unique values in column 'neg_name': 197 


In [147]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (964, 16)
Number of unique values in column 'neg_name': 371 
{'gene': 371, 'transcript': 371, '5utr': 19, '3utr': 150, 'CDS': 4}


Unnamed: 0,notes,count
0,,369
1,Amino_acid_transporter|AAT24.1,2
2,Exonuclease_-_putative,2
3,Rab-GTPase-TBC_domain_containing_protein,2
4,adenosine_kinase-like_protein,2
...,...,...
152,Haloacid_dehalogenase-like_hydrolase,1
153,protein-l-isoaspartate_o-methyltransferase_-_p...,1
154,2′-O-ribose_methyltransferase|MTr1,1
155,mitochondrial_carrier_protein_-_putative,1


### Checking: amino acid transporter | AAT24.1

In [149]:
filter_data = search_string(neg_filtered, "AAT24")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Amino_acid_transporter|AAT24.1    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
900,rejected_noCDS_c27.3320,LinJ.27,LINF_270022600,Amino_acid_transporter|AAT24.1,711026,716531
902,rejected_noCDS_c27.3330,LinJ.27,LINF_270022600,Amino_acid_transporter|AAT24.1,711026,716531


In [150]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [152]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (658, 16)
Number of unique values in column 'neg_name': 199 


In [153]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (958, 16)
Number of unique values in column 'neg_name': 369 
{'gene': 369, 'transcript': 369, '5utr': 19, '3utr': 148, 'CDS': 4}


Unnamed: 0,notes,count
0,,367
1,Exonuclease_-_putative,2
2,Rab-GTPase-TBC_domain_containing_protein,2
3,adenosine_kinase-like_protein,2
4,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,2
...,...,...
151,Haloacid_dehalogenase-like_hydrolase,1
152,protein-l-isoaspartate_o-methyltransferase_-_p...,1
153,2′-O-ribose_methyltransferase|MTr1,1
154,mitochondrial_carrier_protein_-_putative,1


### Checking: exonuclease - putative

In [159]:
filter_data = search_string(neg_filtered, "exonuclease_-_")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Exonuclease_-_putative    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
939,rejected_noCDS_c29.3460AT,LinJ.29,LINF_290010600,Exonuclease_-_putative,205977,209251
942,rejected_noCDS_c29.3470AU,LinJ.29,LINF_290010600,Exonuclease_-_putative,205977,209251


In [160]:
check_family(neg_to_filter, "AT")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
939,rejected_noCDS_c29.3460AT,LinJ.29,LINF_290010600,Exonuclease_-_putative,205977,209251
945,rejected_noCDS_c29.3480AT,LinJ.29,LINF_290013800,hypothetical_protein_-_conserved,315865,322387


In [162]:
check_family(neg_to_filter, "AU")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
942,rejected_noCDS_c29.3470AU,LinJ.29,LINF_290010600,Exonuclease_-_putative,205977,209251
948,rejected_noCDS_c29.3490AU,LinJ.29,LINF_290013800,hypothetical_protein_-_conserved,315865,322387


In [163]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [164]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (664, 16)
Number of unique values in column 'neg_name': 201 


In [165]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (952, 16)
Number of unique values in column 'neg_name': 367 
{'gene': 367, 'transcript': 367, '5utr': 19, '3utr': 146, 'CDS': 4}


Unnamed: 0,notes,count
0,,365
1,Rab-GTPase-TBC_domain_containing_protein,2
2,adenosine_kinase-like_protein,2
3,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,2
4,tuzin_like_protein,2
...,...,...
150,Haloacid_dehalogenase-like_hydrolase,1
151,protein-l-isoaspartate_o-methyltransferase_-_p...,1
152,2′-O-ribose_methyltransferase|MTr1,1
153,mitochondrial_carrier_protein_-_putative,1


### Checking: rab-GTPase

In [166]:
filter_data = search_string(neg_filtered, "rab")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Rab-GTPase-TBC_domain_containing_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1044,rejected_noCDS_c30.3860BC,LinJ.30,LINF_300008300,Rab-GTPase-TBC_domain_containing_protein,102141,105323
1326,rejected_noCDS_c33.4890BC,LinJ.33,LINF_330032500,Rab-GTPase-TBC_domain_containing_protein,1005687,1009527


In [167]:
check_family(neg_to_filter, 'BC')

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1044,rejected_noCDS_c30.3860BC,LinJ.30,LINF_300008300,Rab-GTPase-TBC_domain_containing_protein,102141,105323
1047,rejected_noCDS_c30.3860BC,LinJ.30,LINF_300008400,KU80_protein,105722,109074
1323,rejected_noCDS_c33.4890BC,LinJ.33,LINF_330032400,Acetate:succinate_CoA_transferase,1002245,1005303
1326,rejected_noCDS_c33.4890BC,LinJ.33,LINF_330032500,Rab-GTPase-TBC_domain_containing_protein,1005687,1009527


In [168]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (12, 16)
Number of unique values in column 'neg_name': 2 


In [169]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (676, 16)
Number of unique values in column 'neg_name': 203 


In [170]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (940, 16)
Number of unique values in column 'neg_name': 365 
{'gene': 365, 'transcript': 365, '5utr': 17, '3utr': 144, 'CDS': 4}


Unnamed: 0,notes,count
0,,363
1,adenosine_kinase-like_protein,2
2,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,2
3,tuzin_like_protein,2
4,H(+)-exporting_diphosphatase,2
...,...,...
147,Haloacid_dehalogenase-like_hydrolase,1
148,protein-l-isoaspartate_o-methyltransferase_-_p...,1
149,2′-O-ribose_methyltransferase|MTr1,1
150,mitochondrial_carrier_protein_-_putative,1


### Checking: adenosine kinase - like protein

In [172]:
filter_data = search_string(neg_filtered, "adenosine_kinase")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
adenosine_kinase-like_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1053,rejected_noCDS_c30.3880,LinJ.30,LINF_300014500,adenosine_kinase-like_protein,285194,289972
1055,rejected_noCDS_c30.3890BD,LinJ.30,LINF_300014500,adenosine_kinase-like_protein,285194,289972


In [173]:
check_family(neg_to_filter, "BD")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1055,rejected_noCDS_c30.3890BD,LinJ.30,LINF_300014500,adenosine_kinase-like_protein,285194,289972
1059,rejected_noCDS_c30.3900BD,LinJ.30,LINF_300015100,Histone-binding_protein_RBBP4_or_subunit_C_of_...,313068,318963


In [174]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [175]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (682, 16)
Number of unique values in column 'neg_name': 205 


In [176]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (934, 16)
Number of unique values in column 'neg_name': 363 
{'gene': 363, 'transcript': 363, '5utr': 17, '3utr': 142, 'CDS': 4}


Unnamed: 0,notes,count
0,,361
1,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,2
2,tuzin_like_protein,2
3,H(+)-exporting_diphosphatase,2
4,peptidase_m20/m25/m40_family-like_protein,2
...,...,...
146,Haloacid_dehalogenase-like_hydrolase,1
147,protein-l-isoaspartate_o-methyltransferase_-_p...,1
148,2′-O-ribose_methyltransferase|MTr1,1
149,mitochondrial_carrier_protein_-_putative,1


### Checking: RNA recognition motif

In [177]:
filter_data = search_string(neg_filtered, "motif")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_RNP_domain)/RNA_recognition_motif_(a.k.a._RRM_-_RBD_-_or_RNP_domain)_-_putative    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1082,rejected_noCDS_c30.3990,LinJ.30,LINF_300027200,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,776492,786702
1318,rejected_noCDS_c33.4860,LinJ.33,LINF_330022800,RNA_recognition_motif._(a.k.a._RRM_-_RBD_-_or_...,609978,620397


In [178]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [179]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (688, 16)
Number of unique values in column 'neg_name': 207 


In [180]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (928, 16)
Number of unique values in column 'neg_name': 361 
{'gene': 361, 'transcript': 361, '5utr': 17, '3utr': 140, 'CDS': 4}


Unnamed: 0,notes,count
0,,359
1,tuzin_like_protein,2
2,H(+)-exporting_diphosphatase,2
3,peptidase_m20/m25/m40_family-like_protein,2
4,p-nitrophenylphosphatase,2
...,...,...
145,Haloacid_dehalogenase-like_hydrolase,1
146,protein-l-isoaspartate_o-methyltransferase_-_p...,1
147,2′-O-ribose_methyltransferase|MTr1,1
148,mitochondrial_carrier_protein_-_putative,1


### Checking: tuzin like protein

In [182]:
filter_data = search_string(neg_filtered, "tuzin_like")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
tuzin_like_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1124,rejected_noCDS_c31.4150,LinJ.31,LINF_310009850,tuzin_like_protein,153423,156946
1126,rejected_noCDS_c31.4140,LinJ.31,LINF_310009850,tuzin_like_protein,153423,156946


In [183]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (5, 16)
Number of unique values in column 'neg_name': 2 


In [184]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (693, 16)
Number of unique values in column 'neg_name': 209 


In [185]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (923, 16)
Number of unique values in column 'neg_name': 359 
{'gene': 359, 'transcript': 359, '5utr': 17, '3utr': 139, 'CDS': 4}


Unnamed: 0,notes,count
0,,358
1,H(+)-exporting_diphosphatase,2
2,peptidase_m20/m25/m40_family-like_protein,2
3,p-nitrophenylphosphatase,2
4,Ubiquitin-conjugating_enzyme_E2|UBC8,2
...,...,...
144,Haloacid_dehalogenase-like_hydrolase,1
145,protein-l-isoaspartate_o-methyltransferase_-_p...,1
146,2′-O-ribose_methyltransferase|MTr1,1
147,mitochondrial_carrier_protein_-_putative,1


### Checking: diphosphatase

In [187]:
filter_data = search_string(neg_filtered, "exporting_diphosphatase")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
H(+)-exporting_diphosphatase    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1152,rejected_noCDS_c31.4280BG,LinJ.31,LINF_310018700,H(+)-exporting_diphosphatase,500464,506449
1155,rejected_noCDS_c31.4290BF,LinJ.31,LINF_310018700,H(+)-exporting_diphosphatase,500464,506449


In [188]:
check_family(neg_to_filter, "BG")
## TODO: why there is no "rejected_noCDS_c31.4240BG"

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1152,rejected_noCDS_c31.4280BG,LinJ.31,LINF_310018700,H(+)-exporting_diphosphatase,500464,506449


In [189]:
check_family(neg_to_filter, "BF")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1136,rejected_noCDS_c31.4190BF,LinJ.31,LINF_310014250,,317136,317964
1138,rejected_noCDS_c31.4200BF,LinJ.31,LINF_310014500,amino_acid_permease_3|arginine_transporter|AAP...,322998,328575
1141,rejected_noCDS_c31.4210BF,LinJ.31,LINF_310016350,,405961,406903
1146,rejected_noCDS_c31.4260BF,LinJ.31,LINF_310017000,hypothetical_protein_-_conserved,440694,447113
1155,rejected_noCDS_c31.4290BF,LinJ.31,LINF_310018700,H(+)-exporting_diphosphatase,500464,506449


In [193]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [194]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (699, 16)
Number of unique values in column 'neg_name': 211 


In [195]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (917, 16)
Number of unique values in column 'neg_name': 357 
{'gene': 357, 'transcript': 357, '5utr': 17, '3utr': 137, 'CDS': 4}


Unnamed: 0,notes,count
0,,356
1,peptidase_m20/m25/m40_family-like_protein,2
2,p-nitrophenylphosphatase,2
3,Ubiquitin-conjugating_enzyme_E2|UBC8,2
4,Vps23_core_domain_containing_protein|Li1040,2
...,...,...
143,Haloacid_dehalogenase-like_hydrolase,1
144,protein-l-isoaspartate_o-methyltransferase_-_p...,1
145,2′-O-ribose_methyltransferase|MTr1,1
146,mitochondrial_carrier_protein_-_putative,1


### Checking: peptidase m20

In [196]:
filter_data = search_string(neg_filtered, "m20")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
peptidase_m20/m25/m40_family-like_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1190,rejected_noCDS_c31.4410BJ,LinJ.31,LINF_310026700,peptidase_m20/m25/m40_family-like_protein,967493,971919
1204,rejected_noCDS_c31.4460BJ,LinJ.31,LINF_310028200,peptidase_m20/m25/m40_family-like_protein,1029887,1033101


In [197]:
check_family(neg_to_filter, "BJ")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1190,rejected_noCDS_c31.4410BJ,LinJ.31,LINF_310026700,peptidase_m20/m25/m40_family-like_protein,967493,971919
1204,rejected_noCDS_c31.4460BJ,LinJ.31,LINF_310028200,peptidase_m20/m25/m40_family-like_protein,1029887,1033101


In [199]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (0, 16)
Number of unique values in column 'neg_name': 0 


In [200]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (911, 16)
Number of unique values in column 'neg_name': 355 
{'gene': 355, 'transcript': 355, '5utr': 17, '3utr': 135, 'CDS': 4}


Unnamed: 0,notes,count
0,,354
1,p-nitrophenylphosphatase,2
2,Ubiquitin-conjugating_enzyme_E2|UBC8,2
3,Vps23_core_domain_containing_protein|Li1040,2
4,Peroxisomal_membrane_protein,2
...,...,...
142,Haloacid_dehalogenase-like_hydrolase,1
143,protein-l-isoaspartate_o-methyltransferase_-_p...,1
144,2′-O-ribose_methyltransferase|MTr1,1
145,mitochondrial_carrier_protein_-_putative,1


### Checking: nitrophenylphoshatase

In [201]:
filter_data = search_string(neg_filtered, "nitrophenyl")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
p-nitrophenylphosphatase    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1216,rejected_noCDS_c31.4510,LinJ.31,LINF_310031700,p-nitrophenylphosphatase,1195831,1201297
1219,rejected_noCDS_c31.4520,LinJ.31,LINF_310031700,p-nitrophenylphosphatase,1195831,1201297


In [202]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [203]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (705, 16)
Number of unique values in column 'neg_name': 213 


In [204]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (905, 16)
Number of unique values in column 'neg_name': 353 
{'gene': 353, 'transcript': 353, '5utr': 17, '3utr': 133, 'CDS': 4}


Unnamed: 0,notes,count
0,,352
1,Ubiquitin-conjugating_enzyme_E2|UBC8,2
2,Vps23_core_domain_containing_protein|Li1040,2
3,Peroxisomal_membrane_protein,2
4,Cardiolipin_synthetase,2
...,...,...
141,Haloacid_dehalogenase-like_hydrolase,1
142,protein-l-isoaspartate_o-methyltransferase_-_p...,1
143,2′-O-ribose_methyltransferase|MTr1,1
144,mitochondrial_carrier_protein_-_putative,1


### Checking: ubiquitin-conjugating

In [206]:
filter_data = search_string(neg_filtered, "UBC")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Ubiquitin-conjugating_enzyme_E2|UBC8    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1236,rejected_noCDS_c32.4580,LinJ.32,LINF_320012400,Ubiquitin-conjugating_enzyme_E2|UBC8,239876,246114
1238,rejected_noCDS_c32.4590,LinJ.32,LINF_320012400,Ubiquitin-conjugating_enzyme_E2|UBC8,239876,246114


In [207]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [208]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (711, 16)
Number of unique values in column 'neg_name': 215 


In [209]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (899, 16)
Number of unique values in column 'neg_name': 351 
{'gene': 351, 'transcript': 351, '5utr': 17, '3utr': 131, 'CDS': 4}


Unnamed: 0,notes,count
0,,350
1,Vps23_core_domain_containing_protein|Li1040,2
2,Peroxisomal_membrane_protein,2
3,Cardiolipin_synthetase,2
4,eukaryotic_translation_initiation_factor_2-alp...,2
...,...,...
140,Haloacid_dehalogenase-like_hydrolase,1
141,protein-l-isoaspartate_o-methyltransferase_-_p...,1
142,2′-O-ribose_methyltransferase|MTr1,1
143,mitochondrial_carrier_protein_-_putative,1


### Checking: vps23

In [210]:
filter_data = search_string(neg_filtered, "vps23")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Vps23_core_domain_containing_protein|Li1040    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1252,rejected_noCDS_c32.4640BL,LinJ.32,LINF_320015500,Vps23_core_domain_containing_protein|Li1040,391856,395482
1254,rejected_noCDS_c32.4650BM,LinJ.32,LINF_320015500,Vps23_core_domain_containing_protein|Li1040,391856,395482


In [211]:
check_family(neg_to_filter, "BL")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1242,rejected_noCDS_c32.4600BL,LinJ.32,LINF_320014830,,345025,351432
1246,rejected_noCDS_c32.4620BL,LinJ.32,LINF_320015400,hypothetical_protein_-_conserved,374515,391521
1252,rejected_noCDS_c32.4640BL,LinJ.32,LINF_320015500,Vps23_core_domain_containing_protein|Li1040,391856,395482


In [212]:
check_family(neg_to_filter, "BM")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1244,rejected_noCDS_c32.4610BM,LinJ.32,LINF_320014830,,345025,351432
1248,rejected_noCDS_c32.4630BM,LinJ.32,LINF_320015400,hypothetical_protein_-_conserved,374515,391521
1254,rejected_noCDS_c32.4650BM,LinJ.32,LINF_320015500,Vps23_core_domain_containing_protein|Li1040,391856,395482


In [213]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [214]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (717, 16)
Number of unique values in column 'neg_name': 217 


In [215]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (893, 16)
Number of unique values in column 'neg_name': 349 
{'gene': 349, 'transcript': 349, '5utr': 17, '3utr': 129, 'CDS': 4}


Unnamed: 0,notes,count
0,,348
1,Peroxisomal_membrane_protein,2
2,Cardiolipin_synthetase,2
3,eukaryotic_translation_initiation_factor_2-alp...,2
4,calcineurin-like_phosphoesterase-like_protein,2
...,...,...
139,Haloacid_dehalogenase-like_hydrolase,1
140,protein-l-isoaspartate_o-methyltransferase_-_p...,1
141,2′-O-ribose_methyltransferase|MTr1,1
142,mitochondrial_carrier_protein_-_putative,1


### Checking: peroxisomal membrane

In [216]:
filter_data = search_string(neg_filtered, "peroxisomal")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Peroxisomal_membrane_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1297,rejected_noCDS_c33.4790,LinJ.33,LINF_330007900,Peroxisomal_membrane_protein,81344,85743
1300,rejected_noCDS_c33.4800,LinJ.33,LINF_330007900,Peroxisomal_membrane_protein,81344,85743


In [217]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [218]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (723, 16)
Number of unique values in column 'neg_name': 219 


In [219]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (887, 16)
Number of unique values in column 'neg_name': 347 
{'gene': 347, 'transcript': 347, '5utr': 17, '3utr': 127, 'CDS': 4}


Unnamed: 0,notes,count
0,,346
1,Cardiolipin_synthetase,2
2,eukaryotic_translation_initiation_factor_2-alp...,2
3,calcineurin-like_phosphoesterase-like_protein,2
4,NADH-dependent_fumarate_reductase,2
...,...,...
138,Haloacid_dehalogenase-like_hydrolase,1
139,protein-l-isoaspartate_o-methyltransferase_-_p...,1
140,2′-O-ribose_methyltransferase|MTr1,1
141,mitochondrial_carrier_protein_-_putative,1


### Checking: cardiolipin

In [220]:
filter_data = search_string(neg_filtered, "cardiolipin")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
Cardiolipin_synthetase    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1475,rejected_noCDS_c34.5440BV,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326
1478,rejected_noCDS_c34.5450BW,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326


In [221]:
check_family(neg_to_filter, "BV")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1475,rejected_noCDS_c34.5440BV,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326
1481,rejected_noCDS_c34.5460BV,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956
1490,rejected_noCDS_c34.5490BV,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617


In [222]:
check_family(neg_to_filter, "BW")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1478,rejected_noCDS_c34.5450BW,LinJ.34,LINF_340026200,Cardiolipin_synthetase,889011,895326
1484,rejected_noCDS_c34.5470BW,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956
1493,rejected_noCDS_c34.5500BW,LinJ.34,LINF_340027100,Dual_specificity_protein_phosphatase,923279,937617


In [223]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [224]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (729, 16)
Number of unique values in column 'neg_name': 221 


In [225]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (881, 16)
Number of unique values in column 'neg_name': 345 
{'gene': 345, 'transcript': 345, '5utr': 17, '3utr': 125, 'CDS': 4}


Unnamed: 0,notes,count
0,,344
1,eukaryotic_translation_initiation_factor_2-alp...,2
2,calcineurin-like_phosphoesterase-like_protein,2
3,NADH-dependent_fumarate_reductase,2
4,clathrin_heavy_chain,2
...,...,...
137,Haloacid_dehalogenase-like_hydrolase,1
138,protein-l-isoaspartate_o-methyltransferase_-_p...,1
139,2′-O-ribose_methyltransferase|MTr1,1
140,mitochondrial_carrier_protein_-_putative,1


### Checking: factor 2

In [226]:
filter_data = search_string(neg_filtered, "factor_2")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
eukaryotic_translation_initiation_factor_2-alpha_kinase|eIF2alpha_kinase|PERK|EIF2AK2    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1481,rejected_noCDS_c34.5460BV,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956
1484,rejected_noCDS_c34.5470BW,LinJ.34,LINF_340026700,eukaryotic_translation_initiation_factor_2-alp...,902943,911956


In [227]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [228]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (735, 16)
Number of unique values in column 'neg_name': 223 


In [229]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (875, 16)
Number of unique values in column 'neg_name': 343 
{'gene': 343, 'transcript': 343, '5utr': 17, '3utr': 123, 'CDS': 4}


Unnamed: 0,notes,count
0,,342
1,calcineurin-like_phosphoesterase-like_protein,2
2,NADH-dependent_fumarate_reductase,2
3,clathrin_heavy_chain,2
4,nicotinamidase|PNC1,1
...,...,...
136,Haloacid_dehalogenase-like_hydrolase,1
137,protein-l-isoaspartate_o-methyltransferase_-_p...,1
138,2′-O-ribose_methyltransferase|MTr1,1
139,mitochondrial_carrier_protein_-_putative,1


### Checking: calcineurin

In [230]:
filter_data = search_string(neg_filtered, "calcineurin")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
calcineurin-like_phosphoesterase-like_protein    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1504,rejected_noCDS_c34.5540,LinJ.34,LINF_340033800,calcineurin-like_phosphoesterase-like_protein,1225952,1230078
1507,rejected_noCDS_c34.5550BX,LinJ.34,LINF_340033800,calcineurin-like_phosphoesterase-like_protein,1225952,1230078


In [231]:
check_family(neg_to_filter, "BX")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1507,rejected_noCDS_c34.5550BX,LinJ.34,LINF_340033800,calcineurin-like_phosphoesterase-like_protein,1225952,1230078
1688,rejected_noCDS_c35.6200BX,LinJ.35,LINF_350036100,glycerol_kinase_-_glycosomal,1182104,1185965


In [232]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [233]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (741, 16)
Number of unique values in column 'neg_name': 225 


In [234]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (869, 16)
Number of unique values in column 'neg_name': 341 
{'gene': 341, 'transcript': 341, '5utr': 16, '3utr': 122, 'CDS': 4}


Unnamed: 0,notes,count
0,,340
1,NADH-dependent_fumarate_reductase,2
2,clathrin_heavy_chain,2
3,nicotinamidase|PNC1,1
4,Transmembrane_amino_acid_transporter_protein_-...,1
...,...,...
135,Haloacid_dehalogenase-like_hydrolase,1
136,protein-l-isoaspartate_o-methyltransferase_-_p...,1
137,2′-O-ribose_methyltransferase|MTr1,1
138,mitochondrial_carrier_protein_-_putative,1


### Checking: NADH-dpendent

In [235]:
filter_data = search_string(neg_filtered, "NADH")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
NADH-dependent_fumarate_reductase    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1665,rejected_noCDS_c35.6120CG,LinJ.35,LINF_350016700,NADH-dependent_fumarate_reductase,462165,468419
1668,rejected_noCDS_c35.6130,LinJ.35,LINF_350016700,NADH-dependent_fumarate_reductase,462165,468419


In [236]:
check_family(neg_to_filter, "CG")

Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1645,rejected_noCDS_c35.6050CG,LinJ.35,LINF_350011700,Endonuclease/Exonuclease/phosphatase_family_-_...,249811,254421
1665,rejected_noCDS_c35.6120CG,LinJ.35,LINF_350016700,NADH-dependent_fumarate_reductase,462165,468419


In [237]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [238]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (747, 16)
Number of unique values in column 'neg_name': 227 


In [239]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (863, 16)
Number of unique values in column 'neg_name': 339 
{'gene': 339, 'transcript': 339, '5utr': 15, '3utr': 121, 'CDS': 4}


Unnamed: 0,notes,count
0,,338
1,clathrin_heavy_chain,2
2,nicotinamidase|PNC1,1
3,Transmembrane_amino_acid_transporter_protein_-...,1
4,cytochrome_b-domain_protein,1
...,...,...
134,Haloacid_dehalogenase-like_hydrolase,1
135,protein-l-isoaspartate_o-methyltransferase_-_p...,1
136,2′-O-ribose_methyltransferase|MTr1,1
137,mitochondrial_carrier_protein_-_putative,1


### Checking: clathrin heavy chain

In [240]:
filter_data = search_string(neg_filtered, "clathrin")
print("="*50)
print(filter_data['notes'].value_counts())
filter_data.loc[
    filter_data["gene"] == True,
    ['neg_name', 'chrom', 'gene_id', 'notes', 'start', 'end']
]

The of filtered data: (4, 16)
The unique values in column 'neg_name': 2
notes
clathrin_heavy_chain    4
Name: count, dtype: int64


Unnamed: 0,neg_name,chrom,gene_id,notes,start,end
1754,rejected_noCDS_c36.6450,LinJ.36,LINF_360022600,clathrin_heavy_chain,632898,640647
1757,rejected_noCDS_c36.6460,LinJ.36,LINF_360022600,clathrin_heavy_chain,632898,640647


In [241]:
filter_data = neg_filtered[neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(filter_data)

Shape of the data frame is: (6, 16)
Number of unique values in column 'neg_name': 2 


In [242]:
good_negatives = pd.concat([good_negatives, filter_data])
checking_data(good_negatives)

Shape of the data frame is: (753, 16)
Number of unique values in column 'neg_name': 229 


In [243]:
neg_filtered = neg_filtered[~neg_filtered['neg_name'].isin(filter_data['neg_name'])]
checking_data(neg_filtered)
group_and_count(neg_filtered, "neg_name")

Shape of the data frame is: (857, 16)
Number of unique values in column 'neg_name': 337 
{'gene': 337, 'transcript': 337, '5utr': 15, '3utr': 119, 'CDS': 4}


Unnamed: 0,notes,count
0,,336
1,nicotinamidase|PNC1,1
2,Transmembrane_amino_acid_transporter_protein_-...,1
3,cytochrome_b-domain_protein,1
4,Ankyrin_repeats_(3_copies)/Ankyrin_repeats_(ma...,1
...,...,...
133,Haloacid_dehalogenase-like_hydrolase,1
134,protein-l-isoaspartate_o-methyltransferase_-_p...,1
135,2′-O-ribose_methyltransferase|MTr1,1
136,mitochondrial_carrier_protein_-_putative,1
