In [1]:
import ast

import numpy as np
import pandas as pd

In [2]:
# Load the three source tables (HOCOMOCO, JASPAR, AnimalTFDB) from CSV files.
# The paths are relative, so the files must be in the notebook's working directory.
df_hocomoco = pd.read_csv('H12_complete_dataframe.csv')
df_jaspar = pd.read_csv('JASPAR2024_CORE_all_taxonomy.csv')
df_animaltfdb = pd.read_csv('New_matrix_AnimalRFDB.csv')

In [3]:
# Report the (rows, columns) shape of every loaded table.
# Each label is paired with its frame through zip instead of indexing a list
# inside the f-string: reusing the same quote character inside an f-string
# replacement field is only valid from Python 3.12 on (PEP 701).
for name, df in zip(
    ('Hocomoco', 'Jaspar', 'AnimalTFDB'),
    (df_hocomoco, df_jaspar, df_animaltfdb),
):
    print(f'SHAPE DataFrame {name}: {df.shape}\n')

SHAPE DataFrame Hocomoco: (1443, 19)

SHAPE DataFrame Jaspar: (1362, 8)

SHAPE DataFrame AnimalTFDB: (404, 4)



### Merge dataframes into one dataframe

In [4]:
# Preview the first three HOCOMOCO rows to inspect the available columns.
df_hocomoco.head(3)

Unnamed: 0,Motif,LOGO,Gene (human),Gene (mouse),Motif length,Quality,Motif subtype,Consensus,Quality.1,TF family,TF subfamily,HGNC,UniProt ID (human),UniProt ID (mouse),path_pcm,path_pwm,path_pfm,seqs_HUMAN,seqs_MOUSE
0,AHR.H12CORE.0.P.B,,AHR,Ahr,10,B,0,vhbWGCGTGM,P,PAS {1.2.5},PAS-AHR {1.2.5.1},HGNC:348,AHR_HUMAN,AHR_MOUSE,Эукариоты/HOCOMOCO/H12CORE_pcm/pcm/AHR.H12CORE...,Эукариоты/HOCOMOCO/H12CORE_pwm/pwm/AHR.H12CORE...,Эукариоты/HOCOMOCO/H12CORE_pfm/pfm/AHR.H12CORE...,>sp|P35869|AHR_HUMAN Aryl hydrocarbon receptor...,>sp|P30561|AHR_MOUSE Aryl hydrocarbon receptor...
1,AHRR.H12CORE.0.P.C,,AHRR,Ahrr,11,C,0,bYKCGTGMvWd,P,PAS {1.2.5},PAS-AHR {1.2.5.1},HGNC:346,AHRR_HUMAN,AHRR_MOUSE,Эукариоты/HOCOMOCO/H12CORE_pcm/pcm/AHRR.H12COR...,Эукариоты/HOCOMOCO/H12CORE_pwm/pwm/AHRR.H12COR...,Эукариоты/HOCOMOCO/H12CORE_pfm/pfm/AHRR.H12COR...,>sp|A9YTQ3|AHRR_HUMAN Aryl hydrocarbon recepto...,>sp|Q3U1U7|AHRR_MOUSE Aryl hydrocarbon recepto...
2,ALX1.H12CORE.0.SM.B,,ALX1,Alx1,20,B,0,nvbTAATTRRRTTAvnnnnn,S+M,Paired-related HD {3.1.3},ALX {3.1.3.1},HGNC:1494,ALX1_HUMAN,ALX1_MOUSE,Эукариоты/HOCOMOCO/H12CORE_pcm/pcm/ALX1.H12COR...,Эукариоты/HOCOMOCO/H12CORE_pwm/pwm/ALX1.H12COR...,Эукариоты/HOCOMOCO/H12CORE_pfm/pfm/ALX1.H12COR...,>sp|Q15699|ALX1_HUMAN ALX homeobox protein 1 O...,>sp|Q8C8B0|ALX1_MOUSE ALX homeobox protein 1 O...


In [5]:
# Preview the first two JASPAR rows to inspect the available columns.
df_jaspar.head(2)

Unnamed: 0,ID,Matrix,Taxonomy,TF_family,TF_class,Uniprot,Data_type,Sequence_TF
0,MA0010.2,"[[1.0, 1.0, 1.0, 6.0], [5.0, 2.0, 1.0, 1.0], [...",insects,Other factors with up to three adjacent zinc f...,C2H2 zinc finger factors,Q01295,COMPILED,>sp|Q01295|BRC1_DROME Broad-complex core prote...
1,MA0011.2,"[[0.0, 10.0, 0.0, 2.0], [0.0, 1.0, 0.0, 11.0],...",insects,Other factors with up to three adjacent zinc f...,C2H2 zinc finger factors,Q01295,COMPILED,>sp|Q01295|BRC1_DROME Broad-complex core prote...


In [6]:
# Preview the first two AnimalTFDB rows to inspect the available columns.
df_animaltfdb.head(2)

Unnamed: 0,ID,Matrix,Protein_sequence,Tresh
0,V_AHR_Q6,"[[0.0, 1.0, 0.0, 0.0], [0.958333, 0.041667, 0....",>sp|A9YTQ3|AHRR_HUMAN Aryl hydrocarbon recepto...,33
1,V_ALX1_05,"[[0.138864, 0.266416, 0.123031, 0.471688], [0....",>sp|Q15699|ALX1_HUMAN ALX homeobox protein 1 O...,56


In [15]:
def convert_file_in_matrix(path_file: str, base_dir: str = '../../') -> np.ndarray:
    """Load a matrix from a text file into a NumPy array.

    The first line of the file is skipped (treated as a header). Every
    remaining non-blank line must contain exactly four whitespace-separated
    numbers and becomes one row of the result.

    Parameters
    -----------
    path_file : the path to the file, relative to ``base_dir``
    base_dir : prefix prepended verbatim to ``path_file`` (it must end with a
               path separator); defaults to ``'../../'``

    Returns
    -------
    matrix : array of shape (number of data lines, 4)

    Raises
    ------
    ValueError : if a data line does not hold exactly four numeric fields

    """
    with open(f'{base_dir}{path_file}', 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Skip the header line and ignore blank lines (e.g. a trailing empty line),
    # which would otherwise fail in float().
    rows = [line.split() for line in lines[1:] if line.strip()]

    matrix = np.zeros((len(rows), 4))
    for i, fields in enumerate(rows):
        # Check the width explicitly: a one-field line would otherwise be
        # silently broadcast across all four columns by NumPy.
        if len(fields) != 4:
            raise ValueError(
                f'expected 4 values per line, got {len(fields)} in data line {i + 1}')
        matrix[i] = [float(value) for value in fields]

    return matrix

In [8]:
def merge_motif_databases(
    hocomoco: pd.DataFrame,
    jaspar: pd.DataFrame,
    animal: pd.DataFrame) -> pd.DataFrame:
    """Combine data from three motif databases into a single DataFrame.

    Parameters
    ----------
    hocomoco : table with data from database HOCOMOCO (pd.DataFrame);
               columns used: Motif, path_pfm, seqs_HUMAN
    jaspar : table with data from database JASPAR (pd.DataFrame);
             columns used: ID, Matrix, Sequence_TF
    animal : table with data from database AnimalTFDB (pd.DataFrame);
             columns used: ID, Matrix, Protein_sequence

    Returns
    -------
    merged_df : a dataframe with the rows of the three databases stacked in
                the order HOCOMOCO, JASPAR, AnimalTFDB; rows with a missing
                value are dropped.
                columns: ID, DATABASE, MATRIX (nested list), PROTEIN
    """

    def parse_matrix(text: str) -> np.ndarray:
        """Parse a matrix stored as the text of a nested Python list."""
        # literal_eval only accepts Python literals, so text read from a CSV
        # file cannot run arbitrary code the way eval() would.
        return np.array(ast.literal_eval(text))

    def normalize_matrix(matrix: np.ndarray) -> np.ndarray:
        """Scale every row of the matrix so that it sums to 1."""
        # NOTE: an all-zero row divides by zero and produces NaN values.
        return matrix / matrix.sum(axis=1, keepdims=True)

    def extract_uniprot(sequence):
        """Build an ID from the first token of a sequence header.

        '>sp|P35869|AHR_HUMAN Aryl ...' gives 'P35869_AHR_HUMAN'.
        A non-string value (e.g. NaN for a missing sequence) gives None so
        that the row is removed by dropna() below. A string without '|'
        gives an empty string.
        """
        if not isinstance(sequence, str):
            return None
        header_token = sequence.split(' ')[0]
        return '_'.join(header_token.split('|')[1:])

    # Combining ids
    motif_ids = (
        hocomoco['Motif'].tolist() +
        jaspar['ID'].tolist() +
        animal['ID'].tolist())

    # Matrix processing: HOCOMOCO matrices are read from files, JASPAR
    # matrices are parsed and row-normalized, AnimalTFDB matrices are parsed
    # and used as stored.
    matrices = (
        [convert_file_in_matrix(path) for path in hocomoco['path_pfm']] +
        [normalize_matrix(parse_matrix(text)) for text in jaspar['Matrix']] +
        [parse_matrix(text) for text in animal['Matrix']]
    )

    # UniProt ID processing
    uniprot_ids = (
        [extract_uniprot(seq) for seq in hocomoco['seqs_HUMAN']]
        + [extract_uniprot(seq) for seq in jaspar['Sequence_TF']]
        + [extract_uniprot(seq) for seq in animal['Protein_sequence']]
    )

    # Creating the resulting DataFrame
    data = {
        'ID': motif_ids,
        'DATABASE': (
            ['HOCOMOCO'] * len(hocomoco) +
            ['JASPAR'] * len(jaspar) +
            ['AnimalTFDB'] * len(animal)
        ),
        # Arrays are stored as nested lists so every cell holds plain Python data.
        'MATRIX': [matrix.tolist() for matrix in matrices],
        'PROTEIN': uniprot_ids
    }

    merged_df = pd.DataFrame(data).dropna()
    return merged_df


In [9]:
# Build the combined table (columns ID, DATABASE, MATRIX, PROTEIN) from the three sources.
merge_df = merge_motif_databases(df_hocomoco, df_jaspar, df_animaltfdb)

In [10]:
# NOTE(review): merge_motif_databases already calls dropna() on its result,
# so this step looks redundant; it is kept as a harmless safety net.
merge_df = merge_df.dropna()

In [11]:
# Save the combined table without the index column.
# MATRIX cells hold nested lists, so they end up in the CSV as text and have
# to be parsed back into lists when the file is loaded again.
merge_df.to_csv('dataset_matrices.csv', index=False)