In [1]:
library(community)
# library(dplyr)
# library(stringr)
# library(mygene)
# library(OmnipathR)

# Building the `community` Database


This notebook contains functions and their detailed explanations essential for constructing the `community` database. This offers flexibility for users, allowing for both automated updates and manual interventions in the preprocessing steps. Users have the option to provide their own annotations or specify lists of ligands and receptors to tailor the database according to their specific research needs.

**For users looking to quickly update the database, simply run the following command:**


```R
library(community) # stringr, mygene and OmnipathR need to be installed manually; they are not declared as dependencies of the package

LR_database <- auto_update_db("both") # Specify the db, noncurated, curated or both.
```

# import_db 

## Description
This function imports ligand-receptor interaction data based on the specified database type. It allows for the selection of non-curated, curated, or both types of databases. The function filters out duplicate pairs and assigns pair names and annotation strategies to each interaction.

## Arguments
- `db_type`: A character string specifying the type of database to import. It accepts three values:
  - `"noncurated"`: To import only non-curated ligand-receptor [database](https://r.omnipathdb.org/reference/import_ligrecextra_interactions.html).
  - `"curated"`: To import only curated ligand-receptor [database](https://r.omnipathdb.org/reference/import_ligrecextra_interactions.html).
  - `"both"`: To import both curated and non-curated interactions. In this case, it also identifies which pairs are present in both databases.


In [3]:
# Download OmniPath ligand-receptor interactions for the requested database
# type: "noncurated", "curated" or "both".
#
# Each downloaded table is reduced to the first row per
# source_genesymbol/target_genesymbol combination and gains two columns:
# `Pair.Name` ("<source>_<target>") and `annotation_strategy`
# ("LR" for non-curated rows, "curated" for curated rows).
#
# For "both" the two tables are stacked, non-curated rows first. Non-curated
# rows whose pair also occurs in the curated table are relabelled "both";
# the curated rows keep the label "curated", so such pairs appear twice in
# the returned table.
import_db <- function(db_type = c("noncurated", "curated", "both")) {
    db_type <- match.arg(db_type)

    # Clean-up shared by both downloads: de-duplicate, name the pair, label it
    tidy_interactions <- function(interactions, label) {
        interactions <- interactions %>%
            filter(!duplicated(interactions[, c("source_genesymbol", "target_genesymbol")]))
        interactions$Pair.Name <- paste(interactions$source_genesymbol,
                                        interactions$target_genesymbol,
                                        sep = "_")
        interactions$annotation_strategy <- label
        interactions
    }

    if (db_type != "curated") {
        non_curated <- tidy_interactions(import_ligrecextra_interactions(), "LR")
    }

    if (db_type != "noncurated") {
        curated <- tidy_interactions(curated_ligand_receptor_interactions(), "curated")
    }

    # Single-database requests are finished at this point
    if (db_type == "noncurated") {
        return(non_curated)
    }
    if (db_type == "curated") {
        return(curated)
    }

    # db_type == "both": flag non-curated pairs that are also curated
    also_curated <- non_curated$Pair.Name %in% curated$Pair.Name
    non_curated$annotation_strategy[also_curated] <- "both"

    combined_db <- rbind(non_curated, curated)

    cat("Retrieved interactions from", db_type, "DB")
    return(combined_db)
}

In [2]:
db <- import_db("noncurated")

In [3]:
head(db)

source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,Pair.Name,annotation_strategy
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>
P0DP23,Q13507,CALM1,TRPC3,1,0,1,1,0,1,CellTalkDB;Fantom5_LRdb;HPRD;HPRD_LRdb;LRdb;TRIP;iTALK,CellTalkDB:11248050;HPRD:15104175;TRIP:11248050;TRIP:11290752;TRIP:12601176;TRIP:18215135,6,5,5,CALM1_TRPC3,LR
P60903,Q9H1D0,S100A10,TRPV6,1,1,0,1,1,0,CellTalkDB;HPRD;TRIP,CellTalkDB:18187190;HPRD:12660155;TRIP:12660155;TRIP:16189514;TRIP:18187190,5,3,3,S100A10_TRPV6,LR
O60674,P19235,JAK2,EPOR,1,1,0,1,1,0,BEL-Large-Corpus_ProtMapper;BioGRID;Cellinker;HPRD;HPRD-phos;HPRD_KEA;HPRD_MIMP;KEA;MIMP;PhosphoNetworks;PhosphoPoint;PhosphoSite_KEA;PhosphoSite_MIMP;ProtMapper;SIGNOR;SIGNOR_ProtMapper;SPIKE;Wang;iPTMnet;phosphoELM;phosphoELM_KEA;phosphoELM_MIMP,BioGRID:8343951;Cellinker:9030561;HPRD-phos:12441334;HPRD:11779507;HPRD:12441334;HPRD:8343951;KEA:10579919;KEA:10660611;KEA:11443118;KEA:12027890;KEA:12441334;KEA:7559499;KEA:9573010;ProtMapper:12441334;ProtMapper:15212693;SIGNOR:12441334;SPIKE:12524467;SPIKE:18672044;iPTMnet:10579919;iPTMnet:12441334;phosphoELM:10579919,21,13,14,JAK2_EPOR,LR
P46531,Q9Y219,NOTCH1,JAG2,1,0,1,0,0,0,Baccin2019;CellCall;HPRD;NetPath;Ramilowski2015_Baccin2019;SPIKE,HPRD:11006133;NetPath:11006133;SPIKE:15358736,3,2,5,NOTCH1_JAG2,LR
Q9Y219,P46531,JAG2,NOTCH1,1,1,1,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;STRING_talklr;SignaLink3;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1100613311006130;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;HPRD:11006133;ICELLNET:16921404;ICELLNET:21352254;ICELLNET:22503540;LRdb:11006133;NetPath:11006133;SIGNOR:9315665;SignaLink3:10958687;SignaLink3:11006133;SignaLink3:18988627;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:11006133,19,11,20,JAG2_NOTCH1,LR
O00548,P46531,DLL1,NOTCH1,1,1,0,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1006133;Baccin2019:98194281;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;Cellinker:9819428;HPRD:11006133;ICELLNET:21685328;LRdb:11;LRdb:9819428;NetPath:11006133;SIGNOR:16140393;SPIKE:11006133;SPIKE:17537801;connectomeDB2020:11006133;connectomeDB2020:9819428,18,9,20,DLL1_NOTCH1,LR


# create_pairwise_pairs 

## Description
This function processes the database to handle complex rows where either the target or the source is a complex. It splits such complex interactions into individual pairwise interactions. For each complex interaction, it generates all possible combinations of pairs, handling both the original and reverse orientations. The function returns a dataframe with detailed information for each pair, including the original complex pair name.

## Arguments
- `both_db`: A dataframe representing the database. It should contain at least the `source`, `target`, `source_genesymbol` and `target_genesymbol` columns; `source`/`target` are searched for the `COMPLEX` tag.

## Details
- The function first identifies rows in the database where the target or source is a complex.
- It then iterates over these complex rows, splitting the genes in the source and target columns, and creates all possible pairwise combinations.
- Each combination is stored in a list, which is then combined into a single dataframe.
- A `Pair.Name` is built for every ligand-receptor combination. Pairs that arise from more than one complex are not de-duplicated at this step; duplicates are dropped later, in `filter_pairs_with_ppi`.

## Returns
- A dataframe where each row represents a unique pair. The dataframe includes columns for ligands, receptors, the original complex pair names, and other relevant information from the input database.

In [30]:
# Expand interactions that involve a protein complex into binary gene pairs.
#
# Arguments:
#   both_db  data frame with at least the columns `source`, `target`,
#            `source_genesymbol` and `target_genesymbol`. A row is treated as
#            a complex interaction when `source` or `target` contains
#            "COMPLEX"; its gene symbols are expected to be "_"-separated.
#
# Returns:
#   A data frame with one row per ordered gene pair. Columns: `Pair.Name`,
#   `Ligand`, `Receptor`, `complex_pair` (the original
#   "<source_genesymbol>_<target_genesymbol>" string) followed by the columns
#   of the originating complex row. All genes of a row (source and target
#   side together) are paired with each other, in both orientations.
#   When no complex rows are present, a zero-row data frame with the same
#   columns is returned instead of failing.
create_pairwise_pairs <- function(both_db) {

    # Filter for complex rows (grepl() treats NA as "no match")
    is_complex <- grepl("COMPLEX", both_db$target, fixed = TRUE) |
        grepl("COMPLEX", both_db$source, fixed = TRUE)
    complex <- as.data.frame(both_db[is_complex, , drop = FALSE],
                             stringsAsFactors = FALSE)
    rownames(complex) <- NULL
    cat(nrow(complex), " Number of complex pairs detected")

    # Remove pair column if exists; it is rebuilt per binary pair below
    complex$Pair.Name <- NULL

    # Turn one complex row into all ordered pairs of its member genes
    expand_row <- function(i) {
        source_symbol <- as.character(complex$source_genesymbol[i])
        target_symbol <- as.character(complex$target_genesymbol[i])

        genes <- c(strsplit(source_symbol, "_", fixed = TRUE)[[1]],
                   strsplit(target_symbol, "_", fixed = TRUE)[[1]])
        if (length(genes) < 2) {
            # combn() cannot form a pair from fewer than two genes
            return(NULL)
        }

        pairs <- t(combn(genes, 2))
        # Append the reverse orientation; drop = FALSE keeps a one-row matrix
        pairs <- rbind(pairs, pairs[, 2:1, drop = FALSE])

        pairs_df <- as.data.frame(pairs, stringsAsFactors = FALSE)
        names(pairs_df) <- c("Ligand", "Receptor")
        pairs_df$complex_pair <- paste(source_symbol, target_symbol, sep = "_")

        # The single complex row is recycled across all of its pairs
        cbind(pairs_df, complex[i, , drop = FALSE])
    }

    # seq_len() yields an empty sequence when there are no complex rows
    expanded <- lapply(seq_len(nrow(complex)), expand_row)
    expanded <- Filter(Negate(is.null), expanded)

    if (length(expanded) == 0) {
        # Nothing to expand: return an empty table with the usual layout
        results <- cbind(
            data.frame(Ligand = character(0),
                       Receptor = character(0),
                       complex_pair = character(0),
                       stringsAsFactors = FALSE),
            complex[0, , drop = FALSE]
        )
    } else {
        results <- do.call(rbind, expanded)
    }

    results$Pair.Name <- paste(results$Ligand, results$Receptor, sep = "_")
    cat(nrow(results), " Number of non-redundant binary pairs produced")
    return(results[, c("Pair.Name", names(results)[!names(results) %in% "Pair.Name"])])
}

In [4]:
pairwise_pairs <- create_pairwise_pairs(db)

In [5]:
head(pairwise_pairs)

Unnamed: 0_level_0,Pair.Name,Ligand,Receptor,complex_pair,source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>
1,IL17A_IL17RA,IL17A,IL17RA,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
2,IL17A_IL17RC,IL17A,IL17RC,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
3,IL17RA_IL17RC,IL17RA,IL17RC,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
4,IL17RA_IL17A,IL17RA,IL17A,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
5,IL17RC_IL17A,IL17RC,IL17A,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
6,IL17RC_IL17RA,IL17RC,IL17RA,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR


# filter_pairs_with_ppi 

## Description
This function filters pairwise pairs based on their presence in the protein-protein interaction (PPI) network. It first imports a PPI network and then filters the input pairwise pairs to include only those that exist in the PPI network. This helps in identifying biologically relevant interactions.

## Arguments
- `pairwise_pairs`: A dataframe representing pairwise pairs. This dataframe should contain a column `Pair.Name` that uniquely identifies each ligand-receptor pair.

## Details
- The function starts by importing all PPI interactions from [OmniPath](https://r.omnipathdb.org/reference/import_post_translational_interactions.html).
- It ensures that the PPI network contains unique pairs by removing duplicates based on `source_genesymbol` and `target_genesymbol`.
- The `Pair.Name` column is created in the PPI network for each pair by concatenating the source and target gene symbols.
- The input `pairwise_pairs` dataframe is then filtered to retain only those pairs whose names match with the pairs in the PPI network.
- Only distinct pairs are kept in the final dataframe to avoid redundancy.

## Returns
- A filtered dataframe of pairs that are present in the PPI network. This dataframe includes all columns from the input `pairwise_pairs` dataframe, ensuring that only biologically relevant interactions are included.


In [31]:
# Keep only those binary pairs that are backed by the OmniPath
# post-translational (PPI) interaction network.
#
# `pairwise_pairs` must carry a `Pair.Name` column ("<ligand>_<receptor>").
# The PPI network is downloaded on every call; a pair is retained when its
# name equals "<source_genesymbol>_<target_genesymbol>" of some PPI row.
# The result holds one row per `Pair.Name` (first occurrence wins) with all
# input columns preserved.
filter_pairs_with_ppi <- function(pairwise_pairs) {

    # Download the PPI network and keep one row per gene-symbol pair
    ppi_raw <- import_post_translational_interactions()
    ppi_network <- ppi_raw %>%
        filter(!duplicated(ppi_raw[, c("source_genesymbol", "target_genesymbol")]))
    ppi_network$Pair.Name <- paste(ppi_network$source_genesymbol,
                                   ppi_network$target_genesymbol,
                                   sep = "_")

    # Restrict the candidates to pairs known to the PPI network
    supported <- filter(pairwise_pairs, Pair.Name %in% ppi_network$Pair.Name)
    pt_interactions <- distinct(supported, Pair.Name, .keep_all = TRUE)

    cat(nrow(pt_interactions), " Number of binary pairs detected through PPI")
    return(pt_interactions)
}

In [6]:
pt_interactions <- filter_pairs_with_ppi(pairwise_pairs)

In [7]:
head(pt_interactions)

Unnamed: 0_level_0,Pair.Name,Ligand,Receptor,complex_pair,source,target,source_genesymbol,target_genesymbol,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>
1,IL17A_IL17RA,IL17A,IL17RA,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
2,IL17A_IL17RC,IL17A,IL17RC,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
3,IL17RA_IL17A,IL17RA,IL17A,IL17A_IL17RA_IL17RC,Q16552,COMPLEX:Q8NAC3_Q96F46,IL17A,IL17RA_IL17RC,1,1,0,1,1,0,CellChatDB;CellPhoneDB;Cellinker;ICELLNET;SIGNOR,Cellinker:19838198;Cellinker:25204502;Cellinker:9367539;ICELLNET:24011563;SIGNOR:32024054,5,5,5,LR
4,NPNT_ITGA8,NPNT,ITGA8,NPNT_ITGA8_ITGB1,Q6UXI9,COMPLEX:P05556_P53708,NPNT,ITGA8_ITGB1,1,1,0,1,1,0,Baccin2019;SIGNOR,Baccin2019:16988024;SIGNOR:22613833,2,2,2,LR
5,NPNT_ITGB1,NPNT,ITGB1,NPNT_ITGA8_ITGB1,Q6UXI9,COMPLEX:P05556_P53708,NPNT,ITGA8_ITGB1,1,1,0,1,1,0,Baccin2019;SIGNOR,Baccin2019:16988024;SIGNOR:22613833,2,2,2,LR
6,ITGAL_ICAM1,ITGAL,ICAM1,ITGAL_ITGB2_ICAM1,COMPLEX:P05107_P20701,P05362,ITGAL_ITGB2,ICAM1,1,1,0,0,0,0,Baccin2019;CellPhoneDB;ICELLNET;SIGNOR,Baccin2019:16988024;ICELLNET:10940895;ICELLNET:23418628;SIGNOR:12808052,4,4,4,LR


# process_binary_pairs

## Description
This function processes binary pairs from the database and merges them with the binary pairs detected through PPI. It processes the data to standardize and reorder columns, and then combines the binary pairs with the PPI interaction data. The function ensures that only unique pairs are retained in the final dataset.

## Arguments
- `both_db`: A dataframe containing the database that includes all interactions, complex and single-component.
- `pt_interactions`: A dataframe of PT interactions, which will be merged with the single-component pairs.

## Details
- The function starts by filtering out complex interactions from `both_db`.
- The single-component interactions and PT interactions are merged, and duplicates are dropped based on the `Pair.Name` column.

## Returns
- A dataframe that combines single-component pairs with PPI interactions, ensuring unique pairs. The dataframe includes columns for Pair Name, Ligand, Receptor, and other relevant interaction details.

In [32]:
# Combine the direct (non-complex) interactions of the database with the
# PPI-supported binary pairs derived from complexes.
#
# Arguments:
#   both_db          interaction table holding complex and single-component
#                    rows; rows whose `source` or `target` contains "COMPLEX"
#                    are excluded here.
#   pt_interactions  binary pairs with the columns listed in `output_cols`.
#
# Returns:
#   A data frame restricted to `output_cols` with one row per `Pair.Name`.
#   When a pair occurs more than once, the last occurrence is kept, so a row
#   from `pt_interactions` takes precedence over a direct database row.
process_binary_pairs <- function(both_db, pt_interactions) {
    # Column layout shared by both inputs before they are stacked
    output_cols <- c("Pair.Name", "Ligand", "Receptor", "source", "target",
                     "is_directed", "is_stimulation", "is_inhibition",
                     "consensus_direction", "consensus_stimulation",
                     "consensus_inhibition", "sources", "references",
                     "curation_effort", "n_references", "n_resources",
                     "annotation_strategy", "complex_pair")

    # Direct interactions: neither side is a complex
    binary <- filter(both_db,
                     !(grepl("COMPLEX", target) | grepl("COMPLEX", source)))

    # Gene symbols become Ligand/Receptor; there is no originating complex
    binary <- binary %>%
        dplyr::rename(Ligand = source_genesymbol, Receptor = target_genesymbol) %>%
        dplyr::mutate(complex_pair = NA,
                      Pair.Name = paste(Ligand, Receptor, sep = "_"))

    # Plain data frames in the shared layout
    binary <- data.frame(binary)[output_cols]
    from_complexes <- pt_interactions[output_cols]

    # Stack and de-duplicate, letting later rows win
    complete <- rbind(binary, from_complexes)
    is_superseded <- duplicated(complete$Pair.Name, fromLast = TRUE)
    complete <- complete[!is_superseded, ]

    cat(nrow(complete), " Non-redundant number of pairs in the DB")
    return(complete)
}

In [8]:
complete_data <- process_binary_pairs(db, pt_interactions)

In [9]:
head(complete_data)

Unnamed: 0_level_0,Pair.Name,Ligand,Receptor,source,target,is_directed,is_stimulation,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy,complex_pair
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>
1,CALM1_TRPC3,CALM1,TRPC3,P0DP23,Q13507,1,0,1,1,0,1,CellTalkDB;Fantom5_LRdb;HPRD;HPRD_LRdb;LRdb;TRIP;iTALK,CellTalkDB:11248050;HPRD:15104175;TRIP:11248050;TRIP:11290752;TRIP:12601176;TRIP:18215135,6,5,5,LR,
2,S100A10_TRPV6,S100A10,TRPV6,P60903,Q9H1D0,1,1,0,1,1,0,CellTalkDB;HPRD;TRIP,CellTalkDB:18187190;HPRD:12660155;TRIP:12660155;TRIP:16189514;TRIP:18187190,5,3,3,LR,
3,JAK2_EPOR,JAK2,EPOR,O60674,P19235,1,1,0,1,1,0,BEL-Large-Corpus_ProtMapper;BioGRID;Cellinker;HPRD;HPRD-phos;HPRD_KEA;HPRD_MIMP;KEA;MIMP;PhosphoNetworks;PhosphoPoint;PhosphoSite_KEA;PhosphoSite_MIMP;ProtMapper;SIGNOR;SIGNOR_ProtMapper;SPIKE;Wang;iPTMnet;phosphoELM;phosphoELM_KEA;phosphoELM_MIMP,BioGRID:8343951;Cellinker:9030561;HPRD-phos:12441334;HPRD:11779507;HPRD:12441334;HPRD:8343951;KEA:10579919;KEA:10660611;KEA:11443118;KEA:12027890;KEA:12441334;KEA:7559499;KEA:9573010;ProtMapper:12441334;ProtMapper:15212693;SIGNOR:12441334;SPIKE:12524467;SPIKE:18672044;iPTMnet:10579919;iPTMnet:12441334;phosphoELM:10579919,21,13,14,LR,
4,NOTCH1_JAG2,NOTCH1,JAG2,P46531,Q9Y219,1,0,1,0,0,0,Baccin2019;CellCall;HPRD;NetPath;Ramilowski2015_Baccin2019;SPIKE,HPRD:11006133;NetPath:11006133;SPIKE:15358736,3,2,5,LR,
5,JAG2_NOTCH1,JAG2,NOTCH1,Q9Y219,P46531,1,1,1,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;STRING_talklr;SignaLink3;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1100613311006130;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;HPRD:11006133;ICELLNET:16921404;ICELLNET:21352254;ICELLNET:22503540;LRdb:11006133;NetPath:11006133;SIGNOR:9315665;SignaLink3:10958687;SignaLink3:11006133;SignaLink3:18988627;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:11006133,19,11,20,LR,
6,DLL1_NOTCH1,DLL1,NOTCH1,O00548,P46531,1,1,0,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1006133;Baccin2019:98194281;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;Cellinker:9819428;HPRD:11006133;ICELLNET:21685328;LRdb:11;LRdb:9819428;NetPath:11006133;SIGNOR:16140393;SPIKE:11006133;SPIKE:17537801;connectomeDB2020:11006133;connectomeDB2020:9819428,18,9,20,LR,


# map_gene_data

## Description
This function enriches the database with gene descriptions. It queries gene symbols to fetch their respective gene descriptions from [MyGene, a gene annotation service](https://mygene.info/).

## Arguments
- `complete`: A dataframe representing the combined and processed dataset of interaction pairs.

## Details
- Unique gene symbols are extracted from the dataset.
- A query is made to fetch gene descriptions for these gene symbols.
- The gene descriptions are then mapped back to the gene pairs in the dataset.
- Columns in the dataframe are reordered to place essential information at the forefront.

## Returns
- The modified dataframe with added columns 'Ligand.Name' and 'Receptor.Name', providing the respective gene descriptions. The dataframe is reorganized for clarity and ease of analysis.

## Notes
- The function relies on online queries for protein descriptions.


In [37]:
# Add human-readable gene descriptions to the interaction table.
#
# Arguments:
#   complete           data frame with at least the columns `Pair.Name`,
#                      `Ligand`, `Receptor` and `complex_pair`.
#   prot_descriptions  optional lookup table with the columns `query` (gene
#                      symbol) and `name` (description). When NULL (default)
#                      the table is fetched online with queryMany(); pass a
#                      cached table to skip the network request.
#
# Returns:
#   `complete` with the added columns `Ligand.Name`, `Receptor.Name` and
#   `dup` ("<Receptor>_<Ligand>"), a rebuilt `Pair.Name`, and columns ordered
#   as Pair.Name, Ligand, Ligand.Name, Receptor, Receptor.Name, complex_pair,
#   followed by everything else. Symbols without a description get NA.
map_gene_data <- function(complete, prot_descriptions = NULL) {
    # Get unique gene symbols
    gene_symbols <- unique(c(as.character(complete$Ligand),
                             as.character(complete$Receptor)))

    if (is.null(prot_descriptions)) {
        if (length(gene_symbols) > 0) {
            # Heads-up issued before every online query
            warning("If this function fails, it may be due to internet connectivity issues. Try running it again.")

            # Query for protein descriptions
            prot_descriptions <- queryMany(gene_symbols, scopes = "symbol",
                                           fields = c("name"),
                                           species = "human",
                                           as_dataframe = TRUE)
        } else {
            # Nothing to look up: skip the request entirely
            prot_descriptions <- data.frame(query = character(0),
                                            name = character(0),
                                            stringsAsFactors = FALSE)
        }
    }
    prot_descriptions <- as.data.frame(prot_descriptions)

    # Vectorised lookup: match() returns the first hit per symbol, which is
    # what taking element [1] of a per-row filter would give; no hit -> NA
    describe <- function(symbols) {
        descriptions <- prot_descriptions[["name"]]
        if (is.null(descriptions)) {
            return(rep(NA_character_, length(symbols)))
        }
        descriptions[match(symbols, prot_descriptions[["query"]])]
    }

    # Map protein descriptions to the complete dataset
    complete$Ligand.Name <- describe(complete$Ligand)
    complete$Receptor.Name <- describe(complete$Receptor)

    # Handle specific case for "PIK3CD-AS1" (applied after the lookup, so the
    # description still belongs to the original symbol)
    complete$Ligand <- sub("PIK3CD-AS1", "PIK3CD", complete$Ligand, fixed = TRUE)
    complete$Pair.Name <- paste(complete$Ligand, complete$Receptor, sep = "_")
    complete$dup <- paste(complete$Receptor, complete$Ligand, sep = "_")

    # Reorder columns: essential ones first, the rest in their current order
    desired_order <- c("Pair.Name", "Ligand", "Ligand.Name", "Receptor", "Receptor.Name", "complex_pair")
    remaining_cols <- setdiff(names(complete), desired_order)
    final_order <- c(desired_order, remaining_cols)

    complete <- complete[, final_order]

    return(complete)
}

In [10]:
complete_data <- map_gene_data(complete_data)

“If this function fails, it may be due to internet connectivity issues. Try running it again.”
Querying chunk 1

Querying chunk 2

Querying chunk 3



Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.


In [11]:
head(complete_data)

Unnamed: 0_level_0,Pair.Name,Ligand,Ligand.Name,Receptor,Receptor.Name,complex_pair,source,target,is_directed,is_stimulation,⋯,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy,dup
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>
1,CALM1_TRPC3,CALM1,calmodulin 1,TRPC3,transient receptor potential cation channel subfamily C member 3,,P0DP23,Q13507,1,0,⋯,1,0,1,CellTalkDB;Fantom5_LRdb;HPRD;HPRD_LRdb;LRdb;TRIP;iTALK,CellTalkDB:11248050;HPRD:15104175;TRIP:11248050;TRIP:11290752;TRIP:12601176;TRIP:18215135,6,5,5,LR,TRPC3_CALM1
2,S100A10_TRPV6,S100A10,S100 calcium binding protein A10,TRPV6,transient receptor potential cation channel subfamily V member 6,,P60903,Q9H1D0,1,1,⋯,1,1,0,CellTalkDB;HPRD;TRIP,CellTalkDB:18187190;HPRD:12660155;TRIP:12660155;TRIP:16189514;TRIP:18187190,5,3,3,LR,TRPV6_S100A10
3,JAK2_EPOR,JAK2,Janus kinase 2,EPOR,erythropoietin receptor,,O60674,P19235,1,1,⋯,1,1,0,BEL-Large-Corpus_ProtMapper;BioGRID;Cellinker;HPRD;HPRD-phos;HPRD_KEA;HPRD_MIMP;KEA;MIMP;PhosphoNetworks;PhosphoPoint;PhosphoSite_KEA;PhosphoSite_MIMP;ProtMapper;SIGNOR;SIGNOR_ProtMapper;SPIKE;Wang;iPTMnet;phosphoELM;phosphoELM_KEA;phosphoELM_MIMP,BioGRID:8343951;Cellinker:9030561;HPRD-phos:12441334;HPRD:11779507;HPRD:12441334;HPRD:8343951;KEA:10579919;KEA:10660611;KEA:11443118;KEA:12027890;KEA:12441334;KEA:7559499;KEA:9573010;ProtMapper:12441334;ProtMapper:15212693;SIGNOR:12441334;SPIKE:12524467;SPIKE:18672044;iPTMnet:10579919;iPTMnet:12441334;phosphoELM:10579919,21,13,14,LR,EPOR_JAK2
4,NOTCH1_JAG2,NOTCH1,notch receptor 1,JAG2,jagged canonical Notch ligand 2,,P46531,Q9Y219,1,0,⋯,0,0,0,Baccin2019;CellCall;HPRD;NetPath;Ramilowski2015_Baccin2019;SPIKE,HPRD:11006133;NetPath:11006133;SPIKE:15358736,3,2,5,LR,JAG2_NOTCH1
5,JAG2_NOTCH1,JAG2,jagged canonical Notch ligand 2,NOTCH1,notch receptor 1,,Q9Y219,P46531,1,1,⋯,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;STRING_talklr;SignaLink3;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1100613311006130;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;HPRD:11006133;ICELLNET:16921404;ICELLNET:21352254;ICELLNET:22503540;LRdb:11006133;NetPath:11006133;SIGNOR:9315665;SignaLink3:10958687;SignaLink3:11006133;SignaLink3:18988627;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:11006133,19,11,20,LR,NOTCH1_JAG2
6,DLL1_NOTCH1,DLL1,delta like canonical Notch ligand 1,NOTCH1,notch receptor 1,,O00548,P46531,1,1,⋯,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1006133;Baccin2019:98194281;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;Cellinker:9819428;HPRD:11006133;ICELLNET:21685328;LRdb:11;LRdb:9819428;NetPath:11006133;SIGNOR:16140393;SPIKE:11006133;SPIKE:17537801;connectomeDB2020:11006133;connectomeDB2020:9819428,18,9,20,LR,NOTCH1_DLL1


# annotate_components 

## Description
This function annotates each gene from the protein-protein interaction (PPI) network with its respective parent category. It is designed to process a database and assign each component a category based on its role in the network (e.g., ligand, receptor, intracellular). The function handles components not initially classified as ligands or receptors by considering additional categories such as secreted and extracellular matrix.

## Arguments
- `complete_data`: A dataframe representing the combined and processed dataset of interaction pairs. It must contain the `Ligand` and `Receptor` columns.

## Details
- The function extracts unique gene symbols from the input dataframe.
- It initializes a new dataframe for storing annotations with columns for gene symbols, scores, and parent categories.
- The function then imports Omnipath intercellular interactions and filters them to ligands and receptors.
- For each component, it checks if it's categorized as a ligand or receptor and assigns the highest score and corresponding parent category.
- If a component is not classified as ligand or receptor, the function searches for other possible categories like secreted or extracellular matrix.
- The categories 'ecm' and 'secreted' are replaced with 'ligand' to maintain consistency.

## Returns
- A dataframe with columns `genesymbol`, `score`, and `parent`. This dataframe provides annotated categories for each gene symbol based on its role in the PPI network.


In [45]:
# Assign every gene of the interaction table its most frequent OmniPath
# intercell "parent" category.
#
# Arguments:
#   complete_data  data frame with the columns `Ligand` and `Receptor`.
#   anno_raw       optional annotation table with the columns `genesymbol`,
#                  `category` and `parent`. When NULL (default) it is fetched
#                  with import_omnipath_intercell(); pass your own table to
#                  use custom annotations or to avoid the download.
#
# Returns:
#   A data frame with one row per unique gene and the columns `genesymbol`,
#   `score` (number of annotation records supporting the chosen parent) and
#   `parent`. Records whose category is "receptor" or "ligand" are consulted
#   first; genes without such records fall back to all categories. Genes
#   with no annotation at all keep parent "NA" and score 0. The parents
#   "ecm" and "secreted" are reported as "ligand".
annotate_components <- function(complete_data, anno_raw = NULL) {

    components <- unique(c(as.character(complete_data$Ligand),
                           as.character(complete_data$Receptor)))

    if (is.null(anno_raw)) {
        anno_raw <- import_omnipath_intercell()
    }
    anno_lig <- anno_raw[anno_raw$category %in% c("receptor", "ligand"), , drop = FALSE]

    # Group the parent labels by gene once instead of re-filtering the whole
    # annotation table for every component
    parents_lr <- split(as.character(anno_lig$parent), as.character(anno_lig$genesymbol))
    parents_all <- split(as.character(anno_raw$parent), as.character(anno_raw$genesymbol))

    # Most frequent label and its count; NULL when there is nothing to count.
    # which.max() returns the first maximum, i.e. ties resolve to the
    # alphabetically first label, as with a stable decreasing sort.
    most_common_parent <- function(parents) {
        counts <- table(parents)
        if (length(counts) == 0) {
            return(NULL)
        }
        top <- which.max(counts)
        list(parent = names(counts)[top], score = as.numeric(counts[[top]]))
    }

    # Defaults describe an unannotated gene
    df <- data.frame(genesymbol = components,
                     score = rep(0, length(components)),
                     parent = rep("NA", length(components)),
                     stringsAsFactors = FALSE)

    for (i in seq_along(components)) {
        gene <- components[i]

        # Check if the component is categorized as ligand or receptor
        hit <- most_common_parent(parents_lr[[gene]])

        # Otherwise consider every category (extracellular matrix, secreted,
        # transmembrane, ...)
        if (is.null(hit)) {
            hit <- most_common_parent(parents_all[[gene]])
        }

        # Genes missing from the annotation keep the "NA"/0 defaults
        if (!is.null(hit)) {
            df$parent[i] <- hit$parent
            df$score[i] <- hit$score
        }
    }

    # replace ecm and secreted with ligand
    df$parent[df$parent %in% c("ecm", "secreted")] <- "ligand"

    return(df)

}

In [12]:
annotation <- annotate_components(complete_data)

In [13]:
head(annotation)

Unnamed: 0_level_0,genesymbol,score,parent
Unnamed: 0_level_1,<chr>,<dbl>,<chr>
1,CALM1,5,intracellular
2,S100A10,4,ligand
3,JAK2,2,receptor
4,NOTCH1,22,receptor
5,JAG2,12,ligand
6,DLL1,12,ligand


# process_lr_db 

## Description
This function processes the complete dataset of ligand-receptor (LR) pairs along with the annotations to fix the directionality of the interactions and to identify the true LR pairs. It ensures that the ligands and receptors are correctly aligned according to the provided annotations. The function also handles cases where the direction of the interaction needs to be swapped and identifies true LR interactions.

## Arguments
- `complete`: A dataframe representing the combined and processed dataset of interaction pairs.
- `annotation`: A dataframe with annotations for genes, including their classification as ligands, receptors and others.
- `all_ligands` (optional): A vector of gene symbols classified as ligands. If not provided, it will be determined from the annotation.
- `all_receptors` (optional): A vector of gene symbols classified as receptors. If not provided, it will be determined from the annotation.

## Details
- If `all_ligands` and `all_receptors` are not provided, the function first determines them from the annotation data.
- The function then identifies pairs where the direction needs to be corrected (i.e., ligands are listed as receptors and vice versa).
- It swaps the values in the Ligand, Receptor, and related columns for these pairs to correct their direction.
- After fixing the directions, the function filters out duplicates and then combines the corrected pairs with the original true LR pairs.
- A new column, `True_LR`, is added to indicate whether a pair is a true LR interaction.

## Returns
- A dataframe that combines the true LR pairs and those with corrected directions, ensuring the accuracy of ligand-receptor pairings. This dataframe includes detailed information about each interaction, such as ligands, receptors, and whether the pair is a true LR interaction.


In [56]:
# Build the table of true ligand-receptor (LR) pairs.
#
# Arguments:
#   complete      - data frame of interaction pairs. The code reads/writes the
#                   columns Pair.Name, Ligand, Receptor, Ligand.Name,
#                   Receptor.Name, source, target and dup.
#   annotation    - data frame with columns `genesymbol` and `parent`; only
#                   consulted for a gene list that was not supplied.
#   all_ligands   - optional vector of ligand gene symbols.
#   all_receptors - optional vector of receptor gene symbols.
#
# Returns:
#   A data frame holding the rows of `complete` that are already in
#   ligand -> receptor orientation, followed by the rows that were listed in
#   receptor -> ligand orientation and have been flipped (unless the flipped
#   pair already exists). `True_LR` is TRUE for every row and is the first
#   column.
process_lr_db <- function(complete, annotation, all_ligands = NULL, all_receptors = NULL) {

    # Derive each gene list independently, so that a list supplied by the
    # caller is never overwritten just because the other one was omitted.
    # `%in%` (rather than `==`) keeps rows with an NA parent out of the lists.
    if (is.null(all_ligands)) {
        all_ligands <- annotation$genesymbol[annotation$parent %in% "ligand"]
    }
    if (is.null(all_receptors)) {
        all_receptors <- annotation$genesymbol[annotation$parent %in% "receptor"]
    }

    # Exchange the contents of two columns of a data frame.
    swap_columns <- function(df, first, second) {
        held <- df[[first]]
        df[[first]] <- df[[second]]
        df[[second]] <- held
        df
    }

    # Pairs recorded in receptor -> ligand orientation need their direction fixed
    needs_flip <- complete$Ligand %in% all_receptors & complete$Receptor %in% all_ligands
    LR_fix_dir <- complete[needs_flip, , drop = FALSE]

    # Flip the gene symbols, the gene names and the source/target identifiers
    LR_fix_dir <- swap_columns(LR_fix_dir, "Ligand", "Receptor")
    LR_fix_dir <- swap_columns(LR_fix_dir, "Ligand.Name", "Receptor.Name")
    LR_fix_dir <- swap_columns(LR_fix_dir, "source", "target")

    # Rebuild the pair identifiers so they match the new orientation
    LR_fix_dir["Pair.Name"] <- paste(LR_fix_dir$Ligand, LR_fix_dir$Receptor, sep = "_")
    LR_fix_dir["dup"] <- paste(LR_fix_dir$Receptor, LR_fix_dir$Ligand, sep = "_")

    # Interactions that are already strictly in LR direction
    is_true_lr <- complete$Ligand %in% all_ligands & complete$Receptor %in% all_receptors
    true_LR_DB <- complete[is_true_lr, , drop = FALSE]

    # Drop flipped pairs that duplicate an existing true LR pair
    LR_fix_dir <- LR_fix_dir[!LR_fix_dir$Pair.Name %in% true_LR_DB$Pair.Name, , drop = FALSE]

    # Add the True_LR flag. The value is sized explicitly because assigning a
    # length-one value to a zero-row data.frame raises an error.
    true_LR_DB["True_LR"] <- rep(TRUE, nrow(true_LR_DB))
    LR_fix_dir["True_LR"] <- rep(TRUE, nrow(LR_fix_dir))

    true_LR_DB <- rbind(true_LR_DB, LR_fix_dir)

    # Move True_LR to the first column
    true_LR_DB[, c("True_LR", setdiff(names(true_LR_DB), "True_LR")), drop = FALSE]
}

In [14]:
# Extract the true ligand-receptor pairs; no manual gene lists are passed, so
# ligands and receptors are taken from `annotation`.
true_LR_DB <- process_lr_db(complete_data, annotation)

In [15]:
head(true_LR_DB)

Unnamed: 0_level_0,True_LR,Pair.Name,Ligand,Ligand.Name,Receptor,Receptor.Name,complex_pair,source,target,is_directed,⋯,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy
Unnamed: 0_level_1,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>
1,True,S100A10_TRPV6,S100A10,S100 calcium binding protein A10,TRPV6,transient receptor potential cation channel subfamily V member 6,,P60903,Q9H1D0,1,⋯,0,1,1,0,CellTalkDB;HPRD;TRIP,CellTalkDB:18187190;HPRD:12660155;TRIP:12660155;TRIP:16189514;TRIP:18187190,5,3,3,LR
2,True,JAG2_NOTCH1,JAG2,jagged canonical Notch ligand 2,NOTCH1,notch receptor 1,,Q9Y219,P46531,1,⋯,1,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;STRING_talklr;SignaLink3;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1100613311006130;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;HPRD:11006133;ICELLNET:16921404;ICELLNET:21352254;ICELLNET:22503540;LRdb:11006133;NetPath:11006133;SIGNOR:9315665;SignaLink3:10958687;SignaLink3:11006133;SignaLink3:18988627;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:11006133,19,11,20,LR
3,True,DLL1_NOTCH1,DLL1,delta like canonical Notch ligand 1,NOTCH1,notch receptor 1,,O00548,P46531,1,⋯,0,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;KEGG-MEDICUS;Kirouac2010;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:1006133;Baccin2019:98194281;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;Cellinker:9819428;HPRD:11006133;ICELLNET:21685328;LRdb:11;LRdb:9819428;NetPath:11006133;SIGNOR:16140393;SPIKE:11006133;SPIKE:17537801;connectomeDB2020:11006133;connectomeDB2020:9819428,18,9,20,LR
4,True,IGF1_IGF1R,IGF1,insulin like growth factor 1,IGF1R,insulin like growth factor 1 receptor,,P05019,P08069,1,⋯,0,1,1,0,Baccin2019;CA1;CellCall;CellChatDB;CellPhoneDB;CellTalkDB;Cellinker;DIP;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;Guide2Pharma;Guide2Pharma_Cellinker;Guide2Pharma_LRdb;Guide2Pharma_talklr;HPMR;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;IntAct;KEGG-MEDICUS;Kirouac2010;LRdb;ProtMapper;REACH_ProtMapper;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;SignaLink3;Sparser_ProtMapper;UniProt_LRdb;Wang;connectomeDB2020;iTALK;scConnect;talklr,Baccin2019:1852007;Baccin2019:2877871;CA1:8408032;CellChatDB:14604834;CellTalkDB:2877871;Cellinker:14604834;Cellinker:1852007;Cellinker:2877871;DIP:11287679;DIP:21645859;HPMR:2877871;HPRD:1852007;IntAct:21645859;IntAct:8452530;LRdb:185;LRdb:2877871;ProtMapper:19258508;ProtMapper:23675206;SIGNOR:19029956;SIGNOR:21798082;SPIKE:1852007;SignaLink3:12067807;SignaLink3:1852007;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:1852007;connectomeDB2020:2877871,27,15,26,LR
5,True,JAG1_NOTCH1,JAG1,jagged canonical Notch ligand 1,NOTCH1,notch receptor 1,,P78504,P46531,1,⋯,1,1,1,0,ACSN;Baccin2019;BioGRID;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;DIP;DLRP_Cellinker;DLRP_talklr;EMBRACE;Fantom5_LRdb;HPMR;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;HPRD;HPRD_LRdb;HPRD_talklr;ICELLNET;IntAct;KEGG-MEDICUS;Kirouac2010;LRdb;Lit-BM-17;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;SignaLink3;UniProt_LRdb;Wang;connectomeDB2020;iTALK;talklr,ACSN:22330899;ACSN:22363130;Baccin2019:769772111006133;BioGRID:11006133;CellChatDB:22353464;CellPhoneDB:22353464;CellTalkDB:22353464;Cellinker:11006133;Cellinker:22353464;Cellinker:7697721;DIP:18660822;HPMR:7697721;HPRD:11006133;ICELLNET:15137944;ICELLNET:16921404;ICELLNET:18292500;IntAct:18660822;IntAct:23086448;LRdb:11;LRdb:7697721;Lit-BM-17:11006133;Lit-BM-17:18660822;Lit-BM-17:21820430;NetPath:11006133;SIGNOR:18660822;SPIKE:11006133;SPIKE:16713569;SPIKE:17537801;SPIKE:18495817;SignaLink3:10958687;SignaLink3:11006133;SignaLink3:18988627;SignaLink3:21071413;SignaLink3:23331499;connectomeDB2020:11006133;connectomeDB2020:7697721,36,20,27,LR
6,True,WNT5A_FZD2,WNT5A,Wnt family member 5A,FZD2,frizzled class receptor 2,,P41221,Q14332,1,⋯,1,1,1,0,Baccin2019;CellCall;CellChatDB;CellPhoneDB;CellPhoneDB_Cellinker;CellTalkDB;Cellinker;EMBRACE;Fantom5_LRdb;HPMR_Cellinker;HPMR_LRdb;HPMR_talklr;LRdb;NetPath;Ramilowski2015;Ramilowski2015_Baccin2019;SIGNOR;SPIKE;STRING_talklr;Wang;connectomeDB2020;iTALK;talklr,Baccin2019:9389482;CellPhoneDB:24032637;CellTalkDB:24032637;Cellinker:9389482;LRdb:9389482;NetPath:19910923;SIGNOR:19008118;SIGNOR:19910923;SIGNOR:2808370;SPIKE:19910923;connectomeDB2020:19910923;connectomeDB2020:9389482,12,5,16,LR


# process_adhesive_DB 

## Description
This function is designed for processing adhesive interactions, including handling swapped duplicated pairs. It allows manual curation by enabling the user to specify lists of genes annotated as ligands or receptors, including those categorized under other classes like transmitters or cell adhesion molecules. The function performs various filtering and re-assignment operations to correct misannotated pairs and ensure that the final database correctly represents interactions.

## Arguments
- `complete`: A dataframe representing the combined and processed dataset of interaction pairs.
- `annotation`: A dataframe containing annotated information for genes.
- `ligand_list`: A vector of genes manually annotated as ligands. Default is an empty list.
- `receptor_list`: A vector of genes manually annotated as receptors. Default is an empty list.

## Details
- The function begins by filtering ligand-receptor pairs based on the provided annotations and manually curated lists.
- It identifies reversed pairs and applies a series of transformations to correct them.
- Additional filtering steps are implemented to handle interactions involving genes that are annotated as receptors but listed as ligands and vice versa.
- The function also applies a lexicographical sort to keep gene pairs in a consistent order when no directionality can be determined (i.e., neither component of the interacting pair is classified as a ligand or a receptor).


## Returns
- A dataframe representing the processed database of adhesive interactions. This includes both manually curated annotations and corrected pairs, marked accordingly to distinguish adhesive vs true ligand-receptor interactions.


In [9]:
# Build the table of adhesive (non ligand-receptor) interactions.
#
# Arguments:
#   complete      - data frame of interaction pairs. The code reads the columns
#                   Pair.Name, Ligand, Ligand.Name, Receptor, Receptor.Name,
#                   consensus_direction and dup.
#   annotation    - data frame with columns `genesymbol` and `parent`.
#   ligand_list   - genes manually annotated as ligands (vector or list).
#   receptor_list - genes manually annotated as receptors (vector or list).
#
# Returns:
#   A data frame with `True_LR` (FALSE for every row) as first column. It
#   contains the resolved "swapped duplicate" pairs (A_B listed together with
#   B_A) followed by all remaining non-LR pairs.
#
# Notes:
#   - Only base R is used, so the function does not depend on dplyr being
#     attached.
#   - Swapped duplicates whose consensus_direction is neither 0 nor 1
#     (including NA) are not carried over.
process_adhesive_DB <- function(complete, annotation, ligand_list=list(), receptor_list=list()) {

    anno_ligands <- annotation$genesymbol[annotation$parent %in% "ligand"]
    anno_receptors <- annotation$genesymbol[annotation$parent %in% "receptor"]

    # Pairs that are ligand -> receptor, or receptor -> ligand, according to
    # the annotation; everything else is treated as adhesive.
    is_lr <- complete$Ligand %in% anno_ligands & complete$Receptor %in% anno_receptors
    is_rl <- complete$Ligand %in% anno_receptors & complete$Receptor %in% anno_ligands
    lr_pair_names <- complete$Pair.Name[is_lr | is_rl]

    adhesive_DB <- complete[!complete$Pair.Name %in% lr_pair_names, , drop = FALSE]
    # Sized explicitly: a length-one value cannot be assigned to a zero-row data.frame
    adhesive_DB["True_LR"] <- rep(FALSE, nrow(adhesive_DB))

    # Find reversed pairs: rows whose swapped name also occurs as a pair name
    reversed <- adhesive_DB[adhesive_DB$dup %in% adhesive_DB$Pair.Name, , drop = FALSE]
    adhesive_DB <- adhesive_DB[!adhesive_DB$Pair.Name %in% reversed$Pair.Name, , drop = FALSE]

    # Manual annotation of genes: receptors of these families are treated as ligands
    plexin_family <- as.vector(reversed$Receptor[grep("plexin", reversed$Receptor.Name)])
    neuroligin_family <- as.vector(reversed$Receptor[grep("neuroligin", reversed$Receptor.Name)])
    adam_family <- as.vector(reversed$Receptor[grep("ADAM", reversed$Receptor.Name)])

    # Genes in the Ligand column whose full name mentions "receptor"
    receptor_anno <- as.vector(reversed$Ligand[grep("receptor", reversed$Ligand.Name)])

    # Merge the caller-supplied lists with the derived ones. unlist/as.character
    # normalises the default `list()` (or any list input) to a character vector.
    ligand_list <- unique(c(as.character(unlist(ligand_list)),
                            plexin_family, neuroligin_family, adam_family))
    receptor_list <- unique(c(as.character(unlist(receptor_list)), receptor_anno))

    # Processing swapped duplicates
    reversed$dup <- paste(reversed$Receptor, reversed$Ligand, sep = "_")
    has_partner <- reversed$Pair.Name %in% reversed$dup

    # `%in%` never yields NA, so rows with a missing consensus_direction are
    # left out of both subsets instead of producing all-NA rows.
    dir <- reversed[has_partner & reversed$consensus_direction %in% 1, , drop = FALSE]
    no_dir <- reversed[has_partner & reversed$consensus_direction %in% 0, , drop = FALSE]

    # When one orientation is directed and the other is not, keep the directed one
    in_dir <- dir[dir$dup %in% no_dir$Pair.Name, , drop = FALSE]
    no_dir <- no_dir[!no_dir$dup %in% in_dir$Pair.Name, , drop = FALSE]
    dir <- dir[!dir$Pair.Name %in% in_dir$Pair.Name, , drop = FALSE]

    # Receptor column holds a gene annotated as ligand: keep the swapped partner
    wrong_lig <- no_dir[no_dir$Receptor %in% ligand_list, , drop = FALSE]
    correct_lig <- no_dir[no_dir$dup %in% wrong_lig$Pair.Name, , drop = FALSE]
    no_dir <- no_dir[!no_dir$Pair.Name %in% c(wrong_lig$Pair.Name, correct_lig$Pair.Name), , drop = FALSE]

    # Ligand column holds a gene annotated as receptor: keep the swapped partner.
    # Pairs where both orientations qualify are left for the lexicographic step.
    wrong_rec <- no_dir[no_dir$Ligand %in% receptor_list, , drop = FALSE]
    wrong_rec <- wrong_rec[!wrong_rec$Pair.Name %in% wrong_rec$dup, , drop = FALSE]
    correct_rec <- no_dir[no_dir$dup %in% wrong_rec$Pair.Name, , drop = FALSE]
    no_dir <- no_dir[!no_dir$Pair.Name %in% c(wrong_rec$Pair.Name, correct_rec$Pair.Name), , drop = FALSE]

    # Orientation-independent key for a pair name ("B_A" -> "A_B")
    sort_pairs <- function(pair) {
        parts <- strsplit(pair, "_")[[1]]
        paste(sort(parts), collapse = "_")
    }

    # Order rows by Ligand then Receptor and keep the first row of every
    # unordered pair. Columns are addressed by name rather than by position.
    keep_one_orientation <- function(df) {
        df <- df[order(df$Ligand, df$Receptor), , drop = FALSE]
        pair_key <- vapply(df$Pair.Name, sort_pairs, character(1), USE.NAMES = FALSE)
        df[!duplicated(pair_key), , drop = FALSE]
    }

    no_dir <- keep_one_orientation(no_dir)

    # Some of the swapped duplicates are directional both ways; for these we
    # also keep the lexicographic order.
    dir <- keep_one_orientation(dir)

    # Combine the resolved swapped duplicates with the remaining adhesive pairs
    subset_lr <- rbind(dir, no_dir, in_dir, correct_lig, correct_rec)
    subset_lr["True_LR"] <- rep(FALSE, nrow(subset_lr))

    adhesive_DB <- rbind(subset_lr, adhesive_DB)

    # Move True_LR to the first column
    adhesive_DB[, c("True_LR", setdiff(names(adhesive_DB), "True_LR")), drop = FALSE]
}

In [16]:
# Process the pairs that are not true LR interactions; both manual curation
# lists are left empty for this call.
adhesive_DB <- process_adhesive_DB(complete_data, annotation, ligand_list=list(), receptor_list=list())

In [17]:
head(adhesive_DB)

Unnamed: 0_level_0,True_LR,Pair.Name,Ligand,Ligand.Name,Receptor,Receptor.Name,complex_pair,source,target,is_directed,⋯,is_inhibition,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy
Unnamed: 0_level_1,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>
1,False,EGFR_ERBB2,EGFR,epidermal growth factor receptor,ERBB2,erb-b2 receptor tyrosine kinase 2,EGF_EGFR_ERBB2,P01133,COMPLEX:P00533_P04626,1,⋯,0,1,1,0,CellChatDB;Cellinker;DLRP_Cellinker;Guide2Pharma_Cellinker;HPMR_Cellinker;KEGG-MEDICUS,Cellinker:10788520;Cellinker:12093292;Cellinker:12297050;Cellinker:12620237;Cellinker:15620700;Cellinker:16274239;Cellinker:6289330,7,7,3,LR
2,False,IL1R1_IL1RAP,IL1R1,interleukin 1 receptor type 1,IL1RAP,interleukin 1 receptor accessory protein,IL1A_IL1R1_IL1RAP,P01583,COMPLEX:P14778_Q9NPH3,1,⋯,0,1,1,0,CellChatDB;Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:2946959;Cellinker:9820540;ICELLNET:20959797;ICELLNET:24332029,4,4,4,LR
3,False,KDR_NRP1,KDR,kinase insert domain receptor,NRP1,neuropilin 1,NRP1_FLT1_KDR,O14786,COMPLEX:P17948_P35968,1,⋯,0,1,1,0,CellChatDB-cofactors,,0,0,1,LR
4,False,TGFBR1_TGFBR2,TGFBR1,transforming growth factor beta receptor 1,TGFBR2,transforming growth factor beta receptor 2,TGFB1_TGFBR1_TGFBR2,P01137,COMPLEX:P36897_P37173,1,⋯,0,1,1,0,CellChatDB;CellPhoneDB;ICELLNET;KEGG-MEDICUS,ICELLNET:1333888;ICELLNET:7693660;ICELLNET:8242743,3,3,4,LR
5,False,ABCA1_SHANK1,ABCA1,ATP binding cassette subfamily A member 1,SHANK1,SH3 and multiple ankyrin repeat domains 1,,O95477,Q9Y566,1,⋯,0,0,0,0,Baccin2019;HPRD;Ramilowski2015_Baccin2019,HPRD:16192279,1,1,2,LR
6,False,ACVR2A_ACVRL1,ACVR2A,activin A receptor type 2A,ACVRL1,activin A receptor like type 1,BMP10_ACVR2A_ACVRL1,O95393,COMPLEX:P27037_P37023,1,⋯,0,0,0,0,CellChatDB;Cellinker;HPMR_Cellinker;ICELLNET,CellChatDB:25620979;CellChatDB:26893264;Cellinker:16049014;Cellinker:17068149;Cellinker:26893264;Cellinker:27252362;Cellinker:30246252;ICELLNET:17878607,8,7,3,LR


### Merge

In [18]:
 # Stack the true LR pairs on top of the adhesive pairs to form the final database.
 LR_database <- rbind(true_LR_DB, adhesive_DB)

In [19]:
str(LR_database)

'data.frame':	6916 obs. of  21 variables:
 $ True_LR              : logi  TRUE TRUE TRUE TRUE TRUE TRUE ...
 $ Pair.Name            : chr  "S100A10_TRPV6" "JAG2_NOTCH1" "DLL1_NOTCH1" "IGF1_IGF1R" ...
 $ Ligand               : chr  "S100A10" "JAG2" "DLL1" "IGF1" ...
 $ Ligand.Name          : chr  "S100 calcium binding protein A10" "jagged canonical Notch ligand 2" "delta like canonical Notch ligand 1" "insulin like growth factor 1" ...
 $ Receptor             : chr  "TRPV6" "NOTCH1" "NOTCH1" "IGF1R" ...
 $ Receptor.Name        : chr  "transient receptor potential cation channel subfamily V member 6" "notch receptor 1" "notch receptor 1" "insulin like growth factor 1 receptor" ...
 $ complex_pair         : chr  NA NA NA NA ...
 $ source               : chr  "P60903" "Q9Y219" "O00548" "P05019" ...
 $ target               : chr  "Q9H1D0" "P46531" "P46531" "P08069" ...
 $ is_directed          : num  1 1 1 1 1 1 1 1 1 1 ...
 $ is_stimulation       : num  1 1 1 1 1 1 1 1 1 1 ...
 $ is_inhibit

# auto_update_db 

## Description
This function automates all of the steps described above to update a ligand-receptor (LR) database. It allows for the selection and processing of either non-curated, curated, or both types of ligand-receptor interaction databases. The function encompasses several steps: importing the database, creating pairwise pairs, filtering pairs based on the protein-protein interaction (PPI) network, processing single components, mapping gene data, annotating components, processing the LR database, and handling adhesive pairs. Finally, it combines the true LR and adhesive databases into a single comprehensive LR database.

## Arguments
- `db_type`: A character string specifying the type of database. Acceptable values are `"noncurated"`, `"curated"`, and `"both"`.

## Details
- The function starts by importing the specified type of database using `import_db`.
- It then generates pairwise pairs of ligand-receptor interactions using `create_pairwise_pairs`.
- These pairs are filtered based on their presence in the PPI network through `filter_pairs_with_ppi`.
- The `process_single_components` function is used to filter out single components and reorder the data.
- Gene data is mapped onto the complete dataset using `map_gene_data`.
- The components are annotated with their parent category using `annotate_components`.
- The LR database is processed with `process_lr_db` to correct the direction of LR pairs.
- Adhesive pairs are handled using `process_adhesive_DB`, allowing for manual curation and correction of swapped duplicated pairs.
- The function concludes by combining the true LR and adhesive databases into a single dataset.

## Returns
- A comprehensive dataframe (`LR_database`) that combines true LR and adhesive pairs. This dataframe includes detailed information about each ligand-receptor pair, including annotations, gene data, and interaction types.

In [10]:
# Run the whole database-building pipeline in one call.
#
# Arguments:
#   db_type - "noncurated", "curated" or "both"; any other value stops with an
#             error before anything is imported.
#
# Returns:
#   The true LR pairs (process_lr_db) stacked on top of the adhesive pairs
#   (process_adhesive_DB) in a single data frame.
auto_update_db <- function(db_type) {
    # Genes that are manually annotated as ligands
    manual_ligands <- c("AGRN", "BMP2", "BMP4", "VTCN1", "CD244", "CD38", "GAS6", "GDNF", "GUCA2A",
                        "HHLA2", "IHH", "PSEN1", "NLGN", "NRTN", "RPH3A", "SHH", "FLT3LG")

    # Genes that are manually annotated as receptors
    manual_receptors <- c("CD2", "CD27", "CD80", "CD86", "SELL", "CD44", "CD81", "CD8A", "CLEC1B",
                          "GLG1", "TYROBP", "FLT3", "ERBB2", "EGFR", "IL1R1", "IL1RAP", "KDR", "NRP1")

    # Reject unknown database types up front
    is_known_type <- db_type == "noncurated" || db_type == "curated" || db_type == "both"
    if (!is_known_type) {
        stop("Invalid database type. Please choose 'noncurated', 'curated', or 'both'.")
    }
    raw_db <- import_db(as.character(db_type))

    # Expand into pairwise pairs and keep those found in the PPI network
    ppi_pairs <- filter_pairs_with_ppi(create_pairwise_pairs(raw_db))
    print("Number of PPI network interactions found:")
    print(nrow(ppi_pairs))

    # Assemble the complete pair table and annotate its components
    pair_table <- map_gene_data(process_single_components(raw_db, ppi_pairs))
    gene_annotation <- annotate_components(pair_table)

    # Split into true LR pairs and adhesive pairs, then stack them
    true_pairs <- process_lr_db(pair_table, gene_annotation)
    adhesive_pairs <- process_adhesive_DB(pair_table, gene_annotation, manual_ligands, manual_receptors)

    rbind(true_pairs, adhesive_pairs)
}

In [11]:
# Run the full pipeline with db_type "both" (curated and non-curated databases).
LR_database <- auto_update_db("both")

[1] "Number of pairs found"
[1] "Number of PPI network interactions found:"
[1] 1262


“If this function fails, it may be due to internet connectivity issues. Try running it again.”
Querying chunk 1

Querying chunk 2

Querying chunk 3



Finished
Pass returnall=TRUE to return lists of duplicate or missing query terms.


Interleukin 2 (IL2) is a ligand that interacts with a complex molecule composed of three distinct receptors: interleukin 2 receptor subunit alpha (IL2RA), interleukin 2 receptor subunit beta (IL2RB), and interleukin 2 receptor subunit gamma (IL2RG).

Each interaction between IL2 and these receptors is directed, forming distinct ligand-receptor pairs. These interactions are crucial for the biological function of IL2, involving individual binding to IL2RA, IL2RB, and IL2RG.

Additionally, the receptors (IL2RA, IL2RB, IL2RG) also interact among themselves, enhancing their combined affinity for IL2. These receptor-receptor interactions are categorized as adhesive interactions, representing a network of communication between the receptors.


**Affinity levels of IL2 - Receptor complexes**

| Pair                         | Description              |
|------------------------------|--------------------------|
| IL2_IL2RA                    | Low affinity receptor    |
| IL2_IL2RA_IL2RB              | Medium affinity receptor |
| IL2_IL2RA_IL2RB_IL2RG        | High affinity receptor   |



The output below provides a detailed view of these interactions.

In [12]:
filter(LR_database, complex_pair == "IL2_IL2RA_IL2RB_IL2RG")

Unnamed: 0_level_0,True_LR,Pair.Name,Ligand,Ligand.Name,Receptor,Receptor.Name,complex_pair,source,target,is_directed,⋯,consensus_direction,consensus_stimulation,consensus_inhibition,sources,references,curation_effort,n_references,n_resources,annotation_strategy,dup
Unnamed: 0_level_1,<lgl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<int>,<chr>,<chr>
3306,True,IL2_IL2RA,IL2,interleukin 2,IL2RA,interleukin 2 receptor subunit alpha,IL2_IL2RA_IL2RB_IL2RG,P60568,COMPLEX:P01589_P14784_P31785,1,⋯,1,1,0,Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:15265897;ICELLNET:15546386,2,2,3,both,IL2RA_IL2
3307,True,IL2_IL2RB,IL2,interleukin 2,IL2RB,interleukin 2 receptor subunit beta,IL2_IL2RA_IL2RB_IL2RG,P60568,COMPLEX:P01589_P14784_P31785,1,⋯,1,1,0,Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:15265897;ICELLNET:15546386,2,2,3,both,IL2RB_IL2
3308,True,IL2_IL2RG,IL2,interleukin 2,IL2RG,interleukin 2 receptor subunit gamma,IL2_IL2RA_IL2RB_IL2RG,P60568,COMPLEX:P01589_P14784_P31785,1,⋯,1,1,0,Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:15265897;ICELLNET:15546386,2,2,3,both,IL2RG_IL2
28341,False,IL2RA_IL2RG,IL2RA,interleukin 2 receptor subunit alpha,IL2RG,interleukin 2 receptor subunit gamma,IL2_IL2RA_IL2RB_IL2RG,P60568,COMPLEX:P01589_P14784_P31785,1,⋯,1,1,0,Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:15265897;ICELLNET:15546386,2,2,3,both,IL2RG_IL2RA
28351,False,IL2RB_IL2RA,IL2RB,interleukin 2 receptor subunit beta,IL2RA,interleukin 2 receptor subunit alpha,IL2_IL2RA_IL2RB_IL2RG,P60568,COMPLEX:P01589_P14784_P31785,1,⋯,1,1,0,Cellinker;DLRP_Cellinker;HPMR_Cellinker;ICELLNET;KEGG-MEDICUS,Cellinker:15265897;ICELLNET:15546386,2,2,3,both,IL2RA_IL2RB


In [13]:
# LR_database[LR_database$complex_pair == 'IL2_IL2RA_IL2RB_IL2RG',]