#### Data preparation

In [1]:
# Install required packages
# BiocManager is the installer for Bioconductor packages: fetch it from CRAN
# only when it cannot be loaded, then use it to install biomaRt.
if (!require("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}

BiocManager::install("biomaRt")

SyntaxError: invalid syntax (405335517.py, line 2)

##### Gene ID annotations for human interactome from Gysi et al., 2021

In [None]:
library(biomaRt)
# Connect to the Ensembl "genes" mart for the human dataset and download the
# gene annotation attributes listed below for every gene.
ensembl <- useEnsembl(biomart = "genes", dataset = "hsapiens_gene_ensembl")
annotation_attributes <- c(
  "ensembl_gene_id",
  "entrezgene_id",
  "entrezgene_accession",
  "description",
  "gene_biotype"
)
# Author's note on the size of the result: Total:67472
entrezGeneId_2_entrezGeneAcc <- getBM(
  attributes = annotation_attributes,
  mart = ensembl
)

# Collect every Entrez ID that appears on either side of an interaction.
ppi <- read.csv("Data/Barabasi_CoV2_PPI.csv", header = TRUE)
entrez_ids <- unique(c(ppi$proteinA_entrezid, ppi$proteinB_entrezid))

# Keep only the annotation rows of genes present in the interactome, and only
# the two columns needed to label the network nodes.
in_interactome <- entrezGeneId_2_entrezGeneAcc$entrezgene_id %in% entrez_ids
entrezGeneId_2_entrezGeneAcc <- entrezGeneId_2_entrezGeneAcc[
  in_interactome,
  c("entrezgene_id", "entrezgene_accession")
]

write.csv(
  entrezGeneId_2_entrezGeneAcc,
  "Data/Barabasi_CoV2_PPI_nodeAnnot.csv",
  row.names = FALSE
)

##### STRING network for exercise

In [None]:
library(STRINGdb)
library(igraph)
library(org.Hs.eg.db)


# Read the Therapeutic Target Database (TTD) Covid-19 drug targets
# The file is parsed as three tab-separated columns (V1, V2, V3). The first 24
# lines are skipped (presumably a header/legend block -- confirm against the
# downloaded file); quoting is disabled and short rows are padded.
TTD_data <- read.table(
  "Data/COVID19-Target-Data.txt",
  sep = "\t",
  skip = 24,
  fill = TRUE,
  quote = "",
  header = FALSE
)

# Only these four record types (values of V2) are needed.
wanted_fields <- c("GENENAME", "UNIPROAC", "TARGNAME", "TARGSTAT")
TTD_data <- TTD_data[TTD_data$V2 %in% wanted_fields, ]

# Long -> wide: one row per V1 identifier, one "V3.<field>" column per V2 value.
TTD_data <- reshape(
  TTD_data,
  direction = "wide",
  idvar = c("V1"),
  timevar = c("V2"),
  v.names = c("V3")
)

# Keep the targets whose UNIPROAC entry contains "_HUMAN".
is_human_target <- grepl("_HUMAN", TTD_data$V3.UNIPROAC)
TTD_data <- TTD_data[is_human_target, ]

# Distinct, non-missing gene name entries; an entry may list several genes
# separated by "; " or ", ", so split them into individual names.
TTD_Covid_targets <- unique(na.exclude(TTD_data$V3.GENENAME))
TTD_Covid_targets <- unlist(strsplit(TTD_Covid_targets, split = "; |, "))

# Drop the "NO-GeName" placeholder and replace the free-text "cullin 2" with
# the symbol CUL2.
is_placeholder <- TTD_Covid_targets %in% c("NO-GeName")
TTD_Covid_targets <- TTD_Covid_targets[!is_placeholder]
TTD_Covid_targets <- gsub("cullin 2", "CUL2", TTD_Covid_targets)

# Extract whole STRING network for human
# STRING release 11.5, NCBI taxonomy ID 9606.
STRING_db <- STRINGdb$new(version = "11.5", species = 9606)

# Table of the org.Hs.egSYMBOL annotation map; its "symbol" column is the key
# used to look up STRING identifiers.
symbol_map <- as.data.frame(org.Hs.egSYMBOL)
genes <- as.data.frame(symbol_map, stringsAsFactors = FALSE)

# Attach a STRING_id to every symbol, discarding symbols STRING cannot map.
STRING_ids <- STRING_db$map(
  my_data_frame = genes,
  my_data_frame_id_col_names = "symbol",
  removeUnmappedRows = TRUE
)

# Fetch the interactions among all mapped proteins and build an undirected
# graph from the resulting edge table.
STRING_ixns <- STRING_db$get_interactions(STRING_ids$STRING_id)
STRING_human_PPI <- graph_from_data_frame(STRING_ixns, directed = FALSE)


# Map the Covid targets to STRING_id
# Keep the rows of the symbol -> STRING_id table whose gene symbol is one of
# the TTD Covid-19 targets.
TTD_Covid_targets_STRING_ids <- STRING_ids[STRING_ids$symbol %in% TTD_Covid_targets, ]


# Extract the network of Covid targets
# The induced subgraph keeps only the interactions whose two endpoints are
# both Covid targets.
STRING_Covid_net <- induced_subgraph(
  STRING_human_PPI,
  V(STRING_human_PPI)[V(STRING_human_PPI)$name %in% TTD_Covid_targets_STRING_ids$STRING_id]
)

# Disabled alternative: build the network from the order-1 neighbourhood of the
# targets instead. Every line of the call must stay commented out; leaving the
# continuation lines uncommented makes the whole cell fail to parse.
# STRING_Covid_net <- ego(graph = STRING_human_PPI,
#                         order = 1,
#                         nodes = V(STRING_human_PPI)[V(STRING_human_PPI)$name %in% TTD_Covid_targets_STRING_ids$STRING_id])
# STRING_Covid_net <- induced_subgraph(STRING_human_PPI, unlist(STRING_Covid_net))

# Size of the resulting network: vertices, edges, connected components.
vcount(STRING_Covid_net)
ecount(STRING_Covid_net)
count_components(STRING_Covid_net)


# Flatten the Covid-target subgraph into an edge table (one row per edge).
STRING_Covid_net <- as_data_frame(STRING_Covid_net, what = "edges")

# Look up the gene symbol of both endpoints through the STRING_id mapping.
from_row <- match(STRING_Covid_net$from, STRING_ids$STRING_id)
to_row <- match(STRING_Covid_net$to, STRING_ids$STRING_id)
STRING_Covid_net$from_geneSymbol <- STRING_ids$symbol[from_row]
STRING_Covid_net$to_geneSymbol <- STRING_ids$symbol[to_row]

# Export the endpoints, their symbols and the STRING combined score.
export_columns <- c("from", "from_geneSymbol", "to", "to_geneSymbol", "combined_score")
write.csv(
  STRING_Covid_net[, export_columns],
  "Data/STRING_Covid_net.csv",
  row.names = FALSE
)



##### Exercise network

In [None]:
library(igraph)
library(tidyr)

# Read network
# Load the interactome edge list and turn it into an undirected graph.
Barabasi_Net <- read.csv("Data/Barabasi_CoV2_PPI.csv")
Barabasi_Net <- graph_from_data_frame(Barabasi_Net, directed = FALSE)


# Read list of human genes interacting with SARS-Cov-2
SarsCov_targets <- read.csv("Data/SARSCoV2_Targets.csv")

# Create subnetwork with the covid targets
Barabasi_SarsCov_Net <- induced_subgraph(
  Barabasi_Net,
  V(Barabasi_Net)[V(Barabasi_Net)$name %in% SarsCov_targets$EntrezID]
)

# Remove self-loops; simplify() also merges multiple edges by default.
Barabasi_SarsCov_Net <- simplify(Barabasi_SarsCov_Net, remove.loops = TRUE)

# Remove isolated nodes (degree 0 after simplification).
isolated <- names(which(degree(Barabasi_SarsCov_Net) == 0))
Barabasi_SarsCov_Net <- delete_vertices(Barabasi_SarsCov_Net, isolated)

# Store each vertex's local clustering coefficient as a vertex attribute;
# isolates = "zero" reports 0 instead of NaN where it is undefined.
Barabasi_SarsCov_Net <- set_vertex_attr(
  Barabasi_SarsCov_Net,
  name = "clustering_coefficient",
  value = transitivity(Barabasi_SarsCov_Net, type = "local", isolates = "zero")
)

# Keep only the vertices with a clustering coefficient above 0.5 and record
# their names as the selected SARS-CoV-2 genes.
Barabasi_SarsCov_Net <- induced_subgraph(
  Barabasi_SarsCov_Net,
  V(Barabasi_SarsCov_Net)[V(Barabasi_SarsCov_Net)$clustering_coefficient > 0.5]
)
selected_SarsCov_genes <- V(Barabasi_SarsCov_Net)$name


# Read disease genes association
# Each line of the file is tab-separated: the 2nd field is used as the disease
# name and every field from the 3rd onwards as an associated gene.
gene_disease_fields <- strsplit(
  readLines("Data/Guney2016_GenesDisease.tsv"),
  "\t"
)

# Skip lines with fewer than three fields (blank lines, diseases without
# genes): indexing them with 3:length(x) would count backwards and yield
# bogus "NA" / disease-name entries in the gene column.
gene_disease_fields <- gene_disease_fields[lengths(gene_disease_fields) >= 3]

# Build the whole table at once (one row per disease, genes joined by ";")
# instead of growing a data frame with rbind() inside a loop.
disease_gene_links <- data.frame(
  disease = vapply(gene_disease_fields, function(fields) fields[2], character(1)),
  genes = vapply(
    gene_disease_fields,
    function(fields) paste(fields[-(1:2)], collapse = ";"),
    character(1)
  )
)

# Expand to one row per (disease, gene) pair.
disease_gene_links <- separate_rows(disease_gene_links, "genes", sep = ";")

# Get disease genes from selected distant diseases from SARS-Cov-2
distant_diseases <- c(
  "peroxisomal disorders",
  "cardiomyopathy, hypertrophic",
  "anemia",
  "sarcoma"
)
in_distant_disease <- disease_gene_links$disease %in% distant_diseases
dis_genes_select <- unique(disease_gene_links$genes[in_distant_disease])

# Create network for exercise
# Induced subgraph of the full interactome on the selected SARS-CoV-2 genes
# plus the genes of the distant diseases, exported as an edge table.
exercise_genes <- c(selected_SarsCov_genes, dis_genes_select)
Exercise_Net <- induced_subgraph(
  Barabasi_Net,
  V(Barabasi_Net)[V(Barabasi_Net)$name %in% exercise_genes]
)
Exercise_Net <- as_data_frame(Exercise_Net, what = "edges")
write.csv(Exercise_Net, "Exercise_PPI_Net.csv", row.names = FALSE)


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=567e54c1-8e1f-4560-8621-8b304ec1f642' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>