In [1]:
import os
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.font_manager
import seaborn as sns
import anndata
import scanpy as sc
import re
import decoupler as dc
import sc_toolbox
import random
import seaborn.objects as so

In [2]:
data_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/data/"
newdata_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_analysis/data/"
plot_dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_out/final_plots/"

sys.path = sys.path + ["/well/immune-rep/users/vbw431/python_utils/"]

In [3]:
import rpy2.rinterface_lib.callbacks
import anndata2ri
import logging
import rpy2.robjects.lib.ggplot2 as gp
from rpy2.robjects import pandas2ri
from rpy2.robjects import r
from rpy2.ipython.ggplot import image_png

#sc.settings.verbosity = 0
rpy2.rinterface_lib.callbacks.logger.setLevel(logging.ERROR)

pandas2ri.activate()
anndata2ri.activate()
%load_ext rpy2.ipython



In [4]:
%%R
.libPaths(c(paste0("/well/immune-rep/users/vbw431/conda/ivybridge/envs/scverse_new/r_modules"), .libPaths()))

library(tidyverse)
library(Seurat)


cur.dir = "/well/immune-rep/users/vbw431/Projects/Peppa/data/DIMITRA_FASTQ/"
work.dir = "/well/immune-rep/users/vbw431/Projects/Peppa/"
out.dir = "/well/immune-rep/users/vbw431/Projects/Peppa/new_analysis/out/"
references = "/well/immune-rep/users/vbw431/reference/reference/refdata-gex-GRCh38-2020-A/"
com.out = "/well/immune-rep/users/vbw431/Projects/Peppa/reference_combat/"

##plotting
library(ggplot2); theme_set(theme_bw(base_size = 18)+
                              theme(strip.text = element_text(colour = 'black', face="bold",size=12), 
                                    panel.grid.major = element_blank(), 
                                    panel.grid.minor = element_blank(),
                                    panel.border = element_rect(size = 0.7),
                                    axis.ticks.length=unit(.10, "cm"),
                                    axis.ticks = element_line(size=0.7),
                                    strip.background = element_blank()))



-- Attaching core tidyverse packages ------------------------ tidyverse 2.0.0 --
v dplyr     1.1.2     v readr     2.1.4
v forcats   1.0.0     v stringr   1.5.0
v lubridate 1.9.2     v tibble    3.2.1
v purrr     1.0.1     v tidyr     1.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x purrr::%@%()                 masks rlang::%@%()
x lazyeval::as_name()          masks rlang::as_name()
x lazyeval::call_modify()      masks rlang::call_modify()
x lazyeval::call_standardise() masks rlang::call_standardise()
x lazyeval::expr_label()       masks rlang::expr_label()
x lazyeval::expr_text()        masks rlang::expr_text()
x lazyeval::f_env()            masks rlang::f_env()
x lazyeval::f_env<-()          masks rlang::f_env<-()
x lazyeval::f_label()          masks rlang::f_label()
x lazyeval::f_lhs()            masks rlang::f_lhs()
x lazyeval::f_lhs<-()          masks rlang::f_lhs<-()
x lazyeval::f_rhs()            masks rlang::f_rhs()
x lazyeval::f_rhs<-() 

## Load Seurat data and Prepare for DGE

In [5]:
%%R 

##load, normalise  and trim to remove pre_treatment replicates and post vaccine samples from azimuth

peppa <- readRDS(paste0(out.dir, "/peppa_pbmc_all.rds"))




In [6]:
%%R 
unique(peppa@meta.data$Ref_lab)

[1] "Peppa"   "Azimuth"


In [7]:
%%R 
##filter pre vaccination samples and pre treatment samples out
peppa <- subset(peppa, subset = Treatment_status != "Pre_treatment")
peppa <- subset(peppa, subset = Ref_lab == "Peppa")


print(dim(peppa))



[1]  23776 118312


In [8]:
%%R
unique(peppa@meta.data$Treatment_status)

[1] "On_PrEP"      "On_treatment"


In [9]:
%%R
print(table(peppa@meta.data$celltype_consensus.l3))


                 ASDC          B.CSW.Memory            B.DN.ITGAX 
                   35                  1714                   544 
              B.Naive        B.Naive.Mitohi     B.Naive.TCLA1_low 
                 4470                   159                  2294 
        B.Plasmablast         B.UNSW.Memory            CD4.Mitohi 
                  239                  3539                  1534 
            CD4.Naive               CD4.TCM               CD4.TEM 
                15557                 11832                  1645 
            CD4.TEMRA              CD4.Treg             CD8.Naive 
                 2942                  1805                  8629 
              CD8.TCM               CD8.TEM          CD8.TEM.MPEC 
                 1361                  2771                   142 
        CD8.TEM.NF4A2        CD8.TEMRA.CMC1      CD8.TEMRA.FCGR3A 
                 4046                  3051                   814 
  CD8.TEMRA.FCGR3Alow                  HSPC                  

In [10]:
%%R

head(peppa@meta.data)

peppa_meta <- peppa@meta.data %>%
select(nCount_RNA, nFeature_RNA, sample_ID, nCount_HTO, 
       nFeature_HTO, Hashtag.1, Hashtag.2, Classification,
       solo_doublet_prob, solo_singlet_prob, solo_classification, 
       scanpy_index, 
       percent.mt, 
       percent.ribo, percent.Ig, 
      sample_id, disease_group, 
       study_ID, Age,
       celltype_consensus,
       celltype_consensus.l1,
       celltype_consensus.l2,
       celltype_consensus.l3,
       barcodes, 
       scanpy_index
      ) %>%
dplyr::rename(library_name = sample_id) %>%
mutate(title = gsub("_Hashtag.*","",scanpy_index)) %>%
mutate(celltype_consensus.l2 = gsub("Prolifering", "Proliferating", celltype_consensus.l2)) %>%
mutate(celltype_consensus.l3 = gsub("Prolifering", "Proliferating", celltype_consensus.l3))



head(peppa_meta)



                            nCount_RNA nFeature_RNA sample_ID nCount_HTO
AAACCTGTCGCATGGC-1-HIVHBV_C   2508.859         1259  HIVHBV_C        260
AAACGGGAGATATACG-1-HIVHBV_C   2921.987         2053  HIVHBV_C        733
AAACGGGAGTTACCCA-1-HIVHBV_C   2819.482         2175  HIVHBV_C        570
AAACGGGCATCGGTTA-1-HIVHBV_C   2773.429         1649  HIVHBV_C       1193
AAAGATGTCTCGCTTG-1-HIVHBV_C   2699.850         1657  HIVHBV_C       3162
AAAGCAAAGCGTTCCG-1-HIVHBV_C   2513.127         1455  HIVHBV_C        744
                            nFeature_HTO Hashtag.1 Hashtag.2 Classification
AAACCTGTCGCATGGC-1-HIVHBV_C            2        55       205      Hashtag.2
AAACGGGAGATATACG-1-HIVHBV_C            2       221       512      Hashtag.2
AAACGGGAGTTACCCA-1-HIVHBV_C            2       115       455      Hashtag.2
AAACGGGCATCGGTTA-1-HIVHBV_C            2       347       846      Hashtag.2
AAAGATGTCTCGCTTG-1-HIVHBV_C            2      3104        58      Hashtag.1
AAAGCAAAGCGTTCCG-1-HIVHBV_C      

In [11]:
%%R
print(table(peppa_meta$celltype_consensus.l3))


                 ASDC          B.CSW.Memory            B.DN.ITGAX 
                   35                  1714                   544 
              B.Naive        B.Naive.Mitohi     B.Naive.TCLA1_low 
                 4470                   159                  2294 
        B.Plasmablast         B.UNSW.Memory            CD4.Mitohi 
                  239                  3539                  1534 
            CD4.Naive               CD4.TCM               CD4.TEM 
                15557                 11832                  1645 
            CD4.TEMRA              CD4.Treg             CD8.Naive 
                 2942                  1805                  8629 
              CD8.TCM               CD8.TEM          CD8.TEM.MPEC 
                 1361                  2771                   142 
        CD8.TEM.NF4A2        CD8.TEMRA.CMC1      CD8.TEMRA.FCGR3A 
                 4046                  3051                   814 
  CD8.TEMRA.FCGR3Alow                  HSPC                  

In [12]:
%%R
print(peppa_meta %>% select(library_name, title, Age, Classification) %>%
      mutate(organism = "Homo Sapiens", 
            tissue = "Peripheral blood mononuclear cells", 
            read_type = "paired-end",
            description = "10x Genomics",
            processed_data_file.1 = paste0(library_name, ".csv")
            )%>% 
      unique())

                                library_name        title Age Classification
AAACCTGTCGCATGGC-1-HIVHBV_C        control_1     HIVHBV_C  46      Hashtag.2
AAAGATGTCTCGCTTG-1-HIVHBV_C       patient_10     HIVHBV_C  44      Hashtag.1
AAACCTGAGACTGTAA-1-HH_Tre_4        patient_4     HH_Tre_4  46      Hashtag.1
AAACCTGAGGGAGTAA-1-HH_Tre_4        patient_5     HH_Tre_4  57      Hashtag.2
AAACCTGCAATGAATG-1-HBV_Tre_1       patient_1    HBV_Tre_1  56      Hashtag.2
AAACCTGCACAAGACG-1-HIVHBV_Tre_5    patient_6 HIVHBV_Tre_5  32      Hashtag.2
AAACCTGAGAAACCGC-1-HBV_Tre_3       patient_3    HBV_Tre_3  55      Hashtag.2
AAACCTGAGTGGGATC-1-HBV_X2          patient_7       HBV_X2  37      Hashtag.2
AAACCTGGTCATCGGC-1-HBV_C           control_1        HBV_C  46      Hashtag.2
AAAGTAGGTGTAAGTA-1-HBV_C          patient_11        HBV_C  44      Hashtag.1
AAACCTGCACTTCGAA-1-HIVHBV_X2       patient_8    HIVHBV_X2  69      Hashtag.1
AAAGCAAAGATAGTCA-1-HIVHBV_X2       patient_9    HIVHBV_X2  42      Hashtag.2

## Segment matrices

In [13]:
%%R
library(DropletUtils)


In [14]:
%%R

peppa_list <- SplitObject(peppa, split.by = "sample_id")

print(names(peppa_list))


 [1] "control_1"  "patient_10" "patient_4"  "patient_5"  "patient_1" 
 [6] "patient_6"  "patient_3"  "patient_7"  "patient_11" "patient_8" 
[11] "patient_9"  "patient_2" 


In [15]:
%%R

lapply(names(peppa_list), function(x){
    system(paste0("mkdir /well/immune-rep/users/vbw431/Projects/Peppa/GEO/", x))
})

[[1]]
[1] 0

[[2]]
[1] 0

[[3]]
[1] 0

[[4]]
[1] 0

[[5]]
[1] 0

[[6]]
[1] 0

[[7]]
[1] 0

[[8]]
[1] 0

[[9]]
[1] 0

[[10]]
[1] 0

[[11]]
[1] 0

[[12]]
[1] 0



In [17]:
%%R

lapply(names(peppa_list), function(x){
    write10xCounts(peppa_list[[x]]@assays$RNA@counts, path = paste0("/well/immune-rep/users/vbw431/Projects/Peppa/GEO/", x, "/"))
})

[[1]]
[1] TRUE

[[2]]
[1] TRUE

[[3]]
[1] TRUE

[[4]]
[1] TRUE

[[5]]
[1] TRUE

[[6]]
[1] TRUE

[[7]]
[1] TRUE

[[8]]
[1] TRUE

[[9]]
[1] TRUE

[[10]]
[1] TRUE

[[11]]
[1] TRUE

[[12]]
[1] TRUE



In [20]:
%%R

updatemeta <- peppa_meta %>% select(library_name, title, Age, Classification) %>%
      mutate(organism = "Homo Sapiens", 
            tissue = "Peripheral blood mononuclear cells", 
            read_type = "paired-end",
            description = "10x Genomics",
            processed_data_file.1 = paste0(library_name, "_barcodes.tsv"),
            processed_data_file.2 = paste0(library_name, "_genes.tsv"),
            processed_data_file.3 = paste0(library_name, "_matrix.mtx")

            )%>% 
      unique()



In [22]:
%%R
updatemeta
write.csv(updatemeta, "/well/immune-rep/users/vbw431/Projects/Peppa/GEO/metadata.csv")

In [25]:
%%R 
file_list <- list.files("/well/immune-rep/users/vbw431/Projects/Peppa/GEO/", recursive = T, full.names=T)



In [40]:
%%R

var1 <- file_list[!grepl("metadata", file_list)]


In [42]:
%%R

# Split the vector using forward slash as the delimiter
split_vector <- str_split(var1, "/")

# Create a dataframe from the split_vector
df <- data.frame(do.call(rbind, split_vector))
var2 <- paste0(gsub("/[^/]*$", "/", var1),df$X10, "_", df$X11)

rename_df <- data.frame(var1, var2)


In [45]:
%%R


for (i in 1:nrow(rename_df)){
    system(paste0("mv ", rename_df$var1[i], " ", rename_df$var2[i]))
}


In [46]:
%%R

head(peppa_meta)
write.csv(peppa_meta, "/well/immune-rep/users/vbw431/Projects/Peppa/GEO/dataset_metadata.csv")

In [47]:
%%R
head(peppa_meta)


                            nCount_RNA nFeature_RNA sample_ID nCount_HTO
AAACCTGTCGCATGGC-1-HIVHBV_C   2508.859         1259  HIVHBV_C        260
AAACGGGAGATATACG-1-HIVHBV_C   2921.987         2053  HIVHBV_C        733
AAACGGGAGTTACCCA-1-HIVHBV_C   2819.482         2175  HIVHBV_C        570
AAACGGGCATCGGTTA-1-HIVHBV_C   2773.429         1649  HIVHBV_C       1193
AAAGATGTCTCGCTTG-1-HIVHBV_C   2699.850         1657  HIVHBV_C       3162
AAAGCAAAGCGTTCCG-1-HIVHBV_C   2513.127         1455  HIVHBV_C        744
                            nFeature_HTO Hashtag.1 Hashtag.2 Classification
AAACCTGTCGCATGGC-1-HIVHBV_C            2        55       205      Hashtag.2
AAACGGGAGATATACG-1-HIVHBV_C            2       221       512      Hashtag.2
AAACGGGAGTTACCCA-1-HIVHBV_C            2       115       455      Hashtag.2
AAACGGGCATCGGTTA-1-HIVHBV_C            2       347       846      Hashtag.2
AAAGATGTCTCGCTTG-1-HIVHBV_C            2      3104        58      Hashtag.1
AAAGCAAAGCGTTCCG-1-HIVHBV_C      