## Notebook for Tauc 2021 (Drosophila) anndata file creation 
### Developed by: Anna Maguza

### Institute of Computational Biology - Computational Health Centre - Hemlholtz Munich

### 2nd April 2023

#### Load packages

In [1]:
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as an
import h5py
from scipy.io import mmread
from scipy.sparse import coo_matrix
import matplotlib.pyplot as plt
import scipy as sci

In [2]:
import rpy2

#### Setup Cells

In [3]:
%matplotlib inline

In [4]:
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.logging.print_header()
sc.settings.set_figure_params(dpi=80, facecolor='white')

  from .autonotebook import tqdm as notebook_tqdm


scanpy==1.9.3 anndata==0.8.0 umap==0.5.3 numpy==1.23.5 scipy==1.10.1 pandas==1.3.5 scikit-learn==1.2.2 statsmodels==0.13.5 pynndescent==0.5.8


# GSE157775

## Data Upload

In [5]:
# Upload barcodes, features, cells annotations and gene expression matrix
barcodes = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Interspecies_analysis/Drosophila/Tauc_2021/GSE157775/GSE157775_barcodes.tsv', sep='\t', header=None)
cell_annotations = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Interspecies_analysis/Drosophila/Tauc_2021/GSE157775/GSE157775_cell_annotation.csv', header=None)
features = pd.read_csv('/Users/anna.maguza/Desktop/Data/Gut_project/Interspecies_analysis/Drosophila/Tauc_2021/GSE157775/GSE157775_features.tsv', sep='\t', header=None)
matrix = mmread('/Users/anna.maguza/Desktop/Data/Gut_project/Interspecies_analysis/Drosophila/Tauc_2021/GSE157775/GSE157775_matrix.mtx').T.tocoo()

In [6]:
# Make column 0 as index in barcodes and rename it to 'cell_id'
barcodes = barcodes.set_index(0)
barcodes.index.name = 'cell_id'

# Make column 0 as index in features and rename it to 'gene_id'
features = features.set_index(0)
features.index.name = 'FlyBase_ID'
#rename column 1 to 'gene_name'
features = features.rename(columns={1: 'gene_name'})
features = features.rename(columns={2: 'gene_type'})


In [7]:
#Make the first row as column names
cell_annotations.columns = cell_annotations.iloc[0]
cell_annotations = cell_annotations.drop(cell_annotations.index[0])
# Make column 0 as index in cell_annotations and rename it to 'cell_id'
cell_annotations = cell_annotations.set_index('cellID')

In [8]:
#Create anndata object
adata = an.AnnData(X=matrix, obs=barcodes, var=features)

  adata = an.AnnData(X=matrix, obs=barcodes, var=features)


In [9]:
adata.obs

AAACCTGAGAAACCAT-1
AAACCTGAGAAACCGC-1
AAACCTGAGAAACCTA-1
AAACCTGAGAAACGAG-1
AAACCTGAGAAACGCC-1
...
TTTGTCATCTTTACAC-5
TTTGTCATCTTTACGT-5
TTTGTCATCTTTAGGG-5
TTTGTCATCTTTAGTC-5
TTTGTCATCTTTCCTC-5


In [28]:
import pandas as pd

import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import anndata

In [18]:
readRDS = robjects.r['readRDS']

In [12]:
df = readRDS('/Users/anna.maguza/Desktop/Data/Gut_project/Interspecies_analysis/Drosophila/Tauc_2021/GSE157775/GSE157775_sce.Rds')

R[write to console]: Lade nötiges Paket: SingleCellExperiment

R[write to console]: Lade nötiges Paket: SummarizedExperiment

R[write to console]: Lade nötiges Paket: MatrixGenerics

R[write to console]: Lade nötiges Paket: matrixStats

R[write to console]: 
Attache Paket: ‘MatrixGenerics’


R[write to console]: Die folgenden Objekte sind maskiert von ‘package:matrixStats’:

    colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
    colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
    colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
    colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
    colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
    colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
    colWeightedMeans, colWeightedMedians, colWeightedSds,
    colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
    rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
    rowCumsums, rowDiffs, rowIQRDi

In [22]:
# Get the names of the elements in the R object
object_names = robjects.r['names'](df)

In [23]:
# Print the names of the elements
print("Elements inside the R object:")
for name in object_names:
    print("-", name)

Elements inside the R object:
- FBgn0000003
- FBgn0000008
- FBgn0000014
- FBgn0000015
- FBgn0000017
- FBgn0000018
- FBgn0000024
- FBgn0000028
- FBgn0000032
- FBgn0000036
- FBgn0000037
- FBgn0000038
- FBgn0000039
- FBgn0000042
- FBgn0000043
- FBgn0000044
- FBgn0000045
- FBgn0000046
- FBgn0000047
- FBgn0000052
- FBgn0000053
- FBgn0000054
- FBgn0000055
- FBgn0000056
- FBgn0000057
- FBgn0000063
- FBgn0000064
- FBgn0000071
- FBgn0000075
- FBgn0000077
- FBgn0000078
- FBgn0000079
- FBgn0000083
- FBgn0000084
- FBgn0000092
- FBgn0000094
- FBgn0000097
- FBgn0000099
- FBgn0000100
- FBgn0000108
- FBgn0000109
- FBgn0000114
- FBgn0000115
- FBgn0000116
- FBgn0000117
- FBgn0000119
- FBgn0000137
- FBgn0000139
- FBgn0000140
- FBgn0000146
- FBgn0000147
- FBgn0000150
- FBgn0000152
- FBgn0000153
- FBgn0000157
- FBgn0000158
- FBgn0000163
- FBgn0000173
- FBgn0000179
- FBgn0000180
- FBgn0000181
- FBgn0000182
- FBgn0000183
- FBgn0000210
- FBgn0000212
- FBgn0000216
- FBgn0000221
- FBgn0000228
- FBgn0000229
- FB

In [24]:
# Check if it's a list or a data.frame
if df.rclass[0] == "data.frame":
    print("The object is a data.frame")
    df_py = pandas2ri.ri2py(df)
    print("\nPython DataFrame:")
    print(df_py.head())
elif df.rclass[0] == "list":
    print("The object is a list")
    for name in object_names:
        element = df.rx2(name)
        print("\nElement:", name)
        print("Type:", element.rclass[0])
        if element.rclass[0] == "data.frame":
            element_py = pandas2ri.ri2py(element)
            print("Python DataFrame:")
            print(element_py.head())
        else:
            print("Value:", element)
else:
    print("The object is of type:", df.rclass[0])
    print("Value:", df)

The object is of type: SingleCellExperiment
Value: class: SingleCellExperiment 
dim: 12872 12149 
metadata(0):
assays(2): counts logcounts
rownames(12872): FBgn0000003 FBgn0000008 ... FBgn0267794 FBgn0267795
rowData names(12): is_feature_control is_feature_control_mito ...
  symbol desc
colnames(12149): CELL904410 CELL904498 ... CELL2662877 CELL2662890
colData names(13): orig.ident nCount_RNA ... batch age
reducedDimNames(0):
mainExpName: NULL
altExpNames(0):

