In [None]:
import pyreadr
import pandas as pd
import numpy as np


# Directory holding the per-cell-type expression matrices (RDS files).
DATA_DIR = "/data/sr933/scRCC"

# Cell-type prefixes of the RDS files, in the order they are loaded.
# This order also determines the order of `filepaths` below.
CELL_TYPES = ["RCC", "Endo", "B", "CD4", "CD8", "Epi", "Fibro", "Mye", "NK"]

# Map each cell type to its RDS file so the directory and filename pattern
# are written once instead of being repeated in nine string literals.
matrix_paths = {
    cell_type: f"{DATA_DIR}/{cell_type}_cells_expression_matrix.rds"
    for cell_type in CELL_TYPES
}

# Individual path variables, kept so existing references keep working.
rcc_filepath = matrix_paths["RCC"]
endo_filepath = matrix_paths["Endo"]
b_filepath = matrix_paths["B"]
cd4_filepath = matrix_paths["CD4"]
cd8_filepath = matrix_paths["CD8"]
epi_filepath = matrix_paths["Epi"]
fibro_filepath = matrix_paths["Fibro"]
mye_filepath = matrix_paths["Mye"]
nk_filepath = matrix_paths["NK"]

# List of file paths (dicts preserve insertion order, so this follows CELL_TYPES)
filepaths = list(matrix_paths.values())

# Load one RDS file and hand back the first object stored in it
def extract_df(rds_filepath):
    """Read an RDS file with pyreadr and return the first object it contains.

    Parameters
    ----------
    rds_filepath : str
        Path of the RDS file, passed straight to ``pyreadr.read_r``.

    Returns
    -------
    The first value of the mapping returned by ``pyreadr.read_r``
    (presumably a pandas DataFrame -- confirm against the pyreadr docs).

    Raises
    ------
    IndexError
        If the mapping returned by ``pyreadr.read_r`` has no entries.
    """
    loaded_objects = list(pyreadr.read_r(rds_filepath).values())
    return loaded_objects[0]

# Extract all DataFrames into a dictionary keyed by cell type.
# The key is the file-name prefix before the first underscore, e.g.
# ".../RCC_cells_expression_matrix.rds" -> "RCC".
# Slicing the full path instead (filepath[:3]) gives "/da" for every file,
# so every entry would overwrite the previous one and only the last matrix
# would remain in the dictionary.
dataframes = {
    filepath.rsplit("/", 1)[-1].split("_", 1)[0]: extract_df(filepath)
    for filepath in filepaths
}

# Two files mapping to the same key would silently drop a matrix; fail loudly instead.
assert len(dataframes) == len(filepaths), "duplicate cell-type keys derived from file names"


    

In [None]:
# Gather the row index of every DataFrame that was loaded (None entries are ignored)
all_indices = []
for frame in dataframes.values():
    if frame is not None:
        all_indices.append(frame.index)

# Intersect the indices one after another, starting from the first one,
# to keep only the labels present in every DataFrame
common_indices = all_indices[0]
for other_index in all_indices[1:]:
    common_indices = common_indices.intersection(other_index)

# Restrict each DataFrame to the shared labels (rows come back in common_indices order)
filtered_dataframes = {}
for key, frame in dataframes.items():
    if frame is None:
        continue
    filtered_dataframes[key] = frame.loc[common_indices]

# Index-sorted copies of the filtered DataFrames
sorted_dataframes = {}
for key, frame in filtered_dataframes.items():
    sorted_dataframes[key] = frame.sort_index()

# Report how many labels are shared
n_common = len(common_indices.tolist())
print("Common Indices:", n_common)


In [None]:
# Gene list in the order of common_indices. The filtered DataFrames were built
# with df.loc[common_indices], so (for unique index labels) their rows follow
# this same order. Note: this is NOT the sorted order of sorted_dataframes.
gene_list = common_indices.tolist()

# Collect one matrix per DataFrame plus one integer label per column.
# Labels are assigned 0, 1, 2, ... in the iteration order of filtered_dataframes.
matrices = []
y_labels = []
for current_label, (key, df) in enumerate(filtered_dataframes.items()):
    data = df.to_numpy()  # Convert DataFrame to NumPy array
    matrices.append(data)
    y_labels.extend([current_label] * data.shape[1])  # One label per column of this DataFrame

# Without any input there is nothing to stack; fail with a clear message
# instead of an AttributeError on None further down.
if not matrices:
    raise ValueError("filtered_dataframes is empty; nothing to combine")

# Concatenate once along columns; concatenating inside the loop would re-copy
# the growing matrix on every iteration.
X_combined = np.concatenate(matrices, axis=1)

# Convert y_labels to a NumPy array
y_labels = np.array(y_labels)

# Rows must line up with gene_list and columns with y_labels; a mismatch
# (e.g. from duplicated index labels) would silently mislabel the data.
assert X_combined.shape == (len(gene_list), len(y_labels)), (
    f"X_combined shape {X_combined.shape} does not match "
    f"({len(gene_list)} genes, {len(y_labels)} labels)"
)

# Output results
print("Gene list:", gene_list)
print("X_combined shape:", X_combined.shape)
print("y_labels shape:", y_labels.shape)

In [None]:
import os
import pickle

# Define the path where you want to save the .pkl file
output_path = "/data/sr933/scRCC/combined_data/RCC_data_dict.pkl"

# Bundle the combined matrix, the per-column labels and the gene list
data_dict = {"X": X_combined, "y": y_labels, "Genes": gene_list}

# Make sure the target directory exists; open(..., "wb") raises
# FileNotFoundError if any parent directory is missing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Save the dictionary as a .pkl file
with open(output_path, "wb") as f:
    pickle.dump(data_dict, f)



