In [12]:
import scanpy as sc
import pandas as pd
import tarfile
import gzip
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor

# Path to your .tar file
tar_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW.tar'

# Path to your Excel sheet
excel_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Supplementary.xlsx'

# Function to read gzipped text content
def read_gzipped_text(file):
    with gzip.open(file, 'rt') as f:
        content = f.read()
    return content

# Load tissue_barcode info from the Excel sheet
tissue_barcode_df = pd.read_excel(excel_file_path)

# Create an empty DataFrame to store counts data
counts_df = pd.DataFrame()

# Function to load counts from a gzipped text file
def load_counts(member, tar):
    gzipped_content = tar.extractfile(member).read()
    member_counts_df = pd.read_csv(BytesIO(gzip.decompress(gzipped_content)), sep='\t', index_col=0)
    return member_counts_df.T

# Load counts from the .tar file using ThreadPoolExecutor
with tarfile.open(tar_file_path, 'r') as tar:
    with ThreadPoolExecutor(max_workers=4) as executor:
        # List of future objects for parallel execution
        futures = [executor.submit(load_counts, member, tar) for member in tar.getmembers() if member.name.endswith('.txt.gz')]
        
        # Extract results from futures as they complete
        for future in futures:
            member_counts_df = future.result()
            counts_df = counts_df.append(member_counts_df)

# Reset the index of the counts DataFrame
counts_df.reset_index(drop=True, inplace=True)

# Create an AnnData object for Scanpy
adata = sc.AnnData(X=counts_df.values, obs=tissue_barcode_df)
# Now you can perform various analyses using Scanpy on the 'adata' object
# For example, you can perform clustering, dimensionality reduction, etc.

# Save the AnnData object to a file if needed
#sc.write('your_output.h5ad', adata)


KeyboardInterrupt



In [35]:
import scanpy as sc
import pandas as pd
import tarfile
import gzip
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path

# Path to the extracted folder containing individual count files
extracted_folder_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201445_RAW'

# Path to the Excel sheet
excel_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Supplementary.xlsx'

# Function to read gzipped text content
def read_gzipped_text(file):
    with gzip.open(file, 'rt') as f:
        content = f.read()
    return content

# Load tissue_barcode info from the Excel sheet
tissue_barcode_df = pd.read_excel(excel_file_path)

# Create an empty DataFrame to store counts data
counts_df = pd.DataFrame()

# Function to load counts from a gzipped text file
def load_counts(file_path):
    # Extract the sample ID from the file name
    sample_id = file_path.stem
    
    # Read the gzipped content into a DataFrame
    member_counts_df = pd.read_csv(BytesIO(gzip.decompress(read_gzipped_text(file_path))), sep='\t', index_col=0)
    
    # Transpose the DataFrame to have samples as columns
    member_counts_df = member_counts_df.T
    
    # Add a column for the sample ID
    member_counts_df['SampleID'] = sample_id
    
    return member_counts_df


# Load counts from individual files in the extracted folder using ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=4) as executor:
    # List of future objects for parallel execution
    futures = [executor.submit(load_counts, file_path) for file_path in Path('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201445_RAW').glob('*.txt.gz')]
    print(futures)
    # Extract results from futures as they complete
    for future in futures:
        member_counts_df = future.result()
        counts_df = counts_df.append(member_counts_df)

# Reset the index of the counts DataFrame
counts_df.reset_index(drop=True, inplace=True)

# Create an AnnData object for Scanpy
adata = sc.AnnData(X=counts_df.values[:, :-1], obs=tissue_barcode_df)

# Now you can perform various analyses using Scanpy on the 'adata' object
# For example, you can perform clustering, dimensionality reduction, etc.

# Save the AnnData object to a file if needed
#sc.write('your_output.h5ad', adata)


[]




ValueError: Observations annot. `obs` must have number of rows of `X` (0), but has 299 rows.

In [4]:
import pandas as pd

# Specify the correct sheet name or index
sheet_name = 'Tissue barcode'  # Replace with the actual sheet name

# Read the Excel file
try:
    df = pd.read_excel('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_Supplementary_Table_Tissue_barcode_seq_mode_v2.xlsx', sheet_name=sheet_name)
    # Now you can work with the DataFrame 'df'
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: File is not a zip file


In [9]:
from pathlib import Path
import pandas as pd

file_path = Path("/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Supplementary.xlsx")
file_extension = file_path.suffix.lower()[1:]

if file_extension == 'xlsx':
    df = pd.read_excel(file_path, engine='openpyxl')
elif file_extension == 'xls':
    df = pd.read_excel(file_path)
elif file_extension == 'csv':
    df = pd.read_csv(file_path)
else:
    raise Exception("File not supported")


In [17]:
tissue_barcode_df

Unnamed: 0,Tissue barcode base,Tissue barcode Base location,Unnamed: 2,Unnamed: 3,Tissue barcode,Tissue barcode.1,Tissue barcode.2,Tissue barcode.3,Tissue barcode.4,Tissue barcode.5,...,Tissue barcode.8,Tissue barcode.9,Tissue barcode.10,Tissue barcode.11,Tissue barcode.12,Tissue barcode.13,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21
0,"NEO1,NEO2",Read1:42-51,,Batch,Brain,Lung,Cloacal chamber,Liver,Intestine,Stomach,...,Heart,Eye,Kidney,Spleen,Skin,Gill,,,,
1,"NEO3,NEO4,META1,META2",Read1:19-28,,Neotenic_1(NEO1),AACTGCTGCC,GGAGCGGCCG,AACTGCTGCC,GAAGGTTGCC,GCGCGGTAGT,AGACTCAAGC,...,GGATGCAGCA,TATTACTCAT,GAACTGCATC,GGAGCGGCCG,TTACGTATAC,CGGATGAAGG,,,,
2,,,,Neotenic_1(NEO1),TGCGCGATGC,CGCGTACGAC,TGCGCGATGC,GTTGAAGGAT,CTGGATTAGT,GCAGGCGACG,...,CCGCTATATT,AACTGATCTT,ACTCTCTCAA,CGCGTACGAC,ACTTAACTAG,TATCGTCGGC,,,,
3,,,,Neotenic_1(NEO1),ATTGAGATTG,CGATGGCGCC,ATTGAGATTG,TGCGCCAGAA,TTGGATCCTT,AATACTCTTC,...,ATCGAGTCGC,CCGCGGACCG,GTCGCTCAGT,CGATGGCGCC,GCAGACCGGT,GCCGTATGCT,,,,
4,,,,Neotenic_1(NEO1),TTGATATATT,TGGTATTCAT,TTGATATATT,CGAATAATTC,TTGGAATCTC,CCAACTAACC,...,GCGACGCAGA,AATACGCAGG,CCGTCGGAGG,TGGTATTCAT,TGAGTCCAGA,CTGAACTGGT,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,,,,Metamorphosed_2(META2),CGAATAATTC,,,,,,...,,,TTGGAATCTC,,,,,,,
295,,,,Metamorphosed_2(META2),GCGACGCCTT,,,,,,...,,,ACCTGGACGC,,,,,,,
296,,,,Metamorphosed_2(META2),ATCAACGATT,,,,,,...,,,GCGTTCAGCT,,,,,,,
297,,,,Metamorphosed_2(META2),GTTCTGAATT,,,,,,...,,,TTAGCAATAA,,,,,,,


In [21]:
result = load_counts('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/')

AttributeError: 'str' object has no attribute 'stem'

In [37]:

result = load_counts(Path('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6063999_NEO1_1_count.txt.gz'))

TypeError: a bytes-like object is required, not 'str'

In [31]:
files = list(Path('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW').glob('*.txt.gz'))

In [32]:
files

[PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064093_NEO1_95_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064567_META1_46_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064336_NEO2_141_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064532_META1_11_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064036_NEO1_38_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064500_NEO4_74_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/GSM6064007_NEO1_9_count.txt.gz'),
 PosixPath('/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW/G

In [29]:
extracted_folder_path

'/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201445_RAW/'

In [38]:
import scanpy as sc
import pandas as pd
import tarfile
from io import BytesIO

# Path to your .tar file
tar_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW.tar'

# Path to your Excel sheet
excel_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Supplementary.xlsx'

# Load counts from the .tar file
with tarfile.open(tar_file_path, 'r') as tar:
    # Assuming the counts file is named 'counts.csv' within the tar archive
    counts_csv = tar.extractfile('counts.csv')

    # Load counts into a pandas DataFrame
    counts_df = pd.read_csv(counts_csv)

# Load tissue_barcode info from the Excel sheet
tissue_barcode_df = pd.read_excel(excel_file_path)

# Create an AnnData object for Scanpy
adata = sc.AnnData(X=counts_df.values.T, obs=tissue_barcode_df)

# Now you can perform various analyses using Scanpy on the 'adata' object
# For example, you can perform clustering, dimensionality reduction, etc.

# Save the AnnData object to a file if needed
#sc.write('your_output.h5ad', adata)

KeyError: "filename 'counts.csv' not found"

In [None]:
import scanpy as sc
import pandas as pd
import tarfile
import gzip
from io import BytesIO

# Path to your .tar file
tar_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/GSE201446_RAW.tar'

# Path to your Excel sheet
excel_file_path = '/home/nikvaku/snic2022-6-312/LabMemberScratchDir/Nikhilesh/Raw_data/Supplementary.xlsx'

# Function to read gzipped text content
def read_gzipped_text(file):
    with gzip.open(file, 'rt') as f:
        content = f.read()
    return content

# Function to load counts from a gzipped text file
def load_counts(file_content):
    # Read the gzipped content into a DataFrame
    member_counts_df = pd.read_csv(BytesIO(gzip.decompress(file_content)), sep='\t', index_col=0)
    
    # Transpose the DataFrame to have samples as columns
    member_counts_df = member_counts_df.T
    
    return member_counts_df

# Load tissue_barcode info from the Excel sheet
tissue_barcode_df = pd.read_excel(excel_file_path)

# Create an empty DataFrame to store counts data
counts_df = pd.DataFrame()

# Load counts from individual files in the tar archive
with tarfile.open(tar_file_path, 'r') as tar:
    # List all file names in the tar archive
    file_names = tar.getnames()
    
    # Filter files based on the pattern
    relevant_files = [file_name for file_name in file_names if file_name.endswith('.txt.gz')]
    
    # Loop through relevant files and load counts
    for file_name in relevant_files:
        # Extract the file content
        file_content = tar.extractfile(file_name).read()
        
        # Extract the sample ID from the file name
        sample_id = file_name.split('_')[1].split('.')[0]
        
        # Load counts into a DataFrame
        member_counts_df = load_counts(file_content)
        
        # Add a column for the sample ID
        member_counts_df['SampleID'] = sample_id
        
        # Concatenate the DataFrame to the main counts DataFrame
        counts_df = pd.concat([counts_df, member_counts_df], ignore_index=True)

# Reset the index of the counts DataFrame
counts_df.reset_index(drop=True, inplace=True)

# Create an AnnData object for Scanpy
adata = sc.AnnData(X=counts_df.values[:, :-1], obs=tissue_barcode_df)

# Now you can perform various analyses using Scanpy on the 'adata' object
# For example, you can perform clustering, dimensionality reduction, etc.

# Save the AnnData object to a file if needed
# sc.write('your_output.h5ad', adata)
