In [3]:
import os

import numpy as np
import pandas as pd
import scipy.io

In [11]:
# Build the junction-by-barcode count table from the matrix-market triplet in
# `in_dir` and keep only junctions detected (non-zero) in at least `min_cell`
# barcodes. Zeros are written as missing values in the output table.
in_dir = 'result/dataset1/StarSolo_mapping/dataset1_StarSolo_mapping/SJ/raw/'
min_cell = 5

barcodes = pd.read_csv(os.path.join(in_dir, 'barcodes.tsv'), header=None, sep='\t')
barcodes = barcodes[0].tolist()  # convert to a plain list of column labels

features = pd.read_csv(os.path.join(in_dir, 'features.tsv'), header=None, sep='\t')
# Row label is built from the first three feature columns joined with ':'.
features['coord.intron'] = features[0].astype(str) + ':' + features[1].astype(str) + ':' + features[2].astype(str)

# Read the sparse matrix; CSR gives cheap per-row counts and row slicing.
matrix = scipy.io.mmread(os.path.join(in_dir, 'matrix.mtx')).tocsr()
if matrix.shape != (len(features), len(barcodes)):
    raise ValueError(
        'matrix.mtx has shape {} but features.tsv/barcodes.tsv imply {}'.format(
            matrix.shape, (len(features), len(barcodes))
        )
    )

# Count non-zero cells per junction directly on the sparse matrix. Explicitly
# stored zeros are removed first so they are not counted as detections.
matrix.eliminate_zeros()
non_missing_count = pd.Series(np.diff(matrix.indptr), index=features['coord.intron'])
keep = (non_missing_count >= min_cell).to_numpy()

# Only the retained junctions are densified; the full dense matrix is
# intentionally never materialised because it can be very large.
dense_kept = matrix[keep].toarray().astype(float)
dense_kept[dense_kept == 0] = np.nan
filtered_expression_matrix = pd.DataFrame(
    dense_kept,
    index=features.loc[keep, 'coord.intron'],
    columns=barcodes,
)

out_file = 'result/marvel/dataset1/SJ.txt'
os.makedirs(os.path.dirname(out_file), exist_ok=True)
filtered_expression_matrix.to_csv(out_file, sep='\t')

In [17]:
def prepare_meta(f='dataset1/srr_2_cell_type.txt',
                 out_file='result/marvel/dataset1_SJ_phenoData.txt'):
    """Write a tab-separated sample metadata table derived from `f`.

    Parameters
    ----------
    f : str
        Tab-separated file with a header row; its first column is used as the
        index and it must contain the columns ``cell_id`` and ``cell_type``.
    out_file : str
        Destination of the metadata table. Missing parent directories are
        created. NOTE(review): the default sits directly under
        ``result/marvel/`` rather than ``result/marvel/dataset1/`` - confirm
        this is the intended location.

    The output has the columns ``sample.id`` (from ``cell_id``), ``cell.type``
    (from ``cell_type``), ``sample.type`` (constant ``'Single Cell'``) and
    ``qc.seq`` (constant ``'pass'``). Nothing is returned.
    """
    df = pd.read_table(f, index_col=0, header=0)

    meta = df[['cell_id', 'cell_type']].copy()
    meta.columns = ['sample.id', 'cell.type']
    meta['sample.type'] = 'Single Cell'
    meta['qc.seq'] = 'pass'

    # Create the destination directory so a fresh checkout can run this cell.
    out_dir = os.path.dirname(out_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    meta.to_csv(out_file, index=False, sep='\t')
    
# Write the metadata table using the function's default input file.
prepare_meta()

In [35]:
# BAM files referenced by the two list files written in prepare_rmats().
bam_solo = 'result/dataset1/StarSolo_mapping/dataset1_Aligned.out.bam'
bam_1_cell = 'result/dataset1/mapping/sample1/sample1_Aligned.out.bam'


def prepare_rmats():
    """Write one BAM-list text file per sample group.

    Each file holds a single line: the BAM path followed by a comma.
    `bam_solo` goes to the sample_1 list and `bam_1_cell` to the sample_2
    list (presumably the two input lists handed to rMATS - confirm against
    the command that consumes them). The output directory must already exist.
    """
    list_targets = (
        ('result/marvel/dataset1/rMATS_out//path_to_BAM_sample_1.txt', bam_solo),
        ('result/marvel/dataset1/rMATS_out//path_to_BAM_sample_2.txt', bam_1_cell),
    )
    for list_file, bam_path in list_targets:
        with open(list_file, 'w') as handle:
            handle.write(bam_path + ',\n')
        
# Write the two BAM-list files (one BAM path per file).
prepare_rmats()

In [10]:
def prepare_bed_file(file_path, chr_order=None):
    """Build a sorted, de-duplicated table of (chr, upstreamEE, downstreamES).

    Parameters
    ----------
    file_path : str
        Tab-separated file with a header row containing at least the columns
        ``chr``, ``strand``, ``upstreamEE`` and ``downstreamES``.
    chr_order : sequence of str, optional
        Chromosome names in the desired sort order. Defaults to ``chr1`` ..
        ``chr22`` followed by ``chrX``. Rows on any chromosome not listed
        here are removed from the result; a warning naming them is printed.

    Returns
    -------
    pandas.DataFrame
        Columns ``chr`` (ordered categorical), ``upstreamEE`` and
        ``downstreamES``, sorted by chromosome then ``upstreamEE``. Only rows
        with strand ``+`` or ``-`` are considered; duplicates and rows with
        missing values are dropped. The index is not renumbered after those
        drops.
    """
    if chr_order is None:
        chr_order = [f"chr{i}" for i in range(1, 23)] + ["chrX"]
    else:
        chr_order = list(chr_order)

    # Read the file as a tab-separated table with a header row.
    df = pd.read_csv(file_path, sep="\t", header=0)

    # Strip a leading "X" from column names (presumably left over from files
    # round-tripped through R - confirm). Any column starting with "X" is hit.
    df.columns = df.columns.str.replace("^X", "", regex=True)

    # Keep the coordinate columns for "+" rows followed by "-" rows; rows with
    # any other strand value are left out.
    coord_cols = ['chr', 'upstreamEE', 'downstreamES']
    df = pd.concat(
        [df.loc[df['strand'] == strand, coord_cols] for strand in ("+", "-")],
        ignore_index=True,
    )

    # Values outside `chr_order` turn into NaN in the categorical below and are
    # then removed by dropna(); report them instead of losing them silently.
    unlisted = df['chr'].notna() & ~df['chr'].isin(chr_order)
    if unlisted.any():
        dropped = sorted(df.loc[unlisted, 'chr'].astype(str).unique())
        print("Warning: dropping {} row(s) on chromosomes not in chr_order: {}".format(
            int(unlisted.sum()), ", ".join(dropped)))

    # An ordered categorical makes the sort follow `chr_order` rather than
    # plain string order.
    df['chr'] = pd.Categorical(df['chr'], categories=chr_order, ordered=True)
    df = df.sort_values(by=['chr', 'upstreamEE']).reset_index(drop=True)

    # Sanity check: every interval should have a positive length.
    if not (df['downstreamES'] > df['upstreamEE']).all():
        print("Warning: Some downstreamES values are not greater than upstreamEE.")

    # Remove duplicate rows, then rows with missing values.
    df = df.drop_duplicates()
    df = df.dropna()

    return df

In [4]:
# Build the coordinate table from the RI annotation file and save it.
# NOTE(review): the column header row is written to the .bed file - confirm
# that the downstream consumer accepts a header line.
file_path = "result/marvel/dataset1/rMATS_out/fromGTF.RI.txt"
df = prepare_bed_file(file_path)
df.to_csv(
    'result/marvel/dataset1/RI_Coordinates.bed',
    index=False,
    sep='\t',
)