In [34]:
import csv
import pandas as pd
from itertools import islice
import h5py
import numpy as np
from scipy import sparse
import re
import scanpy as sc

## Enhancer

In [53]:
# Configuration for the enhancer run.
# `type_data` selects the input CSV and is embedded in the output file names;
# `data_path` is the results directory used when writing the H5 file.
# NOTE: the promoter configuration cell further down reassigns both names, so
# on a top-to-bottom run the later values win. Execute only one of the two
# configuration cells.
type_data = 'enhancer'
data_path = '../../../results_scbasset/enhancer/'

## Promoter

In [62]:
# Configuration for the promoter run (the directory and file names use the
# spelling 'promotor', so the strings below must stay as they are).
# NOTE: the enhancer configuration cell above assigns the same two names; this
# cell overrides it when the notebook is run from top to bottom.
type_data = 'promotor'
data_path = '../../../results_scbasset/promotor/'

# Import data

In [63]:
# Input table for the dataset selected by `type_data`.
file = f'../../../results_scbasset/data/{type_data}.csv'

# Comma-separated file whose first line holds the column names.
data = pd.read_csv(
    file,
    sep=',',
    header=0,
)

# Leave the frame as the last expression so the notebook renders it.
data

Unnamed: 0,Region,s15t1p2sq1,s15t2p2sq1,s15t3p2sq1,s15t4p2sq1,s15t5p2sq1,s15t6p2sq1,s15t7p2sq1,s15t8p2sq1,s1t1p1sq1,...,MDAMB231crmdrep1,MDAMB231crmdrep2,Me16CcdHMECdRepd1,Me16CcdHMECdRepd2,ZRd75d1dRepd3,ZRd75d1dRepd4,ZRd75d1Repd1,ZRd75d1Repd2,ZRd75d30Repd1,ZRd75d30Repd2
0,chr1:629175-629265:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,110.266253,98.925682
1,chr1:629806-629930:+,0.000000,0.259874,0.000000,0.000000,0.000000,0.000000,0.159497,0.0,0.144932,...,0.431663,0.148921,0.000000,0.317584,0.971207,0.267795,1.280505,2.318278,8.685275,4.856002
2,chr1:630223-630315:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.148921,0.000000,0.000000,0.223255,0.154746,0.000000,0.000000,0.218153,0.000000
3,chr1:631294-631361:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,20.443834,17.180723,0.182066,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,chr1:632231-632378:+,0.000000,0.000000,0.000000,0.000000,0.049271,0.049293,0.000000,0.0,0.000000,...,0.000000,0.046636,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.457854,8.808877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24693,chrUn_KI270438v1:109820-110002:-,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.128586,0.093273,1.253179,2.591921,4.932149,6.839729,2.579147,2.878873,4.958816,7.729800
24694,chrUn_KI270744v1:6128-6215:-,1.317066,0.789345,0.092825,0.188534,0.096010,0.306186,0.000000,0.0,0.443378,...,0.000000,0.000000,20.900049,31.272995,0.823424,0.577287,1.099023,0.219339,0.000000,0.662847
24695,chrUn_KI270744v1:105258-105259:-,0.000000,0.000000,0.046413,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
24696,chrUn_KI270744v1:112243-112244:-,0.524793,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


## Completing the chromosomal coordinates

In [64]:
def adjust_coordinates(position):
    """
    Return a single chromosomal coordinate with an explicit end position.

    The function works on ONE coordinate string (it is meant to be passed to
    ``Series.apply``); it neither reads nor modifies a DataFrame.

    Args:
    - position (str): Coordinate such as 'chr1:100-200', 'chr1:100-200:+',
      'chr1:100' or 'chr1:100:-'.

    Returns:
    - str: `position` unchanged when it already starts with
      'chr<name>:<start>-<end>'; otherwise the coordinate with the start
      repeated as the end ('chr1:100' -> 'chr1:100-100'). A strand suffix
      (':+' or ':-') directly after the start is kept.

    Raises:
    - ValueError: If `position` does not start with 'chr<name>:<start>'.
    """
    full_format_regex = r'(chr[\w]+):(\d+)-(\d+)(:[\+\-])?'

    # Already complete: return it untouched.
    if re.match(full_format_regex, position):
        return position

    # End is missing: capture chromosome, start and the optional strand.
    partial_match = re.match(r'(chr[\w]+):(\d+)(:[\+\-])?', position)
    if partial_match is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise ValueError(f"Cannot parse chromosomal coordinate: {position!r}")

    chromosome, start, strand = partial_match.groups()
    # `strand` is None when absent; keep it otherwise so the completed
    # coordinate carries the same information as a full-format one.
    return f"{chromosome}:{start}-{start}{strand or ''}"

In [65]:
# Work on an independent copy. `pd.DataFrame(data)` does not copy a DataFrame
# input by default, and on pandas versions where both objects share storage
# the `insert` below would also add the column to `data`, which later cells
# slice by position. NOTE(review): the exact sharing behaviour depends on the
# installed pandas version; `.copy()` is safe on all of them and also makes
# this cell re-runnable.
df_adjust_coordinates = data.copy()

# Insert the completed coordinates at column position 1, right after 'Region'.
df_adjust_coordinates.insert(1, 'adjusted_coordinate', data['Region'].apply(adjust_coordinates))

df_adjust_coordinates

Unnamed: 0,Region,adjusted_coordinate,s15t1p2sq1,s15t2p2sq1,s15t3p2sq1,s15t4p2sq1,s15t5p2sq1,s15t6p2sq1,s15t7p2sq1,s15t8p2sq1,...,MDAMB231crmdrep1,MDAMB231crmdrep2,Me16CcdHMECdRepd1,Me16CcdHMECdRepd2,ZRd75d1dRepd3,ZRd75d1dRepd4,ZRd75d1Repd1,ZRd75d1Repd2,ZRd75d30Repd1,ZRd75d30Repd2
0,chr1:629175-629265:+,chr1:629175-629265:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,110.266253,98.925682
1,chr1:629806-629930:+,chr1:629806-629930:+,0.000000,0.259874,0.000000,0.000000,0.000000,0.000000,0.159497,0.0,...,0.431663,0.148921,0.000000,0.317584,0.971207,0.267795,1.280505,2.318278,8.685275,4.856002
2,chr1:630223-630315:+,chr1:630223-630315:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.148921,0.000000,0.000000,0.223255,0.154746,0.000000,0.000000,0.218153,0.000000
3,chr1:631294-631361:+,chr1:631294-631361:+,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,20.443834,17.180723,0.182066,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,chr1:632231-632378:+,chr1:632231-632378:+,0.000000,0.000000,0.000000,0.000000,0.049271,0.049293,0.000000,0.0,...,0.000000,0.046636,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,12.457854,8.808877
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24693,chrUn_KI270438v1:109820-110002:-,chrUn_KI270438v1:109820-110002:-,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.128586,0.093273,1.253179,2.591921,4.932149,6.839729,2.579147,2.878873,4.958816,7.729800
24694,chrUn_KI270744v1:6128-6215:-,chrUn_KI270744v1:6128-6215:-,1.317066,0.789345,0.092825,0.188534,0.096010,0.306186,0.000000,0.0,...,0.000000,0.000000,20.900049,31.272995,0.823424,0.577287,1.099023,0.219339,0.000000,0.662847
24695,chrUn_KI270744v1:105258-105259:-,chrUn_KI270744v1:105258-105259:-,0.000000,0.000000,0.046413,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
24696,chrUn_KI270744v1:112243-112244:-,chrUn_KI270744v1:112243-112244:-,0.524793,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


# Create BED file (peak coordinates)

In [66]:
def create_bed_file(df_peak_position, output_file='peaks.bed'):
    """
    Creates a BED file from peak coordinates held in a pandas Series.

    Each coordinate is expected to contain 'chr<name>:<start>-<end>'; anything
    after the end position (for example a ':+' strand suffix) is ignored.
    The file has three tab-separated columns (chromosome, start, end), no
    header and no index. Start and end are written exactly as they appear in
    the coordinate strings; no coordinate-system conversion is applied.

    Args:
    - df_peak_position (pd.Series): A pandas Series of coordinate strings.
    - output_file (str): The output BED file name. Default is 'peaks.bed'.

    Raises:
    - ValueError: If at least one coordinate cannot be parsed. Nothing is
      written in that case.
    """
    # Extract the information of chromosome, start, end.
    bed_data = df_peak_position.str.extract(r'(chr[\w]+):(\d+)-(\d+)', expand=True)

    # Rows that do not match come back as NaN and would be written as blank
    # lines, silently corrupting the BED file; refuse to write instead.
    unparsed = bed_data.isna().any(axis=1)
    if unparsed.any():
        examples = df_peak_position[unparsed].head(5).tolist()
        raise ValueError(
            f"{int(unparsed.sum())} coordinate(s) could not be parsed as "
            f"'chr<name>:<start>-<end>', e.g. {examples}"
        )

    # Write into a BED file.
    bed_data.to_csv(output_file, sep='\t', index=False, header=False)

    print(f"Successfully created BED file: {output_file}")

In [67]:
# Completed peak coordinates, one entry per row of the table.
df_peak_position = df_adjust_coordinates.loc[:, 'adjusted_coordinate']

# Write the BED file for whichever dataset `type_data` currently selects.
output_csv = f'../../../results_scbasset/{type_data}/peaks_{type_data}.bed'
create_bed_file(df_peak_position, output_file=output_csv)

Successfully created BED file: ../../../results_scbasset/promotor/peaks_promotor.bed


In [68]:
# Read the BED file back to inspect what was written: tab-separated, no header
# row, so the column names are supplied here.
peak = pd.read_csv(
    output_csv,
    sep='\t',
    header=None,
    names=['chr', 'start', 'end'],
)
peak

Unnamed: 0,chr,start,end
0,chr1,629175,629265
1,chr1,629806,629930
2,chr1,630223,630315
3,chr1,631294,631361
4,chr1,632231,632378
...,...,...,...
24693,chrUn_KI270438v1,109820,110002
24694,chrUn_KI270744v1,6128,6215
24695,chrUn_KI270744v1,105258,105259
24696,chrUn_KI270744v1,112243,112244


# Create count matrix H5 file

## Extract the useful information

In [69]:
# Extract the barcodes (patient IDs), features (peaks), and count matrix from the CSV file.

# Barcodes
# Select the per-sample columns by name rather than by position, so the result
# stays the same even if `data` also carries the 'adjusted_coordinate' column
# (which happens when the adjusted frame shares storage with `data`).
non_sample_columns = ['Region', 'adjusted_coordinate']
sample_mask = ~data.columns.isin(non_sample_columns)

# Barcodes
barcodes = data.columns[sample_mask]
barcodes_list = list(barcodes.astype(str))

# Features
features = df_peak_position
features_list = list(features.astype(str))
num_features = len(features_list)
# Fixed-width byte strings (up to 20 bytes), one entry per feature.
feature_types_data = np.array(['Peaks'] * len(features_list), dtype='S20')
genome_data = np.array(['NA'] * len(features_list), dtype='S20')

# Count matrix
data_matrix = data.loc[:, sample_mask].values
data_matrix_transposed = np.transpose(data_matrix)  # Barcodes (patients) in rows and peaks in columns

# Guard against a mismatch between the labels and the matrix they describe.
assert data_matrix_transposed.shape == (len(barcodes_list), num_features), \
    "barcode/feature labels do not match the matrix dimensions"

print("Number of barcodes (patients): ", data_matrix_transposed.shape[0])
print("Number of peaks: ", data_matrix_transposed.shape[1])

Number of barcodes (patients):  98
Number of peaks:  24698


## Create the h5 file

In [70]:
def create_count_matrix_h5(barcodes_list, data_matrix_transposed, features_list, feature_types_data, genome_data, output_file):
    """
    Creates an HDF5 file with a '/matrix' group holding a sparse count matrix.

    The input matrix has barcodes in rows and features in columns. It is
    stored through its CSR arrays (data, indices, indptr) while the 'shape'
    dataset is written as (n_features, n_barcodes). The CSR arrays of a
    (barcodes x features) matrix are exactly the CSC arrays of its
    (features x barcodes) transpose, so the file describes a column-compressed
    feature-by-barcode matrix.
    NOTE(review): this presumably targets the 10x Genomics feature-barcode
    HDF5 layout; confirm against the reader that consumes the file.

    Args:
    - barcodes_list (list): List of barcodes (patient), one per matrix row.
    - data_matrix_transposed: Matrix (dense or sparse) with barcodes in rows
      and features in columns.
    - features_list (list): List of feature (peaks position), one per column.
    - feature_types_data: Type of each feature, one per column.
    - genome_data: Genome of each feature, one per column.
    - output_file (str): Output HDF5 file name. An existing file is overwritten.

    Raises:
    - ValueError: If the number of barcodes or the length of any per-feature
      input does not match the matrix dimensions. Nothing is written then.
    """
    # Convert and validate BEFORE opening the file in 'w' mode, so that bad
    # input can neither truncate an existing file nor leave a half-written one.
    data_csr = sparse.csr_matrix(data_matrix_transposed)
    num_barcodes, num_features = data_csr.shape

    if len(barcodes_list) != num_barcodes:
        raise ValueError(
            f"barcodes_list has {len(barcodes_list)} entries but the matrix has {num_barcodes} rows"
        )
    per_feature_inputs = (
        ('features_list', features_list),
        ('feature_types_data', feature_types_data),
        ('genome_data', genome_data),
    )
    for label, values in per_feature_inputs:
        if len(values) != num_features:
            raise ValueError(
                f"{label} has {len(values)} entries but the matrix has {num_features} columns"
            )

    with h5py.File(output_file, 'w') as f:
        # Create the /matrix group
        grp_matrix = f.create_group('matrix')

        # Write the barcodes into the /matrix/barcodes dataset
        grp_matrix.create_dataset('barcodes', data=barcodes_list)

        # Write the non-zero values into the /matrix/data dataset
        grp_matrix.create_dataset('data', data=data_csr.data)

        # Write the shape with the dimensions swapped: (features, barcodes)
        grp_matrix.create_dataset('shape', data=(num_features, num_barcodes))

        # Write the indices into the /matrix/indices dataset
        grp_matrix.create_dataset('indices', data=data_csr.indices)

        # Write the pointers into the /matrix/indptr dataset
        grp_matrix.create_dataset('indptr', data=data_csr.indptr)

        # Create the /matrix/features group
        grp_features = grp_matrix.create_group('features')

        # Write information about the features. The peak coordinate string is
        # reused for every identifier-like dataset.
        # NOTE(review): '_all_tag_keys' also receives the full feature list;
        # verify whether downstream readers expect tag key names there instead.
        grp_features.create_dataset('_all_tag_keys', data=features_list)
        grp_features.create_dataset('feature_type', data=feature_types_data)
        grp_features.create_dataset('genome', data=genome_data)
        grp_features.create_dataset('id', data=features_list)
        grp_features.create_dataset('interval', data=features_list)
        grp_features.create_dataset('name', data=features_list)

    # Report success only once the file has been closed.
    print(f"Successfully created {output_file} file")


In [71]:
# `data_path` already ends with '/', so strip it before joining; otherwise the
# path contains a doubled separator ('.../promotor//count_matrix_promotor.h5').
output = f"{data_path.rstrip('/')}/count_matrix_{type_data}.h5"
create_count_matrix_h5(barcodes_list, data_matrix_transposed, features_list, feature_types_data, genome_data, output)


Successfully created ../../../results_scbasset/promotor//count_matrix_promotor.h5 file


In [72]:
def Visualize_h5_file(file_name, group):
    """
    Print the contents of one group of an HDF5 file.

    The top-level keys of the file are printed first. Then, for every member
    of `group`: a subgroup has its keys listed, a dataset has its shape and
    its full content printed.

    Parameters:
    - file_name (str): The name of the HDF5 file (opened read-only).
    - group (str): The name of the group within the HDF5 file.

    Returns:
    - None. Everything is printed.
    """
    with h5py.File(file_name, 'r') as f:
        # Retrieve the keys of the HDF5 file
        keys = list(f.keys())
        print(f"Keys available in the HDF5 file: {keys}\n")

        # Access the specified group
        group_data = f[group]

        # Iterate over each member of the group
        for member in group_data.keys():
            # Look the member up once instead of on every access.
            item = group_data[member]

            if isinstance(item, h5py.Group):
                # If it's a group, display its keys
                print(f"Keys of subgroup '{member}':", list(item.keys()))
            else:
                # Otherwise, display the dataset's shape and value.
                dimensions = item.shape
                print(f"Dimension: {dimensions}")
                # `[()]` reads the whole dataset whatever its rank; `[:]`
                # raises on scalar (shape ()) datasets.
                print(f"Value of dataset '{member}':", item[()])

In [73]:
# Inspect the '/matrix' group of the H5 file written above.
Visualize_h5_file(file_name=output, group='matrix')

Keys available in the HDF5 file: ['matrix']

Dimension: (98,)
Value of dataset 'barcodes': [b's15t1p2sq1' b's15t2p2sq1' b's15t3p2sq1' b's15t4p2sq1' b's15t5p2sq1'
 b's15t6p2sq1' b's15t7p2sq1' b's15t8p2sq1' b's1t1p1sq1' b's1t2p1sq1'
 b's1t3p1sq1' b's1t4p1sq1' b's1t5p2sq1' b's1t6p2sq1' b's1t7p2sq1'
 b's1t8p2sq1' b's7t1p1sq1' b's7t2p1sq1' b's7t3p1sq1' b's7t4p1sq1'
 b's7t5p1sq1' b's7t6p1sq1' b's7t7p1sq1' b's7t8p1sq1' b's2t1p3sq2'
 b's2t2p3sq2' b's2t3p3sq2' b's2t4p3sq2' b's2t5p3sq2' b's2t6p3sq2'
 b's2t7p3sq2' b's2t8p3sq2' b's3t1p3sq2' b's3t2p3sq2' b's3t3p3sq2'
 b's3t4p3sq2' b's3t5p4sq2' b's3t6p4sq2' b's3t7p4sq2' b's3t8p4sq2'
 b's4t1p4sq2' b's4t2p4sq2' b's4t3p4sq2' b's4t4p4sq2' b's4t5p4sq2'
 b's4t6p4sq2' b's4t7p4sq2' b's4t8p4sq2' b's5t1p5sq3' b's5t2p5sq3'
 b's5t3p5sq3' b's5t4p5sq3' b's5t5p5sq3' b's5t6p5sq3' b's5t7p5sq3'
 b's5t8p5sq3' b's6t1p5sq3' b's6t2p5sq3' b's6t3p5sq3' b's6t4p5sq3'
 b's6t5p6sq3' b's6t6p6sq3' b's6t7p6sq3' b's6t8p6sq3' b's9t1p6sq3'
 b's9t2p6sq3' b's9t3p6sq3' b's9t4p6sq3' b's

## Visualize the AnnData file

In [46]:
# TODO: find out where the NB_GENES values come from.
#
# Layout of the AnnData file:
#   obs: cells
#   var: peaks
#   X:   information about open chromatin (to be confirmed)
#
# NOTE(review): 'atac_ad.h5ad' is resolved relative to the notebook's working
# directory, and the recorded run of this cell ended with FileNotFoundError.
# Confirm where the file is expected to live.
Visualize_h5_file('atac_ad.h5ad', 'obs')

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'atac_ad.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)