In [3]:
import os
import numpy
import polars as pl
import matplotlib.pyplot as plt
import seaborn
import glob
import pandas as pd
from typing import Dict, List


In [None]:
from os import sep

def load_files_from_directory(
    directory_path: str,
    file_pattern: str
) -> Dict[str, pl.DataFrame]:
    """
    Load every tab-separated file in a directory whose name matches a glob pattern.

    Each matching regular file is read with Polars and stored under its base name
    with the last extension removed (e.g. 'a.phased.snpden' is stored as 'a.phased').
    Files are processed in sorted path order so that the dictionary's insertion order
    is the same on every run. If two matches share the same key, the one that sorts
    later replaces the earlier one.

    Args:
    - directory_path (str): The directory to scan for files.
    - file_pattern (str): The glob pattern to match files (e.g., '*_10000_*.phased.snpden').

    Returns:
    - Dict[str, pl.DataFrame]: Base file names mapped to Polars DataFrames. The dictionary
      is empty when nothing matches. Files that cannot be read (for example empty files)
      are reported on stdout and left out.
    """
    loaded_files_dict: Dict[str, pl.DataFrame] = {}

    # glob returns matches in arbitrary, filesystem-dependent order; sorting keeps the
    # insertion order (and everything downstream that iterates this dict) reproducible.
    files: List[str] = sorted(glob.glob(os.path.join(directory_path, file_pattern)))

    if not files:
        print(f"No files matching the pattern '{file_pattern}' were found in the directory '{directory_path}'.")
        return loaded_files_dict

    for file in files:
        # The pattern can also match directories; only regular files are read.
        if not os.path.isfile(file):
            continue

        # Key: file name without its directory and without its last extension.
        file_basename: str = os.path.splitext(os.path.basename(file))[0]

        try:
            loaded_files_dict[file_basename] = pl.read_csv(file, separator='\t')
        except Exception as e:
            # Best effort: one unreadable file must not abort loading the others.
            print(f"Error reading {file}: {e}")

    return loaded_files_dict

# Example usage: load every file in the current directory that matches the pattern.
pattern: str = "*_10000_*.phased.snpden"  # glob pattern selecting the input files
file_directory: str = "."  # directory that is scanned for matches

# The result maps each file's base name to the DataFrame read from it.
files_data: Dict[str, pl.DataFrame] = load_files_from_directory(
    directory_path=file_directory,
    file_pattern=pattern,
)


Error reading ./snp_density_10000_sorted_chr_10.phased.snpden: empty CSV


In [None]:
def merge_dataframes_from_dict(
    dataframes: Dict[str, pl.DataFrame],
    first_file_key: str
) -> pl.DataFrame:
    """
    Stack the DataFrames of a dictionary vertically into a single DataFrame.

    The DataFrame stored under first_file_key comes first and supplies the column
    names of the result; the remaining DataFrames follow in dictionary order. The
    input DataFrames are not modified.

    Args:
    - dataframes: A dictionary where the keys are file names and the values are DataFrames.
    - first_file_key: The key for the first DataFrame to use as the base for column names.

    Returns:
    - A single DataFrame containing the rows of all DataFrames, with the columns of the
      first DataFrame.

    Raises:
    - KeyError: If first_file_key is not a key of dataframes.
    - Any error raised by pl.concat when the DataFrames cannot be stacked vertically
      (e.g. their columns do not match those of the first DataFrame).
    """
    if first_file_key not in dataframes:
        raise KeyError(
            f"first_file_key '{first_file_key}' was not found among the loaded DataFrames."
        )

    # Base frame first, then every other frame in the dictionary's iteration order.
    ordered_frames = [dataframes[first_file_key]]
    ordered_frames.extend(
        df for key, df in dataframes.items() if key != first_file_key
    )

    # Rows are appended rather than joined: an inner join on "CHROM" would pair every
    # row with every row of the other frame that shares its CHROM value and drop all
    # rows whose CHROM value is missing from any frame, instead of keeping all rows.
    return pl.concat(ordered_frames, how="vertical")

shape: (100_943, 4)
┌───────┬───────────┬───────────┬─────────────┐
│ CHROM ┆ BIN_START ┆ SNP_COUNT ┆ VARIANTS/KB │
│ ---   ┆ ---       ┆ ---       ┆ ---         │
│ str   ┆ i64       ┆ i64       ┆ f64         │
╞═══════╪═══════════╪═══════════╪═════════════╡
│ chr_1 ┆ 0         ┆ 101       ┆ 10.1        │
│ chr_1 ┆ 10000     ┆ 468       ┆ 46.8        │
│ chr_1 ┆ 20000     ┆ 330       ┆ 33.0        │
│ chr_1 ┆ 30000     ┆ 281       ┆ 28.1        │
│ chr_1 ┆ 40000     ┆ 324       ┆ 32.4        │
│ …     ┆ …         ┆ …         ┆ …           │
│ chr_Z ┆ 88530000  ┆ 298       ┆ 29.8        │
│ chr_Z ┆ 88540000  ┆ 253       ┆ 25.3        │
│ chr_Z ┆ 88550000  ┆ 216       ┆ 21.6        │
│ chr_Z ┆ 88560000  ┆ 174       ┆ 17.4        │
│ chr_Z ┆ 88570000  ┆ 174       ┆ 17.4        │
└───────┴───────────┴───────────┴─────────────┘


In [None]:
from cProfile import label


def plot_chromosome_density(dataFrame: pl.DataFrame) -> None:
    """
    Draw one SNP-density line plot per chromosome, stacked in a single figure.

    The frame is converted to pandas, split by the distinct values of its "CHROM"
    column (in order of first appearance), and for each value "VARIANTS/KB" is
    plotted against "BIN_START" on its own subplot. The figure is shown with
    plt.show(); nothing is returned.

    Args:
    - dataFrame: Polars DataFrame with the columns "CHROM", "BIN_START" and "VARIANTS/KB".

    Raises:
    - ValueError: If dataFrame has no rows, so there is no chromosome to plot.
    """
    pandas_dataFrame: pd.DataFrame = dataFrame.to_pandas()

    chromosomes = pandas_dataFrame["CHROM"].unique()
    if len(chromosomes) == 0:
        # plt.subplots cannot create a grid with zero rows; fail with a clear message.
        raise ValueError("dataFrame contains no rows; there are no chromosomes to plot.")

    seaborn.set_style("darkgrid")

    # squeeze=False always returns a 2-D array of Axes, so a single chromosome needs
    # no special handling; column 0 holds one Axes per chromosome.
    _, axes = plt.subplots(
        len(chromosomes), 1, figsize=(14, 7 * len(chromosomes)), squeeze=False
    )

    for axis, chromosome in zip(axes[:, 0], chromosomes):
        chromosome_data = pandas_dataFrame[pandas_dataFrame["CHROM"] == chromosome]

        seaborn.lineplot(
            data=chromosome_data,
            x="BIN_START",
            y="VARIANTS/KB",
            color="darkred",
            ax=axis,
            label=chromosome,
        )
        axis.set_title(f"Chromosome {chromosome} SNP Density", fontsize=16)
        axis.set_xlabel("Bin Start", fontsize=12)
        axis.set_ylabel("SNP Variants/Kb", fontsize=12)
        axis.legend(title="Chromosome")

    # Adjust the layout to ensure everything fits
    plt.tight_layout()

    # Display the plot
    plt.show()