In [4]:
import pandas as pd
import os

def split_large_chromosomes(input_file, output_dir, variants_per_file=100000):
    """
    Split a tab-separated variant table into per-chromosome files.

    The input is read as an 11-column table whose first row is skipped
    (treated as a header). Rows are grouped by the 'chrom' column, sorted by
    position, and written as header-less, tab-separated files with the columns
    chr, pos, id (taken from 'Sample'), ref, alt. Each chromosome is written to
    one or more files named ``chr<chrom>_<n>.vcf`` with ``n`` starting at 1,
    each holding at most ``variants_per_file`` rows.

    Parameters
    ----------
    input_file : str
        Path to the tab-separated input table.
    output_dir : str
        Directory that receives the output files; created if it is missing.
    variants_per_file : int, default 100000
        Maximum number of rows written to a single output file.

    Raises
    ------
    ValueError
        If ``variants_per_file`` is not a positive number.
    """
    if variants_per_file <= 0:
        raise ValueError("variants_per_file must be a positive integer")

    # Read the original file
    columns = ['Project', 'Sample', 'ID', 'Genome', 'mut_type',
               'chrom', 'chrom2', 'pos_end', 'ref', 'alt', 'Type']

    # Force 'chrom' to text: with inferred dtypes a column mixing numeric and
    # non-numeric labels can hold both 1 and '1', which would form separate
    # groups that overwrite the same output file.
    df = pd.read_csv(input_file, sep='\t', names=columns, skiprows=1,
                     dtype={'chrom': str})

    # Make sure the destination exists (an empty string means the current dir)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Process each chromosome
    for chrom, group in df.groupby('chrom'):
        # Format data for VCF
        vcf_data = pd.DataFrame({
            'chr': group['chrom'],
            'pos': group['pos_end'],
            'id': group['Sample'],
            'ref': group['ref'],
            'alt': group['alt']
        }).sort_values('pos')
        n_variants = len(vcf_data)

        # Stepping through the start offsets yields exactly ceil(n / size)
        # chunks, so an exact multiple of variants_per_file no longer produces
        # an empty trailing file, and small chromosomes get a single "_1" file.
        chunk_starts = range(0, n_variants, variants_per_file)
        for file_number, start_idx in enumerate(chunk_starts, start=1):
            chunk = vcf_data.iloc[start_idx:start_idx + variants_per_file]

            # Create output filename with sequential numbering
            output_file = os.path.join(output_dir, f'chr{chrom}_{file_number}.vcf')

            # Save chunk
            chunk.to_csv(output_file, sep='\t', index=False, header=False)
            print(f"Created {output_file} with {len(chunk)} variants")

# Usage: split '560.txt' into per-chromosome files under 'by_chr',
# writing at most 100000 variants to each output file.
split_large_chromosomes('560.txt', 'by_chr', variants_per_file=100000)


Created by_chr/chr1_1.vcf with 100000 variants
Created by_chr/chr1_2.vcf with 100000 variants
Created by_chr/chr1_3.vcf with 90930 variants
Created by_chr/chr10_1.vcf with 100000 variants
Created by_chr/chr10_2.vcf with 56151 variants
Created by_chr/chr11_1.vcf with 100000 variants
Created by_chr/chr11_2.vcf with 52980 variants
Created by_chr/chr12_1.vcf with 100000 variants
Created by_chr/chr12_2.vcf with 56740 variants
Created by_chr/chr13_1.vcf with 100000 variants
Created by_chr/chr13_2.vcf with 5920 variants
Created by_chr/chr14_1.vcf with 97479 variants
Created by_chr/chr15_1.vcf with 85759 variants
Created by_chr/chr16_1.vcf with 92524 variants
Created by_chr/chr17_1.vcf with 89496 variants
Created by_chr/chr18_1.vcf with 90650 variants
Created by_chr/chr19_1.vcf with 67265 variants
Created by_chr/chr2_1.vcf with 100000 variants
Created by_chr/chr2_2.vcf with 100000 variants
Created by_chr/chr2_3.vcf with 87040 variants
Created by_chr/chr20_1.vcf with 81035 variants
Created by_c

In [5]:
import os

def create_vcf_report(directory):
    """
    Write a per-file line-count report for the .vcf files in a directory.

    Every regular file in ``directory`` whose name ends with '.vcf' is opened
    and its lines are counted (all lines count, including any header or blank
    lines). The report is written to ``vcf_line_counts.txt`` inside the same
    directory as a tab-separated table (file name without the '.vcf'
    extension, line count), followed by a blank line and the grand total.

    Parameters
    ----------
    directory : str
        Directory to scan; it also receives the report file.

    Returns
    -------
    str
        Path of the report file that was written.
    """
    suffix = '.vcf'

    # Get all vcf files in directory; skip sub-directories that merely carry
    # a ".vcf" name, since open() would fail on them.
    vcf_files = sorted(  # sorted for consistent output
        name for name in os.listdir(directory)
        if name.endswith(suffix)
        and os.path.isfile(os.path.join(directory, name))
    )

    # Count lines for each vcf
    report_lines = []
    total_variants = 0
    for vcf_file in vcf_files:
        file_path = os.path.join(directory, vcf_file)
        with open(file_path, 'r') as f:
            line_count = sum(1 for _ in f)

        # Strip only the trailing extension; str.replace would also remove a
        # ".vcf" occurring in the middle of the file name.
        sample_name = vcf_file[:-len(suffix)]
        report_lines.append(f"{sample_name}\t{line_count}")
        total_variants += line_count

    # Create the report
    report_path = os.path.join(directory, 'vcf_line_counts.txt')
    with open(report_path, 'w') as f:
        # Write header
        f.write("Sample\tVariant_Count\n")

        # Write counts for each sample
        for line in report_lines:
            f.write(line + '\n')

        # Write total
        f.write(f"\nTotal variants: {total_variants}")

    print(f"Report created at: {report_path}")
    return report_path

# Usage example: count the lines of every .vcf file in 'by_chr' and write the
# summary to vcf_line_counts.txt inside that directory; the call returns the
# report path.
create_vcf_report('by_chr')


Report created at: by_chr/vcf_line_counts.txt


'by_chr/vcf_line_counts.txt'