In [1]:
import os
import shutil

def _count_lines(path):
    """Return the number of lines in the text file at *path*."""
    with open(path, 'r') as f:
        return sum(1 for _ in f)


def distribute_vcfs_with_largest_group(vcf_directory, files_per_group=80, num_groups=7):
    """Split the ``.vcf`` files of a directory into ``group_<i>`` sub-folders.

    The ``files_per_group`` files with the most lines go to group 0. All
    other files are ordered from fewest to most lines and dealt round-robin
    to groups 1 .. ``num_groups - 1``. Every file is *moved* into
    ``<vcf_directory>/group_<i>/``; each group folder gets a
    ``group_<i>_report.txt`` and ``<vcf_directory>/groups_summary.txt`` is
    written with per-group totals.

    The size of a file is its total line count (every line is counted, so
    any header lines are included); the reports label this number
    "Variants".

    Args:
        vcf_directory: Directory that directly contains the ``.vcf`` files.
        files_per_group: Number of largest files placed in group 0.
        num_groups: Total number of groups, including group 0.

    Returns:
        A list of ``num_groups`` lists of ``(filename, line_count)`` tuples.

    Raises:
        ValueError: If ``num_groups`` is smaller than 1, if
            ``files_per_group`` is negative, or if files remain after
            filling group 0 but there is no other group to receive them.
            All checks run before any file is moved.
    """
    if num_groups < 1:
        raise ValueError(f"num_groups must be at least 1, got {num_groups}")
    if files_per_group < 0:
        raise ValueError(f"files_per_group must not be negative, got {files_per_group}")

    # Get all VCF files and their sizes. The listing is sorted because
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # placement of files with equal line counts reproducible.
    file_sizes = []
    for vcf_file in sorted(os.listdir(vcf_directory)):
        file_path = os.path.join(vcf_directory, vcf_file)
        # Skip anything that is not a regular file (e.g. a folder named *.vcf)
        if vcf_file.endswith('.vcf') and os.path.isfile(file_path):
            file_sizes.append((vcf_file, _count_lines(file_path)))

    # Sort files by size (largest to smallest)
    file_sizes.sort(key=lambda x: x[1], reverse=True)

    # Create groups; group 0 receives the largest files
    groups = [[] for _ in range(num_groups)]
    groups[0] = file_sizes[:files_per_group]

    # Distribute remaining files (smallest first) evenly among the other groups
    remaining_files = sorted(file_sizes[files_per_group:], key=lambda x: x[1])
    if remaining_files and num_groups < 2:
        raise ValueError(
            f"{len(remaining_files)} file(s) do not fit into group 0 and "
            "num_groups=1 leaves no other group to receive them"
        )
    for i, file_info in enumerate(remaining_files):
        group_idx = (i % (num_groups - 1)) + 1  # Start from group 1 (index 1)
        groups[group_idx].append(file_info)

    # Create group folders, move files and write the per-group reports
    for i, group in enumerate(groups):
        group_dir = os.path.join(vcf_directory, f'group_{i}')
        os.makedirs(group_dir, exist_ok=True)

        report_path = os.path.join(group_dir, f'group_{i}_report.txt')
        with open(report_path, 'w') as f:
            f.write(f"Files in group {i}:\n")
            f.write("Filename\tVariants\n")
            total_variants = 0

            # Files are listed (and moved) in ascending order of line count
            for vcf_file, line_count in sorted(group, key=lambda x: x[1]):
                src = os.path.join(vcf_directory, vcf_file)
                dst = os.path.join(group_dir, vcf_file)
                shutil.move(src, dst)

                f.write(f"{vcf_file}\t{line_count}\n")
                total_variants += line_count

            # Add summary
            f.write(f"\nSummary for group {i}:\n")
            f.write(f"Total files: {len(group)}\n")
            f.write(f"Total variants: {total_variants}\n")
            avg_variants = total_variants / len(group) if group else 0
            f.write(f"Average variants per file: {avg_variants:.2f}\n")

    # Create main summary
    with open(os.path.join(vcf_directory, 'groups_summary.txt'), 'w') as f:
        f.write("Groups Summary:\n")
        f.write("==============\n")
        f.write("Group\tFiles\tTotal_Variants\tAvg_Variants\n")

        for i, group in enumerate(groups):
            total = sum(size for _, size in group)
            avg = total / len(group) if group else 0
            f.write(f"Group_{i}\t{len(group)}\t{total}\t{avg:.2f}\n")

        # Add note about special distribution (uses the actual parameter value)
        f.write(
            f"\nNote: Group_0 contains the {files_per_group} largest files. "
            "The remaining files are distributed evenly among the other groups."
        )

    print(
        f"Created {num_groups} groups. Group 0 contains the {files_per_group} "
        "largest files, the rest are distributed evenly."
    )
    return groups

# Usage: group the .vcf files found directly inside the relative directory
# 'sim2', using the default arguments (files_per_group=80, num_groups=7).
# NOTE: this call moves the files into 'sim2/group_<i>/' sub-folders on disk.
distribute_vcfs_with_largest_group('sim2')


Created 7 groups. Group 0 contains the 80 largest files, the rest are distributed evenly.


[[('PD13604a+.vcf', 93102),
  ('PD23579a+.vcf', 87311),
  ('PD23564a+.vcf', 83447),
  ('PD5937a+.vcf', 76097),
  ('PD23561a+.vcf', 58028),
  ('PD9063a+.vcf', 50045),
  ('PD4977a+.vcf', 42258),
  ('PD6405a+.vcf', 35317),
  ('PD13425a+.vcf', 34502),
  ('PD9568a+.vcf', 30395),
  ('PD24189a+.vcf', 29590),
  ('PD4072a+.vcf', 27931),
  ('PD8832a+.vcf', 26409),
  ('PD8660a2+.vcf', 26128),
  ('PD4607a+.vcf', 25953),
  ('PD24208a+.vcf', 25624),
  ('PD6412a+.vcf', 25187),
  ('PD24326a+.vcf', 23620),
  ('PD24333a+.vcf', 22884),
  ('PD6043a+.vcf', 22764),
  ('PD11465a+.vcf', 22551),
  ('PD8982a+.vcf', 21557),
  ('PD24201a+.vcf', 20478),
  ('PD24197a+.vcf', 20215),
  ('PD18045a+.vcf', 19468),
  ('PD11379a+.vcf', 19382),
  ('PD13296a+.vcf', 18582),
  ('PD7426a+.vcf', 18497),
  ('PD9576a+.vcf', 17838),
  ('PD24192a+.vcf', 17827),
  ('PD18020a+.vcf', 17772),
  ('PD11751a+.vcf', 16648),
  ('PD24327a+.vcf', 16185),
  ('PD7428a+.vcf', 15950),
  ('PD8652a2+.vcf', 15856),
  ('PD24320a+.vcf', 15695),
  ('PD