In [1]:
import csv
import random
import string
import os
import time # Import time for progress updates

def generate_random_string(length=8):
  """Generates a random string of fixed length."""
  letters = string.ascii_lowercase
  return ''.join(random.choice(letters) for i in range(length))

def generate_csv_by_size(filename, target_size_gb):
  """
  Generates a CSV file aiming for a target size in GB,
  matching the schema of the provided 'cells_top_1000.csv'.

  Args:
      filename (str): The name of the CSV file to create.
      target_size_gb (float): The target file size in Gigabytes (GB).
  """
  # Define column headers based on the provided file
  headers = [
      'area', 'eccentricity', 'major_axis_length', 'minor_axis_length', 'perimeter', 'aSMA', 'CCR2', 'CCR6', 'CD107a',
      'CD10', 'CD114', 'CD115', 'CD14', 'CD15', 'CD161', 'CD16', 'CD172a', 'CD31', 'CD3', 'CD45', 'CD45RO', 'CD4',
      'CD56', 'CD68', 'CD71', 'CD74', 'CD8a', 'Collagen1', 'DAP12', 'DNA1', 'DNA3', 'EpCAM', 'GranzymeB', 'HLADR',
      'IFNb', 'IFNg', 'Ki67', 'PAI1', 'PanK', 'PF4', 'RAGE', 'Va7-2', 'original_clusters', 'sample_name', 'ROI',
      'condition', 'sample_id', 'UMAP_1', 'UMAP_2', 'UMAP_3', 'x', 'y', 'cellID', 'broad_celltypes', 'annotations',
      'global_annotations', 'global_anno_actual', 'disease_state', 'myeloid_clusters', 'myeloid_UMAP1',
      'myeloid_UMAP2', 'myeloid_UMAP3', 'myeloid_annotations', 'structural_clusters', 'structural_UMAP1',
      'structural_UMAP2', 'structural_UMAP3', 'structural_annotations', 'lymphocyte_clusters',
      'lymphocyte_UMAP1', 'lymphocyte_UMAP2', 'lymphocyte_UMAP3', 'lymphocyte_annotations'
  ]

  # Define approximate data types and generation logic
  data_generators = {
      'area': lambda: random.randint(10, 1000),
      'eccentricity': lambda: random.uniform(0.1, 1.0),
      'major_axis_length': lambda: random.uniform(5, 25),
      'minor_axis_length': lambda: random.uniform(2, 15),
      'perimeter': lambda: random.uniform(10, 50),
      'x': lambda: random.uniform(0, 2000),
      'y': lambda: random.uniform(0, 2000),
      'DNA1': lambda: random.uniform(1.5, 4.5),
      'DNA3': lambda: random.uniform(2.0, 5.0),
      'UMAP_1': lambda: random.uniform(-10, 10),
      'UMAP_2': lambda: random.uniform(-10, 10),
      'UMAP_3': lambda: random.uniform(-10, 10),
      'myeloid_UMAP1': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '', # Handle potential NAs
      'myeloid_UMAP2': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'myeloid_UMAP3': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'structural_UMAP1': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'structural_UMAP2': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'structural_UMAP3': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'lymphocyte_UMAP1': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'lymphocyte_UMAP2': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      'lymphocyte_UMAP3': lambda: random.uniform(-10, 10) if random.random() > 0.1 else '',
      # String columns
      'original_clusters': lambda: f"cl{random.randint(1, 25):02d}",
      'sample_name': lambda: f"SAMPLE_{random.randint(1, 10)}",
      'ROI': lambda: f"ROI_{random.randint(1, 5)}",
      'condition': lambda: random.choice(['COVID', 'Control']),
      'sample_id': lambda: f"SAMPLE_{random.randint(1, 10)}_ROI_{random.randint(1, 5)}",
      'cellID': lambda: f"SAMPLE_{random.randint(1, 10)}_ROI_{random.randint(1, 5)}_CELL_{random.randint(0, 999999)}", # Wider range for cell ID
      'broad_celltypes': lambda: random.choice(['myeloid', 'structural', 'lymphocyte', 'ND']),
      'annotations': lambda: random.choice(['Mac3', 'Myofibroblast', 'UD proliferating', 'Neut CD8 ADJ', 'NA', 'Endothelial cells', 'Fibroblast']),
      'global_annotations': lambda: random.choice(['Myeloid', 'Structural', 'Lymphoid', 'Unidentified']),
      'global_anno_actual': lambda: random.choice(['Myeloid', 'Structural', 'Lymphoid', 'Unidentified', 'Myeloid/Lymphoid', 'Structural/Myeloid']),
      'disease_state': lambda: random.choice(['DAD', 'Control']),
      'myeloid_clusters': lambda: f"cl{random.randint(1, 10):02d}" if random.random() > 0.1 else 'ND',
      'myeloid_annotations': lambda: random.choice(['Mac3', 'Neut_CD8_ADJ', 'UD', 'Mono PAI-1 ADJ2', 'CD15lo iNeut', 'CD15mid iNeut', 'Mono3', 'Mono1', 'Mono2']) if random.random() > 0.1 else 'NA',
      'structural_clusters': lambda: f"cl{random.randint(1, 10):02d}" if random.random() > 0.1 else 'ND',
      'structural_annotations': lambda: random.choice(['Myofibroblast', 'UD proliferating', 'Endothelial cells', 'Bronchial epit', 'Blood Vessels', 'UD structural SM', 'Mono PAI-1 ADJ', 'Fibroblast', 'UD RAGElo cells', 'Prolif fibroblast', 'UD RAGEmid cells', 'Prolif alveolar epit', 'HLADRlo bronchial epit', 'HLADRhi bronchial epit']) if random.random() > 0.1 else 'NA',
      'lymphocyte_clusters': lambda: f"cl{random.randint(1, 10):02d}" if random.random() > 0.1 else 'ND',
      'lymphocyte_annotations': lambda: random.choice(['Va7.2lo cells', 'IFNgneg CD4 T cells', 'CD8 Neut Endothelial ADJ', 'CD107neg CD8', 'CCR6lo UD', 'PAI-1mid UD', 'IFNglo MAIT cells', 'CCR6mid UD', 'Megakaryocytes', 'CD107pos CD4 T cells']) if random.random() > 0.1 else 'NA',
      # Default for other numeric columns (assuming float between 0 and 2, can be adjusted)
      'default_numeric': lambda: random.uniform(0, 2)
  }

  target_size_bytes = target_size_gb * 1024 * 1024 * 1024
  current_size_bytes = 0
  row_count = 0
  check_size_interval = 5000  # Check file size every N rows to reduce overhead
  start_time = time.time()

  print(f"Generating CSV file: {filename} aiming for ~{target_size_gb:.2f} GB...")

  try:
      with open(filename, 'w', newline='') as csvfile:
          writer = csv.writer(csvfile)
          writer.writerow(headers) # Write header row

          while True: # Loop indefinitely until size target is reached
              row = []
              for header in headers:
                  if header in data_generators:
                      row.append(data_generators[header]())
                  # Check if it looks like a UMAP coordinate or similar float column
                  elif 'UMAP' in header or header in ['eccentricity', 'major_axis_length', 'minor_axis_length', 'aSMA', 'CCR2', 'CCR6', 'CD107a', 'CD10', 'CD114', 'CD115', 'CD14', 'CD15', 'CD161', 'CD16', 'CD172a', 'CD31', 'CD3', 'CD45', 'CD45RO', 'CD4', 'CD56', 'CD68', 'CD71', 'CD74', 'CD8a', 'Collagen1', 'DAP12', 'EpCAM', 'GranzymeB', 'HLADR', 'IFNb', 'IFNg', 'Ki67', 'PAI1', 'PanK', 'PF4', 'RAGE', 'Va7-2']:
                     row.append(f"{data_generators['default_numeric']():.8f}") # Format floats
                  else:
                     # Default for any other columns assumed numeric for simplicity
                     row.append(f"{data_generators['default_numeric']():.8f}")
              writer.writerow(row)
              row_count += 1

              # Check size periodically
              if row_count % check_size_interval == 0:
                  current_size_bytes = os.path.getsize(filename)
                  elapsed_time = time.time() - start_time
                  progress = (current_size_bytes / target_size_bytes) * 100
                  print(f"\rGenerated {row_count} rows... Current size: {current_size_bytes / (1024*1024):.2f} MB ({progress:.2f}%) | Time: {elapsed_time:.1f}s", end="")

                  if current_size_bytes >= target_size_bytes:
                      print(f"\nReached target size of {target_size_gb:.2f} GB after {row_count} rows.")
                      break

  except KeyboardInterrupt:
      print("\nGeneration interrupted by user.")
  except Exception as e:
      print(f"\nAn error occurred: {e}")

  print(f"\nSuccessfully generated {filename}")
  final_size_bytes = os.path.getsize(filename)
  final_size_gb = final_size_bytes / (1024 * 1024 * 1024)
  print(f"Final file size: {final_size_gb:.3f} GB ({final_size_bytes / (1024*1024):.2f} MB)")
  print(f"Total rows generated: {row_count}")

# # --- How to use ---
# # 1. Set the desired output filename
# output_filename = 'generated_cells_data_large.csv'

# # 2. Set the desired target file size in Gigabytes (GB)
# #    Examples:
# #    target_gb_size = 1  # For 1 GB
# #    target_gb_size = 10 # For 10 GB
# #    target_gb_size = 0.5 # For 500 MB
# target_gb_size = 1 # Example: Generate a file of approximately 1 GB

# # Generate the CSV
# generate_csv_by_size(output_filename, target_gb_size)

# Example usage
if __name__ == "__main__":
    generate_csv_by_size('generated_cells_data_large_1gb.csv', 1)  # Generates a file of approximately 1 GB

Generating CSV file: generated_cells_data_large_1gb.csv aiming for ~1.00 GB...
Generated 1170000 rows... Current size: 1024.95 MB (100.09%) | Time: 83.8s
Reached target size of 1.00 GB after 1170000 rows.

Successfully generated generated_cells_data_large_1gb.csv
Final file size: 1.001 GB (1024.96 MB)
Total rows generated: 1170000
