# NonBDNAFinder - Single Cell Analysis

**Quick Start:** Run this single cell to analyze DNA sequences from file(s) and generate results.

### Instructions:
1. **Modify input:** Set `file_patterns` to your FASTA file(s) - can be a single file, list of files, or glob pattern
2. **Run cell:** Execute this cell (Shift+Enter)
3. **Check output:** Results are saved to two CSV files and one PDF, each named with the prefix set in `output_prefix`

### Input Options:
- Single file: `file_patterns = 'genome.fasta'`
- Multiple files: `file_patterns = ['file1.fasta', 'file2.fasta']`
- Glob pattern: `file_patterns = 'data/*.fasta'` or `'data/**/*.fa'`

### Output:
- `{output_prefix}_results.csv` - All detected motifs
- `{output_prefix}_summary.csv` - Summary statistics by motif class
- `{output_prefix}_visualization.pdf` - Visual plots and charts

In [None]:
# ═══════════════════════════════════════════════════════════════════════════════
# CONFIGURATION - Edit these settings
# ═══════════════════════════════════════════════════════════════════════════════

# Input: a single file path, a list of paths, or a glob pattern
# (e.g., '*.fasta', 'data/**/*.fa'). Passed to resolve_files() below.
file_patterns = 'example.fasta'  # EDIT THIS

# Prefix for the output files: '<prefix>_results.csv', '<prefix>_summary.csv'
# and '<prefix>_visualization.pdf'.
output_prefix = 'nonbdna'  # EDIT THIS (optional)

# ═══════════════════════════════════════════════════════════════════════════════
# ANALYSIS CODE - Run as-is
# ═══════════════════════════════════════════════════════════════════════════════

import sys, os, glob, pandas as pd
from pathlib import Path

# Add repository to path for local execution
repo_root = Path.cwd()
if str(repo_root) not in sys.path:
    sys.path.insert(0, str(repo_root))

# Import NonBDNAFinder
from Utilities.nonbscanner import analyze_sequence
from Utilities.utilities import parse_fasta, plot_motif_distribution, plot_nested_pie_chart, plot_length_distribution, plot_score_distribution
import matplotlib
matplotlib.use('Agg')  # Use non-interactive backend
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Resolve file patterns to list of files
def resolve_files(patterns):
    """Expand file pattern(s) into a de-duplicated list of existing files.

    Args:
        patterns: A single path or glob pattern (``str`` or ``os.PathLike``),
            or an iterable of them. Entries containing ``*`` or ``?`` are
            expanded with ``glob.glob(..., recursive=True)``; every other
            entry is taken as a literal path.

    Returns:
        list[str]: Paths that exist as regular files, in the order the
        patterns were given. Matches of a single glob are sorted so the
        result does not depend on directory listing order. A file reached
        through several patterns appears once, under its first spelling.
    """
    if isinstance(patterns, (str, os.PathLike)):
        patterns = [patterns]

    # Keyed by normalised path so overlapping patterns (for example
    # ['a.fasta', '*.fasta']) cannot make a file be analysed twice.
    resolved = {}
    for pattern in patterns:
        pattern = os.fspath(pattern)
        if '*' in pattern or '?' in pattern:
            candidates = sorted(glob.glob(pattern, recursive=True))
        else:
            candidates = [pattern]
        for candidate in candidates:
            if os.path.isfile(candidate):
                resolved.setdefault(os.path.normpath(candidate), candidate)
    return list(resolved.values())

def _save_pdf_page(pdf, build_figure, label):
    """Build one figure, append it to *pdf*, and report success.

    Args:
        pdf: Open ``PdfPages`` object to append the page to.
        build_figure: Zero-argument callable returning the figure to save.
        label: Human-readable page name used in the "Skipped" message.

    Returns:
        bool: True if the page was saved; False if building or saving raised
        (the error is printed and swallowed so remaining pages still render).
    """
    try:
        fig = build_figure()
        try:
            pdf.savefig(fig, bbox_inches='tight')
        finally:
            # Close even when savefig fails so the figure does not stay
            # registered with pyplot.
            plt.close(fig)
    except Exception as e:
        print(f"  ‚ö†Ô∏è  Skipped {label}: {e}")
        return False
    return True


def _build_summary_table_figure(motifs, class_counts):
    """Return a figure holding a per-class count / mean length / mean score table.

    Args:
        motifs: List of motif dicts; 'Class', 'Length' and 'Score' are read
            with ``dict.get`` (missing 'Length'/'Score' count as 0).
        class_counts: Mapping-like object with ``.items()`` yielding
            ``(class_name, count)`` pairs; one table row is emitted per pair.

    Returns:
        The matplotlib figure containing the table.
    """
    fig, ax = plt.subplots(figsize=(11, 8))
    try:
        ax.axis('tight')
        ax.axis('off')

        # One row per motif class
        table_data = []
        for motif_class, count in class_counts.items():
            class_motifs = [m for m in motifs if m.get('Class') == motif_class]
            avg_len = sum(m.get('Length', 0) for m in class_motifs) / len(class_motifs)
            avg_score = sum(m.get('Score', 0) for m in class_motifs) / len(class_motifs)
            table_data.append([motif_class, count, f"{avg_len:.1f}", f"{avg_score:.3f}"])

        table = ax.table(cellText=table_data,
                         colLabels=['Motif Class', 'Count', 'Avg Length (bp)', 'Avg Score'],
                         cellLoc='left', loc='center',
                         colWidths=[0.4, 0.2, 0.2, 0.2])
        table.auto_set_font_size(False)
        table.set_fontsize(10)
        table.scale(1, 2)

        # Style the header row (row 0) of the four columns
        for col in range(4):
            header_cell = table[(0, col)]
            header_cell.set_facecolor('#4472C4')
            header_cell.set_text_props(weight='bold', color='white')

        ax.set_title('Summary Statistics by Motif Class',
                     fontsize=14, fontweight='bold', pad=20)
    except Exception:
        # The caller never receives the figure on failure, so close it here.
        plt.close(fig)
        raise
    return fig


# Get files to process
input_files = resolve_files(file_patterns)
if not input_files:
    print(f"‚ùå No files found matching pattern: {file_patterns}")
    print("\nPlease check:")
    print("  1. File path is correct")
    print("  2. File exists in the specified location")
    print("  3. Glob pattern syntax is correct (e.g., '*.fasta', 'data/**/*.fa')")
else:
    print(f"üìÅ Found {len(input_files)} file(s) to process")
    for f in input_files:
        print(f"   ‚Ä¢ {f}")
    print()

    # Process all files, collecting motif dicts from every sequence
    all_motifs = []
    for i, fasta_file in enumerate(input_files, 1):
        print(f"\n{'='*80}")
        print(f"Processing file {i}/{len(input_files)}: {fasta_file}")
        print('='*80)

        # Deliberately broad: a failure in one file is reported and the run
        # moves on to the next file instead of aborting the whole cell.
        try:
            # Read FASTA file
            with open(fasta_file, 'r') as f:
                fasta_content = f.read()

            # Parse sequences
            sequences = parse_fasta(fasta_content)
            print(f"üìä Found {len(sequences)} sequence(s) in file")

            # Analyze each sequence
            for seq_name, seq in sequences.items():
                print(f"\n  üî¨ Analyzing: {seq_name} ({len(seq):,} bp)")

                # Run analysis
                motifs = analyze_sequence(seq, seq_name)

                # Tag every motif with the file it came from
                source_name = os.path.basename(fasta_file)
                for motif in motifs:
                    motif['Source_File'] = source_name

                all_motifs.extend(motifs)
                print(f"  ‚úì Detected {len(motifs)} motifs")

        except Exception as e:
            print(f"  ‚ùå Error processing {fasta_file}: {e}")

    # Generate results
    if all_motifs:
        print(f"\n\n{'='*80}")
        print("üìä ANALYSIS COMPLETE")
        print('='*80)

        # Convert to DataFrame
        df = pd.DataFrame(all_motifs)

        # Summary statistics
        print(f"\n‚úì Total motifs detected: {len(df):,}")
        print(f"‚úì Motif classes found: {df['Class'].nunique()}")
        print("\nMotifs by class:")
        class_counts = df['Class'].value_counts()
        for motif_class, count in class_counts.items():
            print(f"  ‚Ä¢ {motif_class}: {count:,}")

        # Save detailed results
        results_file = f"{output_prefix}_results.csv"
        df.to_csv(results_file, index=False)
        print(f"\nüíæ Detailed results saved to: {results_file}")

        # Save summary by class
        summary = df.groupby('Class').agg({
            'Start': 'count',
            'Length': ['mean', 'min', 'max'],
            'Score': ['mean', 'min', 'max']
        }).round(2)
        summary.columns = ['Count', 'Avg_Length', 'Min_Length', 'Max_Length', 'Avg_Score', 'Min_Score', 'Max_Score']
        summary_file = f"{output_prefix}_summary.csv"
        summary.to_csv(summary_file)
        print(f"üíæ Summary statistics saved to: {summary_file}")

        # Generate visualizations as PDF
        print("\nüìä Generating visualizations...")
        visualization_file = f"{output_prefix}_visualization.pdf"

        # PDF pages in output order: (label for the skip message, figure builder)
        page_builders = [
            ('class distribution plot',
             lambda: plot_motif_distribution(all_motifs, by='Class',
                                             title='Motif Class Distribution')),
            ('nested pie chart',
             lambda: plot_nested_pie_chart(all_motifs)),
            ('length distribution plot',
             lambda: plot_length_distribution(all_motifs, by_class=True)),
            ('score distribution plot',
             lambda: plot_score_distribution(all_motifs, by_class=True)),
            ('summary table',
             lambda: _build_summary_table_figure(all_motifs, class_counts)),
        ]

        try:
            successful_plots = 0
            with PdfPages(visualization_file) as pdf:
                for page_label, build_page in page_builders:
                    if _save_pdf_page(pdf, build_page, page_label):
                        successful_plots += 1

                # If no plots succeeded, add a message page
                if successful_plots == 0:
                    fig_msg, ax_msg = plt.subplots(figsize=(8, 6))
                    try:
                        ax_msg.axis('off')
                        ax_msg.text(0.5, 0.5,
                                    'No visualizations could be generated.\n\nPlease check the CSV output files for results.',
                                    ha='center', va='center', fontsize=14,
                                    bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
                        pdf.savefig(fig_msg, bbox_inches='tight')
                    finally:
                        plt.close(fig_msg)

                # Add metadata
                d = pdf.infodict()
                d['Title'] = f'NonBDNAFinder Analysis Results - {output_prefix}'
                d['Author'] = 'NonBDNAFinder'
                d['Subject'] = 'Non-B DNA Motif Analysis'
                d['Keywords'] = 'Non-B DNA, Motif Detection, Genomics'

            if successful_plots > 0:
                print(f"üìä Visualizations saved to: {visualization_file} ({successful_plots} plots)")
            else:
                print("‚ö†Ô∏è  Visualization file created but no plots could be generated")
        except Exception as e:
            # The CSV outputs above are already on disk; a PDF failure is
            # reported as a warning rather than failing the cell.
            print(f"‚ö†Ô∏è  Warning: Could not generate visualizations: {e}")
            print("  CSV files were saved successfully.")

        print("\n‚ú® Analysis complete! Check the output files for results.")
    else:
        print("\n‚ö†Ô∏è  No motifs detected in any of the analyzed sequences.")
        print("This could mean:")
        print("  ‚Ä¢ Sequences are too short")
        print("  ‚Ä¢ Sequences lack Non-B DNA forming patterns")
        print("  ‚Ä¢ Input files were empty or invalid")