# 🧬 NonBScanner - Comprehensive Analysis Notebook

**Complete workflow for Non-B DNA motif detection and analysis with Excel output**

---

## Features:
1. ✅ Detect all 11 Non-B DNA classes
2. ✅ Export to Excel with separate sheets for each motif class
3. ✅ Comprehensive class/subclass detection analysis
4. ✅ Identify which motifs were not predicted
5. ✅ Publication-ready visualizations

---

## 📦 Step 1: Import Required Libraries

In [None]:
# Import NonBScanner modules
import nonbscanner as nbs
from utilities import (
    export_to_excel,
    analyze_class_subclass_detection,
    print_detection_report
)

# Import standard libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Apply the seaborn grid theme, then set the default figure size and DPI
# used by figures created later in the notebook.
sns.set_style("whitegrid")
plt.rcParams.update({
    'figure.figsize': (14, 8),
    'figure.dpi': 100,
})

# Confirm the imports worked and show which scanner version is loaded.
print("‚úì Libraries imported successfully!")
print(f"NonBScanner version: {nbs.__version__}")

## 🔍 Step 2: Check Available Motif Classes

In [None]:
# Ask the scanner for its classification metadata and print an overview:
# version banner, class/subclass totals, then one entry per class.
info = nbs.get_motif_info()

heavy_rule = "=" * 80
light_rule = "-" * 80

print(heavy_rule)
print(f"NonBScanner v{info['version']} - Motif Detection System")
print(heavy_rule)
print(f"\nTotal Classes: {info['total_classes']}")
print(f"Total Subclasses: {info['total_subclasses']}")
print("\nSupported Non-B DNA Classes:")
print(light_rule)

# Each entry is read for its 'name' and its 'subclasses' collection.
for class_num, class_info in info['classification'].items():
    print(f"\n{class_num:2}. {class_info['name']}")
    subclass_names = class_info['subclasses']
    # Heading and comma-separated subclass list share one output line.
    print(f"   Subclasses ({len(subclass_names)}):", ", ".join(subclass_names))

print("\n" + heavy_rule)

## 📄 Step 3: Analyze FASTA File

In [None]:
# Specify your FASTA file (or use the example)
fasta_file = "example_motifs_multiline.fasta"

# Define `results` up front. When the file is missing, the analysis branch is
# skipped, and later cells that read `results` would otherwise fail with a
# NameError instead of simply working on an empty result set.
results = {}

# Check if file exists
if not Path(fasta_file).exists():
    print(f"‚ö†Ô∏è  File '{fasta_file}' not found!")
    print("Please provide a valid FASTA file path.")
else:
    print(f"üìÅ Analyzing file: {fasta_file}")
    print("‚è≥ Processing... (this may take a few moments)\n")

    # Analyze the FASTA file. The loop below consumes the result as a mapping
    # of sequence name -> iterable of motif dicts.
    results = nbs.analyze_file(fasta_file)

    print("‚úì Analysis complete!")
    print(f"\nProcessed {len(results)} sequence(s):")
    print("-"*80)

    # Display summary for each sequence
    for seq_name, motifs in results.items():
        # Tally motifs per class; motifs without a 'Class' key are counted
        # under 'Unknown'.
        class_counts = {}
        for m in motifs:
            cls = m.get('Class', 'Unknown')
            class_counts[cls] = class_counts.get(cls, 0) + 1

        print(f"\n{seq_name}:")
        print(f"  Total motifs: {len(motifs)}")
        print(f"  Classes detected: {len(class_counts)}")

        # Only print the breakdown when at least one motif was found.
        if class_counts:
            print("  Distribution:")
            for cls, count in sorted(class_counts.items()):
                print(f"    - {cls}: {count} motifs")

    print("\n" + "="*80)

## 📊 Step 4: Comprehensive Class/Subclass Detection Analysis

In [None]:
# Combine all motifs from all sequences into one flat list. Only the motif
# lists are needed here, so iterate over the mapping's values.
all_motifs = []
for motifs in results.values():
    all_motifs.extend(motifs)

# Analyze class/subclass detection
detection_report = analyze_class_subclass_detection(all_motifs)

# Render the report as text and print it
report_text = print_detection_report(detection_report)
print(report_text)

# Save the report to a text file. The encoding is pinned to UTF-8 so the
# write does not depend on the platform's default locale encoding, which can
# raise UnicodeEncodeError if the report contains non-ASCII symbols.
with open('detection_report.txt', 'w', encoding='utf-8') as f:
    f.write(report_text)

print("\n‚úì Detection report saved to 'detection_report.txt'")

## 💾 Step 5: Export Results to Excel

**Excel file structure:**
- **Sheet 1**: Consolidated non-overlapping motifs (excludes Hybrid and Cluster)
- **Subsequent sheets**: Individual motif classes and subclasses

In [None]:
# Write every detected motif to an Excel workbook, then print an outline of
# the workbook layout.
excel_filename = "nonbscanner_results.xlsx"

print(f"üìù Exporting results to Excel: {excel_filename}")
print("‚è≥ Creating sheets for each motif class...\n")

# The exporter's return value is echoed as a status message.
result_message = export_to_excel(all_motifs, excel_filename)
print(f"‚úì {result_message}")

print("\nüìë Excel File Structure:")
print("-"*80)
print("Sheet 1: Consolidated_NonOverlapping (all motifs except Hybrid & Cluster)")

# Tally motifs per class ('Unknown' when a motif has no 'Class' key).
class_counts = {}
for motif in all_motifs:
    label = motif.get('Class', 'Unknown')
    if label in class_counts:
        class_counts[label] += 1
    else:
        class_counts[label] = 1

# List one line per class, numbered from 2 after the consolidated sheet.
# NOTE(review): this listing is derived from the motif list, not read back
# from the workbook - confirm it matches what export_to_excel writes.
sheet_number = 2
for label in sorted(class_counts):
    print(f"Sheet {sheet_number}: {label} ({class_counts[label]} motifs)")
    sheet_number += 1

print("\n" + "="*80)

## 📈 Step 6: Visualizations

### 6.1 Motif Class Distribution

In [None]:
# Tally motifs per class ('Unknown' when a motif has no 'Class' key).
class_counts = {}
for motif in all_motifs:
    label = motif.get('Class', 'Unknown')
    class_counts[label] = class_counts.setdefault(label, 0) + 1

# Bar chart: one bar per class, in first-seen order.
plt.figure(figsize=(14, 8))
classes = list(class_counts)
counts = [class_counts[name] for name in classes]

# One Set3 colormap entry per bar, looked up by integer index.
colors = plt.cm.Set3(range(len(classes)))
bars = plt.bar(classes, counts, color=colors, edgecolor='black', linewidth=1.5)

# Titles, axis labels and tick styling.
plt.title('Non-B DNA Motif Distribution by Class', fontsize=16, fontweight='bold', pad=20)
plt.xlabel('Motif Class', fontsize=14, fontweight='bold')
plt.ylabel('Count', fontsize=14, fontweight='bold')
plt.xticks(rotation=45, ha='right', fontsize=11)
plt.yticks(fontsize=11)
plt.grid(axis='y', alpha=0.3, linestyle='--')

# Write each bar's count just above its top edge, centred horizontally.
for bar in bars:
    height = bar.get_height()
    x_center = bar.get_x() + bar.get_width()/2.
    plt.text(
        x_center,
        height,
        f'{int(height)}',
        ha='center',
        va='bottom',
        fontsize=10,
        fontweight='bold',
    )

# Save a 300-dpi copy before showing the figure.
plt.tight_layout()
plt.savefig('motif_class_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Figure saved as 'motif_class_distribution.png'")

### 6.2 Detection Status Pie Chart

In [None]:
# Split the class total from the detection report into detected and
# not-detected counts; these become the two pie wedges.
detected = detection_report['detected_classes']
not_detected = detection_report['total_classes'] - detected

sizes = [detected, not_detected]
labels = [f'Detected\n({detected} classes)', f'Not Detected\n({not_detected} classes)']
colors = ['#66c2a5', '#fc8d62']
explode = (0.05, 0.05)  # offset both wedges slightly from the centre

plt.figure(figsize=(10, 8))
plt.pie(
    sizes,
    labels=labels,
    colors=colors,
    explode=explode,
    startangle=90,
    shadow=True,
    autopct='%1.1f%%',
    textprops={'fontsize': 12, 'fontweight': 'bold'},
)

plt.title('Non-B DNA Class Detection Status', fontsize=16, fontweight='bold', pad=20)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

# Save a 300-dpi copy before showing the figure.
plt.tight_layout()
plt.savefig('detection_status.png', dpi=300, bbox_inches='tight')
plt.show()

print("‚úì Figure saved as 'detection_status.png'")

### 6.3 Score Distribution by Class

In [None]:
# Collect scores per class. Only int/float scores strictly greater than zero
# are kept; everything else is skipped.
class_scores = {}
for motif in all_motifs:
    score = motif.get('Score', 0)
    if not (isinstance(score, (int, float)) and score > 0):
        continue
    class_scores.setdefault(motif.get('Class', 'Unknown'), []).append(score)

if not class_scores:
    print("‚ö†Ô∏è  No score data available for visualization")
else:
    plt.figure(figsize=(14, 8))

    # One violin per class, in sorted class order.
    labels = sorted(class_scores)
    data_to_plot = [class_scores[name] for name in labels]

    parts = plt.violinplot(
        data_to_plot,
        positions=range(len(data_to_plot)),
        showmeans=True,
        showmedians=True,
    )

    # Give each violin body its own Set3 colormap entry.
    colors = plt.cm.Set3(range(len(data_to_plot)))
    for body, body_color in zip(parts['bodies'], colors):
        body.set_facecolor(body_color)
        body.set_alpha(0.7)

    # Titles, axis labels and tick styling.
    plt.title('Score Distribution by Motif Class', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Motif Class', fontsize=14, fontweight='bold')
    plt.ylabel('Score', fontsize=14, fontweight='bold')
    plt.xticks(range(len(labels)), labels, rotation=45, ha='right', fontsize=11)
    plt.grid(axis='y', alpha=0.3, linestyle='--')

    # Save a 300-dpi copy before showing the figure.
    plt.tight_layout()
    plt.savefig('score_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Figure saved as 'score_distribution.png'")

### 6.4 Length Distribution by Class

In [None]:
# Collect lengths per class. Only positive `int` lengths are kept.
class_lengths = {}
for m in all_motifs:
    cls = m.get('Class', 'Unknown')
    length = m.get('Length', 0)
    if isinstance(length, int) and length > 0:
        class_lengths.setdefault(cls, []).append(length)

if class_lengths:
    # Create box plot
    plt.figure(figsize=(14, 8))

    # One box per class, in sorted class order.
    labels = sorted(class_lengths)
    data_to_plot = [class_lengths[cls] for cls in labels]

    # Tick labels are applied with plt.xticks further down rather than through
    # boxplot's `labels=` keyword: Matplotlib 3.9 deprecated that keyword in
    # favour of `tick_labels`, so setting the ticks explicitly keeps this cell
    # working on both older and newer Matplotlib releases.
    bp = plt.boxplot(data_to_plot, patch_artist=True,
                     medianprops=dict(color='red', linewidth=2),
                     flierprops=dict(marker='o', markersize=5, alpha=0.5))

    # Give each box its own Set3 colormap entry.
    colors = plt.cm.Set3(range(len(data_to_plot)))
    for patch, color in zip(bp['boxes'], colors):
        patch.set_facecolor(color)
        patch.set_alpha(0.7)

    # boxplot draws its boxes at positions 1..N by default, so the class
    # names are attached to exactly those tick positions.
    plt.xticks(range(1, len(labels) + 1), labels,
               rotation=45, ha='right', fontsize=11)
    plt.xlabel('Motif Class', fontsize=14, fontweight='bold')
    plt.ylabel('Length (bp)', fontsize=14, fontweight='bold')
    plt.title('Length Distribution by Motif Class', fontsize=16, fontweight='bold', pad=20)
    plt.grid(axis='y', alpha=0.3, linestyle='--')

    # Save a 300-dpi copy before showing the figure.
    plt.tight_layout()
    plt.savefig('length_distribution.png', dpi=300, bbox_inches='tight')
    plt.show()

    print("‚úì Figure saved as 'length_distribution.png'")
else:
    print("‚ö†Ô∏è  No length data available for visualization")

## 📋 Step 7: Summary Statistics Table

In [None]:
# Create summary statistics DataFrame (one row per motif class).

# Group motifs by class in a single pass. The 'Unknown' fallback is applied
# while grouping, so motifs without a 'Class' key are counted and averaged in
# the 'Unknown' row. Grouping here also means the cell depends only on
# `all_motifs`, not on a counter left behind by an earlier cell.
motifs_by_class = {}
for m in all_motifs:
    motifs_by_class.setdefault(m.get('Class', 'Unknown'), []).append(m)

summary_data = []

for cls in sorted(motifs_by_class):
    cls_motifs = motifs_by_class[cls]

    # Calculate statistics
    count = len(cls_motifs)

    # Length stats: average over motifs whose 'Length' is an int.
    lengths = [m.get('Length', 0) for m in cls_motifs if isinstance(m.get('Length'), int)]
    avg_length = sum(lengths) / len(lengths) if lengths else 0

    # Score stats: average over motifs whose 'Score' is an int or float.
    scores = [m.get('Score', 0) for m in cls_motifs if isinstance(m.get('Score'), (int, float))]
    avg_score = sum(scores) / len(scores) if scores else 0

    # Averages are stored pre-formatted as strings for display.
    summary_data.append({
        'Class': cls,
        'Count': count,
        'Avg Length (bp)': f"{avg_length:.1f}",
        'Avg Score': f"{avg_score:.3f}"
    })

# Pin the column order so the table and CSV keep their header even when no
# motifs were detected.
summary_df = pd.DataFrame(
    summary_data,
    columns=['Class', 'Count', 'Avg Length (bp)', 'Avg Score'],
)

print("\n" + "="*80)
print("SUMMARY STATISTICS BY CLASS")
print("="*80)
print(summary_df.to_string(index=False))
print("\n" + "="*80)

# Save summary to CSV
summary_df.to_csv('summary_statistics.csv', index=False)
print("\n‚úì Summary statistics saved to 'summary_statistics.csv'")

## ✅ Step 8: Analysis Complete!

### Generated Files:

1. **nonbscanner_results.xlsx** - Excel file with multiple sheets
   - Consolidated non-overlapping motifs
   - Individual sheets for each motif class

2. **detection_report.txt** - Detailed class/subclass detection analysis

3. **Visualization PNG files:**
   - motif_class_distribution.png
   - detection_status.png
   - score_distribution.png
   - length_distribution.png

4. **summary_statistics.csv** - Summary table

---

### Next Steps:

- Review the detection report to see which classes were not detected
- Open the Excel file to explore individual motif classes
- Use the visualizations for presentations or publications
- Analyze specific motif subclasses of interest

---

In [None]:
# Closing recap: headline numbers from the detection report followed by the
# list of output files.
banner = "=" * 80

print(banner)
print("ANALYSIS COMPLETE!")
print(banner)

print(f"\nTotal motifs detected: {len(all_motifs)}")
print(f"Classes detected: {detection_report['detected_classes']} / {detection_report['total_classes']}")
print(f"Detection rate: {detection_report['summary']['Detection Rate']}")

print("\nGenerated files:")
for generated_line in (
    "  ‚úì nonbscanner_results.xlsx",
    "  ‚úì detection_report.txt",
    "  ‚úì summary_statistics.csv",
    "  ‚úì 4 visualization PNG files",
):
    print(generated_line)

print("\n" + banner)
print("\nüéâ Thank you for using NonBScanner!")