## 1. Install Dependencies

In [None]:
# Install the notebook's third-party packages with the pip module of the
# interpreter that is running this cell (sys.executable).
import subprocess
import sys

required_packages = ["biopython", "numpy", "pandas", "matplotlib", "seaborn"]
pip_command = [sys.executable, "-m", "pip", "install", "-q", *required_packages]

# check_call raises CalledProcessError when pip exits with a non-zero status,
# so the success message below is only printed after a clean install.
subprocess.check_call(pip_command)
print("✓ All dependencies installed")

## 2. Import Libraries

In [None]:
# Standard library
import json

# Third-party
# `from Bio import SeqIO` and `import Bio.pairwise2 as pairwise2` bind only the
# names `SeqIO` and `pairwise2`; the bare package name `Bio` must be imported
# explicitly, otherwise the version banner below fails with NameError.
import Bio
from Bio import SeqIO
# NOTE(review): Bio.pairwise2 is deprecated in recent Biopython releases in
# favour of Bio.Align.PairwiseAligner — confirm against the installed version.
import Bio.pairwise2 as pairwise2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

print(f'✓ BioPython version: {Bio.__version__}')
print('✓ All libraries imported successfully')

## 3. Load FASTA Sequences

In [None]:
# Load one sequence from each FASTA file, falling back to short built-in
# example sequences when the files are not present.
# Update these paths according to your data location.

seq1_file = '../data/sequence1.fasta'
seq2_file = '../data/sequence2.fasta'

try:
    # Only the parsing sits inside the try, so the FileNotFoundError handler
    # cannot mask failures from the later processing steps.
    records1 = list(SeqIO.parse(seq1_file, "fasta"))
    records2 = list(SeqIO.parse(seq2_file, "fasta"))
except FileNotFoundError:
    print("✓ Using example sequences (FASTA files not found)")
    seq1 = "ATCGATCGATCGATCGATCGATCG"
    seq2 = "ATCGATCGATCGATCGATCGATCG"
    print(f"Sequence 1: {len(seq1)} bp")
    print(f"Sequence 2: {len(seq2)} bp")
else:
    # A file that exists but holds no FASTA records would otherwise fail on
    # records[0] with an opaque IndexError; report which file is the problem.
    if not records1:
        raise ValueError(f"No FASTA records found in {seq1_file}")
    if not records2:
        raise ValueError(f"No FASTA records found in {seq2_file}")

    # Only the first record of each file is aligned; any further records in a
    # multi-record file are ignored. Sequences are upper-cased before use.
    seq1 = str(records1[0].seq).upper()
    seq2 = str(records2[0].seq).upper()

    print(f"✓ Sequence 1: {len(seq1)} bp")
    print(f"✓ Sequence 2: {len(seq2)} bp")
    print(f"\nPreview Seq1: {seq1[:50]}...")
    print(f"Preview Seq2: {seq2[:50]}...")

## 4. Set Scoring Parameters

In [None]:
# Scoring scheme for the alignment:
#   match_score    - reward for matching nucleotides
#   mismatch_score - penalty for mismatches
#   gap_penalty    - penalty for gaps (indels)
match_score, mismatch_score, gap_penalty = 2, -1, -2

# Echo the chosen parameters, one per line.
print(
    "\n".join(
        (
            f"Match score: {match_score}",
            f"Mismatch score: {mismatch_score}",
            f"Gap penalty: {gap_penalty}",
        )
    )
)

## 5. Perform Needleman-Wunsch Alignment

In [None]:
# Run a global alignment of seq1 against seq2 with the scoring scheme above.
# The same gap_penalty value is passed for both gap arguments of globalms.
alignments = pairwise2.align.globalms(
    seq1,
    seq2,
    match_score,
    mismatch_score,
    gap_penalty,
    gap_penalty,
)

# The first returned alignment is taken as the best one and unpacked into
# its five fields.
best_alignment = alignments[0]
(aligned_seq1, aligned_seq2, score, begin, end) = best_alignment

print(
    f"✓ Alignment Score: {score}",
    f"✓ Alignment Length: {len(aligned_seq1)} bp",
    f"✓ Number of optimal alignments: {len(alignments)}",
    sep="\n",
)

## 6. Calculate Statistics

In [None]:
# Column-wise statistics for the best alignment.
#   matches    - columns where both residues are equal and neither is a gap
#   mismatches - columns where the residues differ and neither is a gap
#   gaps       - total number of '-' characters across both aligned sequences
matches = sum(
    1 for i, j in zip(aligned_seq1, aligned_seq2) if i == j and i != '-'
)
mismatches = sum(
    1 for i, j in zip(aligned_seq1, aligned_seq2) if i != j and '-' not in (i, j)
)
gaps = aligned_seq1.count('-') + aligned_seq2.count('-')

alignment_length = len(aligned_seq1)
# Identity is the share of alignment columns that match. An empty alignment
# has no columns, so report 0% instead of raising ZeroDivisionError.
identity = (matches / alignment_length) * 100 if alignment_length else 0.0

# Summary table; identity is pre-formatted as a percentage string.
stats = pd.DataFrame({
    'Metric': ['Alignment Length', 'Matches', 'Mismatches', 'Gaps', 'Identity %', 'Score'],
    'Value': [alignment_length, matches, mismatches, gaps, f'{identity:.2f}%', score]
})

# display() is supplied by the IPython/Jupyter environment.
display(stats)

## 7. Visualize Alignment

In [None]:
# Two side-by-side charts summarising the alignment statistics.
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax1, ax2 = axes
fig.suptitle('Needleman-Wunsch Alignment Results', fontsize=16, fontweight='bold')

# Left panel: pie chart of matches vs. mismatches (gaps are not included).
colors = ['#2ecc71', '#e74c3c']
ax1.pie(
    [matches, mismatches],
    labels=['Matches', 'Mismatches'],
    autopct='%1.1f%%',
    colors=colors,
    startangle=90,
)
ax1.set_title('Identity Analysis', fontweight='bold')

# Right panel: bar chart of matches, mismatches and gaps.
categories = ['Matches', 'Mismatches', 'Gaps']
values = [matches, mismatches, gaps]
bars = ax2.bar(categories, values, color=['#2ecc71', '#e74c3c', '#f39c12'])
ax2.set_ylabel('Count')
ax2.set_title('Alignment Composition', fontweight='bold')

# Write each bar's count just above it, centred horizontally on the bar.
for rect in bars:
    bar_top = rect.get_height()
    bar_centre = rect.get_x() + rect.get_width() / 2.0
    ax2.text(bar_centre, bar_top, f'{int(bar_top)}', ha='center', va='bottom')

plt.tight_layout()
plt.show()

print('✓ Visualization complete')

## 8. Display Alignment

In [None]:
# Print the start of the alignment (blocks beginning before column 300) in
# 60-column blocks, with a marker line between the two sequences.
line_width = 60
shown_columns = min(300, len(aligned_seq1))

print("\nAlignment (first 300 bp):")
print("="*100)

for i in range(0, shown_columns, line_width):
    seq1_block = aligned_seq1[i:i+line_width]
    seq2_block = aligned_seq2[i:i+line_width]

    # One marker per column: "|" for equal characters, " " when either side
    # is a gap, "." for a mismatch.
    match_block = "".join(
        "|" if s1 == s2 else (" " if "-" in (s1, s2) else ".")
        for s1, s2 in zip(seq1_block, seq2_block)
    )

    # Three content lines followed by a blank separator line.
    print(
        f"Seq1: {seq1_block}",
        f"      {match_block}",
        f"Seq2: {seq2_block}",
        "",
        sep="\n",
    )

print("\nLegend: | = match, . = mismatch, (space) = gap")

## 9. Using the Package Functions

For more advanced usage, you can import the package functions directly.

In [None]:
# Example of using the package (if installed).
# The lines below are intentionally left commented out: they sketch how the
# project's own package could be used instead of the inline cells above.
# NOTE(review): the nw_alignment module paths and class names are not visible
# from this notebook — confirm them against the package before uncommenting.
# from nw_alignment.alignment import NWAlignment
# from nw_alignment.parser import FASTAParser
# 
# parser = FASTAParser()
# seq1, seq2 = parser.load_fasta('data/sequence1.fasta', 'data/sequence2.fasta')
# 
# aligner = NWAlignment(seq1, seq2)
# result = aligner.align()
# print(result)

# Point the reader at the standalone script for non-interactive runs.
print("✓ For full production use, see: scripts/run_nw_algorithm.py")

## 10. Recommendations

- **For interactive exploration:** Use this Jupyter notebook
- **For full FASTA analysis:** Use `scripts/run_nw_algorithm.py` (handles large files efficiently)
- **For batch processing:** Use `scripts/batch_analysis.py`
- **For comparisons:** Use `scripts/compare_sequences.py`