diff --git a/DIRECTORY.md b/DIRECTORY.md index 1b8c4b0c316..bd76d7d153a 100644 --- a/DIRECTORY.md +++ b/DIRECTORY.md @@ -103,10 +103,11 @@ * [Minimum Cost Path](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/minimum_cost_path.rs) * [Optimal Bst](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/optimal_bst.rs) * [Rod Cutting](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/rod_cutting.rs) + * [Smith-Waterman](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/smith_waterman.rs) * [Snail](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/snail.rs) * [Subset Generation](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_generation.rs) - * [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs) * [Subset Sum](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/subset_sum.rs) + * [Task Assignment](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/task_assignment.rs) * [Trapped Rainwater](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/trapped_rainwater.rs) * [Word Break](https://github.com/TheAlgorithms/Rust/blob/master/src/dynamic_programming/word_break.rs) * Financial diff --git a/src/dynamic_programming/mod.rs b/src/dynamic_programming/mod.rs index b3dc169201a..1bad6b4018f 100644 --- a/src/dynamic_programming/mod.rs +++ b/src/dynamic_programming/mod.rs @@ -14,6 +14,7 @@ mod maximum_subarray; mod minimum_cost_path; mod optimal_bst; mod rod_cutting; +mod smith_waterman; mod snail; mod subset_generation; mod subset_sum; @@ -45,6 +46,7 @@ pub use self::maximum_subarray::maximum_subarray; pub use self::minimum_cost_path::minimum_cost_path; pub use self::optimal_bst::optimal_search_tree; pub use self::rod_cutting::rod_cut; +pub use self::smith_waterman::{score_function, smith_waterman, traceback}; pub use self::snail::snail; pub use self::subset_generation::list_subset; pub use self::subset_sum::is_sum_subset; diff --git a/src/dynamic_programming/smith_waterman.rs b/src/dynamic_programming/smith_waterman.rs new file mode 100644 index 00000000000..3bd7fef2896 --- /dev/null +++ b/src/dynamic_programming/smith_waterman.rs @@ -0,0 +1,427 @@ +//! This module contains the Smith-Waterman algorithm implementation for local sequence alignment. +//! +//! The Smith-Waterman algorithm is a dynamic programming algorithm used for determining +//! similar regions between two sequences (nucleotide or protein sequences). It is particularly +//! useful in bioinformatics for identifying optimal local alignments. +//! +//! # Algorithm Overview +//! +//! The algorithm works by: +//! 1. Creating a scoring matrix where each cell represents the maximum alignment score +//! ending at that position +//! 2. Using match, mismatch, and gap penalties to calculate scores +//! 3. Allowing scores to reset to 0 (ensuring local rather than global alignment) +//! 4. Tracing back from the highest scoring position to reconstruct the alignment +//! +//! # Time Complexity +//! +//! O(m * n) where m and n are the lengths of the two sequences +//! +//! # Space Complexity +//! +//! O(m * n) for the scoring matrix +//! +//! # References +//! +//! - [Smith, T.F., Waterman, M.S. (1981). "Identification of Common Molecular Subsequences"](https://doi.org/10.1016/0022-2836(81)90087-5) +//! - [Wikipedia: Smith-Waterman algorithm](https://en.wikipedia.org/wiki/Smith%E2%80%93Waterman_algorithm) + +use std::cmp::max; + +/// Calculates the score for a character pair based on match, mismatch, or gap scoring. +/// +/// # Arguments +/// +/// * `source_char` - Character from the source sequence +/// * `target_char` - Character from the target sequence +/// * `match_score` - Score awarded for matching characters (typically positive) +/// * `mismatch_score` - Score penalty for mismatching characters (typically negative) +/// * `gap_score` - Score penalty for gaps (typically negative) +/// +/// # Returns +/// +/// The calculated score for the character pair +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::score_function; +/// +/// let score = score_function('A', 'A', 1, -1, -2); +/// assert_eq!(score, 1); // Match +/// +/// let score = score_function('A', 'C', 1, -1, -2); +/// assert_eq!(score, -1); // Mismatch +/// +/// let score = score_function('-', 'A', 1, -1, -2); +/// assert_eq!(score, -2); // Gap +/// ``` +pub fn score_function( + source_char: char, + target_char: char, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> i32 { + if source_char == '-' || target_char == '-' { + gap_score + } else if source_char == target_char { + match_score + } else { + mismatch_score + } +} + +/// Performs the Smith-Waterman local sequence alignment algorithm. +/// +/// This function creates a scoring matrix using dynamic programming to find the +/// optimal local alignment between two sequences. The algorithm is case-insensitive. +/// +/// # Arguments +/// +/// * `query` - The query sequence (e.g., DNA, protein) +/// * `subject` - The subject sequence to align against +/// * `match_score` - Score for matching characters (default: 1) +/// * `mismatch_score` - Penalty for mismatching characters (default: -1) +/// * `gap_score` - Penalty for gaps/indels (default: -2) +/// +/// # Returns +/// +/// A 2D vector representing the dynamic programming scoring matrix +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::smith_waterman; +/// +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); +/// assert_eq!(score_matrix.len(), 5); // query length + 1 +/// assert_eq!(score_matrix[0].len(), 3); // subject length + 1 +/// ``` +pub fn smith_waterman( + query: &str, + subject: &str, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> Vec> { + let query_upper: Vec = query.to_uppercase().chars().collect(); + let subject_upper: Vec = subject.to_uppercase().chars().collect(); + + let m = query_upper.len(); + let n = subject_upper.len(); + + // Initialize scoring matrix with zeros + let mut score = vec![vec![0; n + 1]; m + 1]; + + // Fill the scoring matrix using dynamic programming + for i in 1..=m { + for j in 1..=n { + // Calculate score for match/mismatch + let match_or_mismatch = score[i - 1][j - 1] + + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ); + + // Calculate score for deletion (gap in subject) + let delete = score[i - 1][j] + gap_score; + + // Calculate score for insertion (gap in query) + let insert = score[i][j - 1] + gap_score; + + // Take maximum of all options, but never go below 0 (local alignment) + score[i][j] = max(0, max(match_or_mismatch, max(delete, insert))); + } + } + + score +} + +/// Performs traceback on the Smith-Waterman score matrix to reconstruct the optimal alignment. +/// +/// This function starts from the highest scoring cell and traces back through the matrix +/// to reconstruct the aligned sequences. The traceback stops when a cell with score 0 +/// is encountered. +/// +/// # Arguments +/// +/// * `score` - The score matrix from the Smith-Waterman algorithm +/// * `query` - Original query sequence used in alignment +/// * `subject` - Original subject sequence used in alignment +/// * `match_score` - Score for matching characters (should match smith_waterman call) +/// * `mismatch_score` - Score for mismatching characters (should match smith_waterman call) +/// * `gap_score` - Penalty for gaps (should match smith_waterman call) +/// +/// # Returns +/// +/// A String containing the two aligned sequences separated by a newline, +/// or an empty string if no significant alignment is found +/// +/// # Examples +/// +/// ``` +/// use the_algorithms_rust::dynamic_programming::{smith_waterman, traceback}; +/// +/// let score_matrix = smith_waterman("ACAC", "CA", 1, -1, -2); +/// let alignment = traceback(&score_matrix, "ACAC", "CA", 1, -1, -2); +/// assert_eq!(alignment, "CA\nCA"); +/// ``` +pub fn traceback( + score: &[Vec], + query: &str, + subject: &str, + match_score: i32, + mismatch_score: i32, + gap_score: i32, +) -> String { + let query_upper: Vec = query.to_uppercase().chars().collect(); + let subject_upper: Vec = subject.to_uppercase().chars().collect(); + + // Find the cell with maximum score + let mut max_value = 0; + let (mut i_max, mut j_max) = (0, 0); + + for (i, row) in score.iter().enumerate() { + for (j, &value) in row.iter().enumerate() { + if value > max_value { + max_value = value; + i_max = i; + j_max = j; + } + } + } + + // If no significant alignment found, return empty string + if max_value <= 0 { + return String::new(); + } + + // Traceback from the maximum scoring cell + let (mut i, mut j) = (i_max, j_max); + let mut align1 = Vec::new(); + let mut align2 = Vec::new(); + + // Continue tracing back until we hit a cell with score 0 + while i > 0 && j > 0 && score[i][j] > 0 { + let current_score = score[i][j]; + + // Check if we came from diagonal (match/mismatch) + if current_score + == score[i - 1][j - 1] + + score_function( + query_upper[i - 1], + subject_upper[j - 1], + match_score, + mismatch_score, + gap_score, + ) + { + align1.push(query_upper[i - 1]); + align2.push(subject_upper[j - 1]); + i -= 1; + j -= 1; + } + // Check if we came from above (deletion/gap in subject) + else if current_score == score[i - 1][j] + gap_score { + align1.push(query_upper[i - 1]); + align2.push('-'); + i -= 1; + } + // Otherwise we came from left (insertion/gap in query) + else { + align1.push('-'); + align2.push(subject_upper[j - 1]); + j -= 1; + } + } + + // Reverse the sequences (we built them backwards) + align1.reverse(); + align2.reverse(); + + format!( + "{}\n{}", + align1.into_iter().collect::(), + align2.into_iter().collect::() + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + macro_rules! smith_waterman_tests { + ($($name:ident: $test_cases:expr,)*) => { + $( + #[test] + fn $name() { + let (query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases; + assert_eq!(smith_waterman(query, subject, match_score, mismatch_score, gap_score), expected); + } + )* + } + } + + macro_rules! traceback_tests { + ($($name:ident: $test_cases:expr,)*) => { + $( + #[test] + fn $name() { + let (score, query, subject, match_score, mismatch_score, gap_score, expected) = $test_cases; + assert_eq!(traceback(&score, query, subject, match_score, mismatch_score, gap_score), expected); + } + )* + } + } + + smith_waterman_tests! { + test_acac_ca: ("ACAC", "CA", 1, -1, -2, vec![ + vec![0, 0, 0], + vec![0, 0, 1], + vec![0, 1, 0], + vec![0, 0, 2], + vec![0, 1, 0], + ]), + test_agt_agt: ("AGT", "AGT", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + vec![0, 0, 0, 3], + ]), + test_agt_gta: ("AGT", "GTA", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 0, 0, 1], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + ]), + test_agt_g: ("AGT", "G", 1, -1, -2, vec![ + vec![0, 0], + vec![0, 0], + vec![0, 1], + vec![0, 0], + ]), + test_g_agt: ("G", "AGT", 1, -1, -2, vec![ + vec![0, 0, 0, 0], + vec![0, 0, 1, 0], + ]), + test_empty_query: ("", "CA", 1, -1, -2, vec![vec![0, 0, 0]]), + test_empty_subject: ("ACAC", "", 1, -1, -2, vec![vec![0], vec![0], vec![0], vec![0], vec![0]]), + test_both_empty: ("", "", 1, -1, -2, vec![vec![0]]), + } + + traceback_tests! { + test_traceback_acac_ca: ( + vec![ + vec![0, 0, 0], + vec![0, 0, 1], + vec![0, 1, 0], + vec![0, 0, 2], + vec![0, 1, 0], + ], + "ACAC", + "CA", + 1, -1, -2, + "CA\nCA", + ), + test_traceback_agt_agt: ( + vec![ + vec![0, 0, 0, 0], + vec![0, 1, 0, 0], + vec![0, 0, 2, 0], + vec![0, 0, 0, 3], + ], + "AGT", + "AGT", + 1, -1, -2, + "AGT\nAGT", + ), + test_traceback_empty: (vec![vec![0, 0, 0]], "ACAC", "", 1, -1, -2, ""), + test_traceback_custom_scoring: ( + vec![ + vec![0, 0, 0], + vec![0, 0, 2], + vec![0, 2, 0], + vec![0, 0, 4], + vec![0, 2, 0], + ], + "ACAC", + "CA", + 2, -2, -3, + "CA\nCA", + ), + } + + #[test] + fn test_score_function_match() { + assert_eq!(score_function('A', 'A', 1, -1, -2), 1); + assert_eq!(score_function('G', 'G', 2, -1, -1), 2); + } + + #[test] + fn test_score_function_mismatch() { + assert_eq!(score_function('A', 'C', 1, -1, -2), -1); + assert_eq!(score_function('G', 'T', 1, -2, -1), -2); + } + + #[test] + fn test_score_function_gap() { + assert_eq!(score_function('-', 'A', 1, -1, -2), -2); + assert_eq!(score_function('A', '-', 1, -1, -2), -2); + } + + #[test] + fn test_case_insensitive() { + let result1 = smith_waterman("acac", "CA", 1, -1, -2); + let result2 = smith_waterman("ACAC", "ca", 1, -1, -2); + let result3 = smith_waterman("AcAc", "Ca", 1, -1, -2); + + assert_eq!(result1, result2); + assert_eq!(result2, result3); + } + + #[test] + fn test_custom_scoring_end_to_end() { + // Test with custom scoring parameters (match=2, mismatch=-2, gap=-3) + let query = "ACGT"; + let subject = "ACGT"; + let match_score = 2; + let mismatch_score = -2; + let gap_score = -3; + + // Generate score matrix with custom parameters + let score_matrix = smith_waterman(query, subject, match_score, mismatch_score, gap_score); + + // Traceback using the same custom parameters + let alignment = traceback( + &score_matrix, + query, + subject, + match_score, + mismatch_score, + gap_score, + ); + + // With perfect match and match_score=2, we expect alignment "ACGT\nACGT" + assert_eq!(alignment, "ACGT\nACGT"); + + // Verify the score is correct (4 matches × 2 = 8) + assert_eq!(score_matrix[4][4], 8); + } + + #[test] + fn test_alignment_at_boundary() { + // Test case where optimal alignment might be at row/column 0 + let query = "A"; + let subject = "A"; + let score_matrix = smith_waterman(query, subject, 1, -1, -2); + let alignment = traceback(&score_matrix, query, subject, 1, -1, -2); + + // Should find the alignment even though it's near the boundary + assert_eq!(alignment, "A\nA"); + assert_eq!(score_matrix[1][1], 1); + } +}