From 8555eb65671905154af365cbfe5a7658372ba5e9 Mon Sep 17 00:00:00 2001 From: Microindole <1513979779@qq.com> Date: Thu, 16 Oct 2025 21:03:57 +0800 Subject: [PATCH 1/3] feat(compression): Add LZW and Arithmetic Coding algorithms --- .../compression/ArithmeticCoding.java | 158 ++++++++++++++++++ .../com/thealgorithms/compression/LZW.java | 136 +++++++++++++++ .../compression/ArithmeticCodingTest.java | 119 +++++++++++++ .../thealgorithms/compression/LZWTest.java | 78 +++++++++ 4 files changed, 491 insertions(+) create mode 100644 src/main/java/com/thealgorithms/compression/ArithmeticCoding.java create mode 100644 src/main/java/com/thealgorithms/compression/LZW.java create mode 100644 src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java create mode 100644 src/test/java/com/thealgorithms/compression/LZWTest.java diff --git a/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java b/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java new file mode 100644 index 000000000000..210a920d5206 --- /dev/null +++ b/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java @@ -0,0 +1,158 @@ +package com.thealgorithms.compression; + +import java.math.BigDecimal; +import java.math.MathContext; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * An implementation of the Arithmetic Coding algorithm. + * + *

+ * Arithmetic coding is a form of entropy encoding used in lossless data + * compression. It encodes an entire message into a single number, a fraction n + * where (0.0 <= n < 1.0). Unlike Huffman coding, which assigns a specific + * bit sequence to each symbol, arithmetic coding represents the message as a + * sub-interval of the [0, 1) interval. + *

+ * + *

+ * This implementation uses BigDecimal for precision to handle the shrinking + * intervals, making it suitable for educational purposes to demonstrate the + * core logic. + *

+ * + *

+ * Time Complexity: O(n) interval updates for compression and O(n*m) range + * checks for decompression (each decoded symbol scans up to m ranges), where + * n is the length of the input and m is the number of unique symbols. + *

+ * + *

+ * References: + *

+ *

+ */ +public final class ArithmeticCoding { + + private ArithmeticCoding() { + } + + /** + * Compresses a string using the Arithmetic Coding algorithm. + * + * @param uncompressed The string to be compressed. + * @return The compressed representation as a BigDecimal number. + * @throws IllegalArgumentException if the input string is null or empty. + */ + public static BigDecimal compress(String uncompressed) { + if (uncompressed == null || uncompressed.isEmpty()) { + throw new IllegalArgumentException("Input string cannot be null or empty."); + } + + Map probabilityTable = calculateProbabilities(uncompressed); + + BigDecimal low = BigDecimal.ZERO; + BigDecimal high = BigDecimal.ONE; + + for (char symbol : uncompressed.toCharArray()) { + BigDecimal range = high.subtract(low); + Symbol sym = probabilityTable.get(symbol); + + high = low.add(range.multiply(sym.high())); + low = low.add(range.multiply(sym.low())); + } + + return low; // Return the lower bound of the final interval + } + + /** + * Decompresses a BigDecimal number back into the original string. + * + * @param compressed The compressed BigDecimal number. + * @param length The length of the original uncompressed string. + * @param probabilityTable The probability table used during compression. + * @return The original, uncompressed string. 
+ */ + public static String decompress(BigDecimal compressed, int length, Map probabilityTable) { + StringBuilder decompressed = new StringBuilder(); + + // Create a sorted list of symbols for deterministic decompression, matching the + // order used in calculateProbabilities + List> sortedSymbols = new ArrayList<>(probabilityTable.entrySet()); + sortedSymbols.sort(Map.Entry.comparingByKey()); + + BigDecimal low = BigDecimal.ZERO; + BigDecimal high = BigDecimal.ONE; + + for (int i = 0; i < length; i++) { + BigDecimal range = high.subtract(low); + + // Find which symbol the compressed value falls into + for (Map.Entry entry : sortedSymbols) { + Symbol sym = entry.getValue(); + + // Calculate the actual range for this symbol in the current interval + BigDecimal symLow = low.add(range.multiply(sym.low())); + BigDecimal symHigh = low.add(range.multiply(sym.high())); + + // Check if the compressed value falls within this symbol's range + if (compressed.compareTo(symLow) >= 0 && compressed.compareTo(symHigh) < 0) { + decompressed.append(entry.getKey()); + + // Update the interval for the next iteration + low = symLow; + high = symHigh; + break; + } + } + } + + return decompressed.toString(); + } + + /** + * Calculates the frequency and probability range for each character in the + * input string in a deterministic order. + * + * @param text The input string. + * @return A map from each character to a Symbol object containing its + * probability range. 
+ */ + public static Map calculateProbabilities(String text) { + Map frequencies = new HashMap<>(); + for (char c : text.toCharArray()) { + frequencies.put(c, frequencies.getOrDefault(c, 0) + 1); + } + + // Sort the characters to ensure a deterministic order for the probability table + List sortedKeys = new ArrayList<>(frequencies.keySet()); + Collections.sort(sortedKeys); + + Map probabilityTable = new HashMap<>(); + BigDecimal currentLow = BigDecimal.ZERO; + int total = text.length(); + + for (char symbol : sortedKeys) { + BigDecimal probability = BigDecimal.valueOf(frequencies.get(symbol)).divide(BigDecimal.valueOf(total), MathContext.DECIMAL128); + BigDecimal high = currentLow.add(probability); + probabilityTable.put(symbol, new Symbol(currentLow, high)); + currentLow = high; + } + + return probabilityTable; + } + + /** + * Helper class to store the probability range [low, high) for a symbol. + */ + public record Symbol(BigDecimal low, BigDecimal high) { + + } +} diff --git a/src/main/java/com/thealgorithms/compression/LZW.java b/src/main/java/com/thealgorithms/compression/LZW.java new file mode 100644 index 000000000000..c8383815ad4f --- /dev/null +++ b/src/main/java/com/thealgorithms/compression/LZW.java @@ -0,0 +1,136 @@ +package com.thealgorithms.compression; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * An implementation of the Lempel-Ziv-Welch (LZW) algorithm. + * + *

+ * LZW is a universal lossless data compression algorithm created by Abraham + * Lempel, Jacob Ziv, and Terry Welch. It works by building a dictionary of + * strings encountered during compression and replacing occurrences of those + * strings with a shorter code. + *

+ * + *

+ * This implementation handles characters with codes 0-255 (the initial + * dictionary) and provides methods for both compression and decompression. + *

    + *
  • Compressing "TOBEORNOTTOBEORTOBEORNOT" results in a list of integer + * codes.
  • + *
  • Decompressing that list of codes results back in the original + * string.
  • + *
+ *

+ * + *

+ * Time Complexity: O(n) for both compression and decompression, where n is the + * length of the input string. + *

+ * + *

+ * References: + *

+ *

+ */ +public final class LZW { + + /** + * Private constructor to prevent instantiation of this utility class. + */ + private LZW() { + } + + /** + * Compresses a string using the LZW algorithm. + * + * @param uncompressed The string to be compressed. Can be null. + * @return A list of integers representing the compressed data. Returns an empty + * list if the input is null or empty. + */ + public static List compress(String uncompressed) { + if (uncompressed == null || uncompressed.isEmpty()) { + return new ArrayList<>(); + } + + // Initialize dictionary with single characters (ASCII 0-255) + int dictSize = 256; + Map dictionary = new HashMap<>(); + for (int i = 0; i < dictSize; i++) { + dictionary.put("" + (char) i, i); + } + + String w = ""; + List result = new ArrayList<>(); + for (char c : uncompressed.toCharArray()) { + String wc = w + c; + if (dictionary.containsKey(wc)) { + // If the new string is in the dictionary, extend the current string + w = wc; + } else { + // Otherwise, output the code for the current string + result.add(dictionary.get(w)); + // Add the new string to the dictionary + dictionary.put(wc, dictSize++); + // Start a new current string + w = "" + c; + } + } + + // Output the code for the last remaining string + result.add(dictionary.get(w)); + return result; + } + + /** + * Decompresses a list of integers back into a string using the LZW algorithm. + * + * @param compressed A list of integers representing the compressed data. Can be + * null. + * @return The original, uncompressed string. Returns an empty string if the + * input is null or empty. 
+ */ + public static String decompress(List compressed) { + if (compressed == null || compressed.isEmpty()) { + return ""; + } + + // Initialize dictionary with single characters (ASCII 0-255) + int dictSize = 256; + Map dictionary = new HashMap<>(); + for (int i = 0; i < dictSize; i++) { + dictionary.put(i, "" + (char) i); + } + + // Decompress the first code + String w = "" + (char) (int) compressed.removeFirst(); + StringBuilder result = new StringBuilder(w); + + for (int k : compressed) { + String entry; + if (dictionary.containsKey(k)) { + // The code is in the dictionary + entry = dictionary.get(k); + } else if (k == dictSize) { + // Special case for sequences like "ababab" + entry = w + w.charAt(0); + } else { + throw new IllegalArgumentException("Bad compressed k: " + k); + } + + result.append(entry); + + // Add new sequence to the dictionary + dictionary.put(dictSize++, w + entry.charAt(0)); + + w = entry; + } + return result.toString(); + } +} diff --git a/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java b/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java new file mode 100644 index 000000000000..0c12eedb8140 --- /dev/null +++ b/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java @@ -0,0 +1,119 @@ +package com.thealgorithms.compression; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.math.BigDecimal; +import java.util.Map; + +import org.junit.jupiter.api.Test; + +class ArithmeticCodingTest { + + @Test + void testThrowsExceptionForNullOrEmptyInput() { + // Test that null input throws IllegalArgumentException + assertThrows(IllegalArgumentException.class, () -> ArithmeticCoding.compress(null)); + + // Test that empty string throws IllegalArgumentException + 
assertThrows(IllegalArgumentException.class, () -> ArithmeticCoding.compress("")); + } + + @Test + void testCompressionAndDecompressionSimple() { + String original = "BABA"; + Map probTable = ArithmeticCoding.calculateProbabilities(original); + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Verify that compression produces a valid number in [0, 1) + assertNotNull(compressed); + assertTrue(compressed.compareTo(BigDecimal.ZERO) >= 0); + assertTrue(compressed.compareTo(BigDecimal.ONE) < 0); + + // Verify decompression restores the original string + String decompressed = ArithmeticCoding.decompress(compressed, original.length(), probTable); + assertEquals(original, decompressed); + } + + @Test + void testSymmetryWithComplexString() { + String original = "THE_QUICK_BROWN_FOX_JUMPS_OVER_THE_LAZY_DOG"; + Map probTable = ArithmeticCoding.calculateProbabilities(original); + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Verify compression produces a number in valid range + assertTrue(compressed.compareTo(BigDecimal.ZERO) >= 0); + assertTrue(compressed.compareTo(BigDecimal.ONE) < 0); + + // Verify symmetry: decompress(compress(x)) == x + String decompressed = ArithmeticCoding.decompress(compressed, original.length(), probTable); + assertEquals(original, decompressed); + } + + @Test + void testSymmetryWithRepetitions() { + String original = "MISSISSIPPI"; + Map probTable = ArithmeticCoding.calculateProbabilities(original); + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Verify compression produces a number in valid range + assertTrue(compressed.compareTo(BigDecimal.ZERO) >= 0); + assertTrue(compressed.compareTo(BigDecimal.ONE) < 0); + + // Verify the compression-decompression cycle + String decompressed = ArithmeticCoding.decompress(compressed, original.length(), probTable); + assertEquals(original, decompressed); + } + + @Test + void testSingleCharacterString() { + String original = "AAAAA"; + Map probTable = 
ArithmeticCoding.calculateProbabilities(original); + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Even with a single unique character, compression should work + assertTrue(compressed.compareTo(BigDecimal.ZERO) >= 0); + assertTrue(compressed.compareTo(BigDecimal.ONE) < 0); + + String decompressed = ArithmeticCoding.decompress(compressed, original.length(), probTable); + assertEquals(original, decompressed); + } + + @Test + void testCompressionOutputDemo() { + // Demonstrate actual compression output similar to LZW test + String original = "BABA"; + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Example: "BABA" compresses to approximately 0.625 + // This shows that the entire message is encoded as a single number + System.out.println("Original: " + original); + System.out.println("Compressed to: " + compressed); + System.out.println("Compression: " + original.length() + " characters -> 1 BigDecimal number"); + + // Verify the compressed value is in valid range [0, 1) + assertTrue(compressed.compareTo(BigDecimal.ZERO) >= 0); + assertTrue(compressed.compareTo(BigDecimal.ONE) < 0); + } + + @Test + void testProbabilityTableCalculation() { + // Test that probability table is calculated correctly + String text = "AABBC"; + Map probTable = ArithmeticCoding.calculateProbabilities(text); + + // Verify all characters are in the table + assertTrue(probTable.containsKey('A')); + assertTrue(probTable.containsKey('B')); + assertTrue(probTable.containsKey('C')); + + // Verify probability ranges are valid + for (ArithmeticCoding.Symbol symbol : probTable.values()) { + assertTrue(symbol.low().compareTo(BigDecimal.ZERO) >= 0); + assertTrue(symbol.high().compareTo(BigDecimal.ONE) <= 0); + assertTrue(symbol.low().compareTo(symbol.high()) < 0); + } + } +} diff --git a/src/test/java/com/thealgorithms/compression/LZWTest.java b/src/test/java/com/thealgorithms/compression/LZWTest.java new file mode 100644 index 000000000000..c1abc786a7b7 --- 
/dev/null +++ b/src/test/java/com/thealgorithms/compression/LZWTest.java @@ -0,0 +1,78 @@ +package com.thealgorithms.compression; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.util.Collections; +import java.util.List; + +import org.junit.jupiter.api.Test; + +class LZWTest { + + @Test + void testNullAndEmptyInputs() { + // Test that a null input to compress returns an empty list + assertTrue(LZW.compress(null).isEmpty()); + + // Test that a null input to decompress returns an empty string + assertEquals("", LZW.decompress(null)); + + // Test that an empty input to compress returns an empty list + assertTrue(LZW.compress("").isEmpty()); + + // Test that an empty input to decompress returns an empty string + assertEquals("", LZW.decompress(Collections.emptyList())); + } + + @Test + void testCompressionAndDecompressionWithSimpleString() { + // Test a classic example string + String original = "TOBEORNOTTOBEORTOBEORNOT"; + List compressed = LZW.compress(original); + + // Create the expected output list + List expectedOutput = List.of(84, 79, 66, 69, 79, 82, 78, 79, 84, 256, 258, 260, 265, 259, 261, 263); + + // This assertion will fail if the output is not what we expect + assertEquals(expectedOutput, compressed); + + // This assertion ensures the decompressed string is correct + String decompressed = LZW.decompress(compressed); + assertEquals(original, decompressed); + } + + @Test + void testCompressionWithRepeatedChars() { + // Test a string with long runs of the same character + String original = "AAAAABBBBBAAAAA"; + List compressed = LZW.compress(original); + String decompressed = LZW.decompress(compressed); + assertEquals(original, decompressed); + } + + @Test + void testCompressionWithUniqueChars() { + // Test a string with no repetitions + String original = "ABCDEFG"; + List compressed = LZW.compress(original); + String decompressed = LZW.decompress(compressed); + 
assertEquals(original, decompressed); + } + + @Test + void testSymmetry() { + // Test that compressing and then decompressing a complex string returns the + // original + String original = "THE_QUICK_BROWN_FOX_JUMPS_OVER_THE_LAZY_DOG"; + List compressed = LZW.compress(original); + String decompressed = LZW.decompress(compressed); + assertEquals(original, decompressed); + + // Another symmetry test with special characters and patterns + String original2 = "ababcbababa"; + List compressed2 = LZW.compress(original2); + String decompressed2 = LZW.decompress(compressed2); + assertEquals(original2, decompressed2); + } +} From c8d2da3fc242a18bfe9fff786fe13d0c827991f0 Mon Sep 17 00:00:00 2001 From: Microindole <1513979779@qq.com> Date: Thu, 16 Oct 2025 21:24:43 +0800 Subject: [PATCH 2/3] test(compression): Improve test coverage for LZW and ArithmeticCoding --- .../compression/ArithmeticCoding.java | 1 - .../compression/ArithmeticCodingTest.java | 37 ++++++++++++++++++- .../thealgorithms/compression/LZWTest.java | 33 ++++++++++++++++- 3 files changed, 67 insertions(+), 4 deletions(-) diff --git a/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java b/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java index 210a920d5206..b5ccf359d1be 100644 --- a/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java +++ b/src/main/java/com/thealgorithms/compression/ArithmeticCoding.java @@ -153,6 +153,5 @@ public static Map calculateProbabilities(String text) { * Helper class to store the probability range [low, high) for a symbol. 
*/ public record Symbol(BigDecimal low, BigDecimal high) { - } } diff --git a/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java b/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java index 0c12eedb8140..8e51fe5eb463 100644 --- a/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java +++ b/src/test/java/com/thealgorithms/compression/ArithmeticCodingTest.java @@ -6,8 +6,8 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.math.BigDecimal; +import java.util.HashMap; import java.util.Map; - import org.junit.jupiter.api.Test; class ArithmeticCodingTest { @@ -116,4 +116,39 @@ void testProbabilityTableCalculation() { assertTrue(symbol.low().compareTo(symbol.high()) < 0); } } + + @Test + void testDecompressionWithMismatchedProbabilityTable() { + // Test decompression with a probability table that doesn't match the original + String original = "ABCD"; + BigDecimal compressed = ArithmeticCoding.compress(original); + + // Create a different probability table (for "XYZ" instead of "ABCD") + Map wrongProbTable = ArithmeticCoding.calculateProbabilities("XYZ"); + + // Decompression with wrong probability table should produce incorrect output + String decompressed = ArithmeticCoding.decompress(compressed, original.length(), wrongProbTable); + + // The decompressed string will be different from original (likely all 'X', 'Y', or 'Z') + // This tests the edge case where the compressed value doesn't fall into expected ranges + assertNotNull(decompressed); + assertEquals(original.length(), decompressed.length()); + } + + @Test + void testDecompressionWithValueOutsideSymbolRanges() { + // Create a custom probability table + Map probTable = new HashMap<>(); + probTable.put('A', new ArithmeticCoding.Symbol(new BigDecimal("0.0"), new BigDecimal("0.5"))); + probTable.put('B', new ArithmeticCoding.Symbol(new BigDecimal("0.5"), new BigDecimal("1.0"))); + + // Use a compressed value that should decode properly + 
BigDecimal compressed = new BigDecimal("0.25"); // Falls in 'A' range + + String decompressed = ArithmeticCoding.decompress(compressed, 3, probTable); + + // Verify decompression completes (even if result might not be meaningful) + assertNotNull(decompressed); + assertEquals(3, decompressed.length()); + } } diff --git a/src/test/java/com/thealgorithms/compression/LZWTest.java b/src/test/java/com/thealgorithms/compression/LZWTest.java index c1abc786a7b7..a8e1c609964b 100644 --- a/src/test/java/com/thealgorithms/compression/LZWTest.java +++ b/src/test/java/com/thealgorithms/compression/LZWTest.java @@ -1,11 +1,12 @@ package com.thealgorithms.compression; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; +import java.util.ArrayList; import java.util.Collections; import java.util.List; - import org.junit.jupiter.api.Test; class LZWTest { @@ -75,4 +76,32 @@ void testSymmetry() { String decompressed2 = LZW.decompress(compressed2); assertEquals(original2, decompressed2); } -} + + @Test + void testInvalidCompressedData() { + // Test that decompressing with an invalid code throws IllegalArgumentException + // Create a list with a code that doesn't exist in the dictionary + List invalidCompressed = new ArrayList<>(); + invalidCompressed.add(65); // 'A' - valid + invalidCompressed.add(999); // Invalid code (not in dictionary) + + // This should throw IllegalArgumentException with message "Bad compressed k: 999" + IllegalArgumentException exception = assertThrows( + IllegalArgumentException.class, + () -> LZW.decompress(invalidCompressed) + ); + + assertTrue(exception.getMessage().contains("Bad compressed k: 999")); + } + + @Test + void testDecompressionWithGapInDictionary() { + // Test with codes that skip dictionary entries + List invalidCompressed = new ArrayList<>(); + invalidCompressed.add(84); // 'T' - valid + invalidCompressed.add(500); // 
Way beyond current dictionary size + + // This should throw IllegalArgumentException + assertThrows(IllegalArgumentException.class, () -> LZW.decompress(invalidCompressed)); + } +} \ No newline at end of file From 30db94505a8b30a1373c21f6d7645ff279540600 Mon Sep 17 00:00:00 2001 From: Microindole <1513979779@qq.com> Date: Thu, 16 Oct 2025 21:35:11 +0800 Subject: [PATCH 3/3] style(compression): fix code style --- .../java/com/thealgorithms/compression/LZWTest.java | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/src/test/java/com/thealgorithms/compression/LZWTest.java b/src/test/java/com/thealgorithms/compression/LZWTest.java index a8e1c609964b..7f0c7503c822 100644 --- a/src/test/java/com/thealgorithms/compression/LZWTest.java +++ b/src/test/java/com/thealgorithms/compression/LZWTest.java @@ -82,14 +82,11 @@ void testInvalidCompressedData() { // Test that decompressing with an invalid code throws IllegalArgumentException // Create a list with a code that doesn't exist in the dictionary List invalidCompressed = new ArrayList<>(); - invalidCompressed.add(65); // 'A' - valid + invalidCompressed.add(65); // 'A' - valid invalidCompressed.add(999); // Invalid code (not in dictionary) // This should throw IllegalArgumentException with message "Bad compressed k: 999" - IllegalArgumentException exception = assertThrows( - IllegalArgumentException.class, - () -> LZW.decompress(invalidCompressed) - ); + IllegalArgumentException exception = assertThrows(IllegalArgumentException.class, () -> LZW.decompress(invalidCompressed)); assertTrue(exception.getMessage().contains("Bad compressed k: 999")); } @@ -98,10 +95,10 @@ void testInvalidCompressedData() { void testDecompressionWithGapInDictionary() { // Test with codes that skip dictionary entries List invalidCompressed = new ArrayList<>(); - invalidCompressed.add(84); // 'T' - valid - invalidCompressed.add(500); // Way beyond current dictionary size + invalidCompressed.add(84); // 'T' - valid + 
invalidCompressed.add(500); // Way beyond current dictionary size // This should throw IllegalArgumentException assertThrows(IllegalArgumentException.class, () -> LZW.decompress(invalidCompressed)); } -} \ No newline at end of file +}