Add Functionalities #75

Open
wants to merge 16 commits into base: master
1 change: 1 addition & 0 deletions decepticonlp/__init__.py
@@ -6,3 +6,4 @@

from .transforms import *
from .metrics import *
from .extractor import *
41 changes: 24 additions & 17 deletions decepticonlp/metrics/char_metrics.py
@@ -13,7 +13,7 @@ class CharacterMetrics(metaclass=abc.ABCMeta):
An abstract class used to represent the character metrics. Subclasses implement the calculate method.
Methods
-------
apply(self, text1: str, text2: str, **kwargs)
calculate(self, text1: str, text2: str, **kwargs)
- calculates the similarity/distance between two strings using the appropriate metric.
"""

@@ -36,36 +36,36 @@ class Levenshtein(CharacterMetrics):
- calculates levenshtein distance and returns the same
"""

def calculate(self, text1: str, text2: str, normalize="none", **kwargs):
def calculate(self, text1: str, text2: str, normalize="none", osa=False, **kwargs):
"""
Calculate the Levenshtein distance using dynamic programming (backed by a NumPy matrix).
The DP approach runs in O(m*n), versus O(3^m) for the naive recursive approach.

Example:
from perturb import levenshtein
print(levenshtein("Hey","HEY"))
2.0

#Normalize Levenshtein Distance - Total strategy
print(levenshtein("Hey", "HEY", normalize="sum"))
0.33333

#Normalize LCS - Max Strategy
print(levenshtein("HeyS", "HEY", normalize="lcs"))
0.75


#osa - an adjacent transposition counts as a single edit
print(levenshtein("Hey", "Hye", osa=True))
1.0

If the Optimal String Alignment (OSA) flag is set, transpositions of adjacent characters are also counted as single edits.
Assumption: no substring is edited more than once (so, for example, OSA gives 3 for "CA" vs. "ABC", while unrestricted Damerau-Levenshtein gives 2).
:params
:text1 : First string to be compared
:text2 : Second string to be compared
:normalize: pass "sum" for total Levenshtein distance, "lcs" for maximum normalization, "none" default
:osa: pass True for Optimal String Alignment (Levenshtein + transpositions), False default
:type text1: String
:type text2: String
:type normalize: String
:type osa: Boolean

returns levenshtein distance
:return type: float

IMPORTANT NOTE :
The normalized distance is not a metric, as it violates the triangle inequality.
https://stackoverflow.com/questions/45783385/normalizing-the-edit-distance
Expand All @@ -79,15 +79,22 @@ def calculate(self, text1: str, text2: str, normalize="none", **kwargs):

for x in range(1, size_x):
    for y in range(1, size_y):
        if text1[x - 1] == text2[y - 1]:
            matrix[x, y] = min(
                matrix[x - 1, y] + 1, matrix[x - 1, y - 1], matrix[x, y - 1] + 1
            )
        else:
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,
                matrix[x - 1, y - 1] + 1,
                matrix[x, y - 1] + 1,
            )
        cost_substitution = 0 if text1[x - 1] == text2[y - 1] else 1
        matrix[x, y] = min(
            matrix[x - 1, y] + 1,
            matrix[x - 1, y - 1] + cost_substitution,
            matrix[x, y - 1] + 1,
        )

        if (
            osa
            and x > 1
            and y > 1
            and text1[x - 1] == text2[y - 2]
            and text1[x - 2] == text2[y - 1]
        ):
            matrix[x, y] = min(
                matrix[x, y], matrix[x - 2, y - 2] + cost_substitution
            )
distance = matrix[size_x - 1, size_y - 1]
if normalize == "sum":
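For reference, a minimal usage sketch of the updated Levenshtein.calculate signature in this diff (the import path is assumed to match the package layout exercised by the tests below; printed values mirror the docstring examples, and the osa case reflects the new transposition handling):

# Minimal sketch, assuming decepticonlp is installed and exposes metrics.char_metrics.
from decepticonlp.metrics import char_metrics

lev = char_metrics.Levenshtein()
print(lev.calculate("Hey", "HEY"))                    # 2.0    - two substitutions
print(lev.calculate("Hey", "HEY", normalize="sum"))   # ~0.333 - 2 / (3 + 3)
print(lev.calculate("HeyS", "HEY", normalize="lcs"))  # 0.75   - 3 / max(4, 3)
print(lev.calculate("Word", "Wrod", osa=True))        # 1.0    - one adjacent transposition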
5 changes: 5 additions & 0 deletions decepticonlp/preprocessing/__init__.py
@@ -0,0 +1,5 @@
"""Preprocessing subpackage for decepticonlp."""

__author__ = """Rajaswa Ravindra Patil"""
__email__ = "rajp4480@gmail.com"
__version__ = "0.1.0"
66 changes: 66 additions & 0 deletions decepticonlp/preprocessing/preprocessing.py
@@ -0,0 +1,66 @@
import abc
import random
from nltk.stem import *
from nltk.tokenize import sent_tokenize, word_tokenize


class Preprocessing(metaclass=abc.ABCMeta):
"""
An abstract class used to preprocess textual data from the user. Subclasses implement the apply method.
Methods
-------
apply(self, text1: str, **kwargs)
- applies the respective filter and returns the new string.
"""

@abc.abstractmethod
def apply(self, text1: str, **kwargs): # pragma: no cover
"""applies filter and returns new string"""
raise NotImplementedError

def get_ignore_default_value(self):
return True


class Stem(Preprocessing):
"""
A class used to stem the words in a string.
Methods
-------
apply(self, text1:str, **kwargs)
- applies the stem filter and returns the string

NOTE: The PorterStemmer is used as the default here.

Example:
st = Stem()
print(st.apply("Dinosaurs were killed by asteroids"))
dinosaur were kill by asteroid

print(st.apply("Dinosaurs were killed by asteroids".type="lancaster"))
dinosa wer kil by asteroid

:params
:text1 : String to be stemmed
:type: The kind of stemmer to be used
:type text1: String
:type type: String

## MORE FUNCTIONALITY NEEDS TO BE ADDED
"""

def apply(self, text1: str, type="default", **kwargs):
"""
"""
stemmer = None
if type == "default":
stemmer = PorterStemmer()
elif type == "lancaster":
stemmer = LancasterStemmer()

word_tokens = text1.split()
stem_word_tokens = []
for word in word_tokens:
stem_word_tokens.append(stemmer.stem(word))
stem_word_tokens.append(" ")

return "".join(stem_word_tokens)
4 changes: 3 additions & 1 deletion requirements.txt
@@ -9,9 +9,11 @@ Sphinx==1.8.5
twine==1.14.0
tensorflow==2.2.0
tensorflow-hub==0.8.0
nltk==3.4.5
numpy==1.18.1

setuptools
pytest==5.4.2
pytest-runner==5.2
pytest-cov==2.8.1
pre-commit
pre-commit
8 changes: 8 additions & 0 deletions tests/test_char_metrics.py
@@ -18,6 +18,14 @@ def test_levenshtein(text1, text2, expected_result):
assert levenshtein_distance.calculate(text1, text2) == expected_result


@pytest.mark.parametrize(
"text1, text2, expected_result", [("Word", "Wrod", 1)],
)
def test_levenshtein_osa(text1, text2, expected_result):
levenshtein_distance = char_metrics.Levenshtein()
assert levenshtein_distance.calculate(text1, text2, osa=True) == expected_result


@pytest.mark.parametrize(
"text1, text2, expected_result",
[("Word", "Wordy", 0.1111111111111111), ("Word", "Wrod", 0.25), ("H", "H", 0)],
31 changes: 31 additions & 0 deletions tests/test_preprocessing.py
@@ -0,0 +1,31 @@
import random

import pytest

from decepticonlp.preprocessing import preprocessing

import math


@pytest.mark.parametrize(
"text1, expected_result",
[
("Dinosaurs were killed by asteroids", "dinosaur were kill by asteroid "),
("Adversarial Library", "adversari librari "),
],
)
def test_stemmer_default(text1, expected_result):
stem = preprocessing.Stem()
assert stem.apply(text1) == expected_result


@pytest.mark.parametrize(
"text1, expected_result",
[
("Dinosaurs were killed by asteroids", "dinosa wer kil by asteroid "),
("Adversarial Library", "advers libr "),
],
)
def test_stemmer_lancaster(text1, expected_result):
stem = preprocessing.Stem()
assert stem.apply(text1, type="lancaster") == expected_result