Add Functionalities #75

Open
wants to merge 16 commits into base: master
1 change: 1 addition & 0 deletions decepticonlp/__init__.py
@@ -6,3 +6,4 @@

from .transforms import *
from .metrics import *
from .extractor import *
41 changes: 24 additions & 17 deletions decepticonlp/metrics/char_metrics.py
@@ -13,7 +13,7 @@ class CharacterMetrics(metaclass=abc.ABCMeta):
An abstract class used to represent the character metrics. Subclasses implement the calculate method.
Methods
-------
apply(self, text1: str, text2: str, **kwargs)
calculate(self, text1: str, text2: str, **kwargs)
- calculates the similarity/distance between two strings using the appropriate metric.
"""

@@ -36,36 +36,36 @@ class Levenshtein(CharacterMetrics):
- calculates levenshtein distance and returns the same
"""

def calculate(self, text1: str, text2: str, normalize="none", **kwargs):
def calculate(self, text1: str, text2: str, normalize="none", osa=False, **kwargs):
"""
Calculate the Levenshtein distance using dynamic programming (backed by a NumPy matrix).
The DP approach runs in O(m*n), versus O(3^m) for the naive recursive approach.

Example:
from perturb import levenshtein
print(levenshtein("Hey","HEY"))
2.0

#Normalize Levenshtein Distance - Total strategy
print(levenshtein("Hey", "HEY", normalize="sum"))
0.33333

#Normalize LCS - Max Strategy
print(levenshtein("HeyS", "HEY", normalize="lcs"))
0.75


#osa - an adjacent transposition counts as a single edit
print(levenshtein("Hey", "Hye", osa=True))
1.0

If the Optimal String Alignment (OSA) flag is set, transpositions of adjacent characters are also counted as single edits.
Assumption: no substring is edited more than once (so, for example, OSA gives 3 for "CA" vs. "ABC", while unrestricted Damerau-Levenshtein gives 2).
:params
:text1 : First string to be compared
:text2 : Second string to be compared
:normalize: pass "sum" for total Levenshtein distance, "lcs" for maximum normalization, "none" default
:osa: pass True for Optimal String Alignment (Levenshtein + transpositions), False default
:type text1: String
:type text2: String
:type normalize: String
:type osa: Boolean

returns levenshtein distance
:return type: float

IMPORTANT NOTE :
The normalized distance is not a metric, as it violates the triangle inequality.
https://stackoverflow.com/questions/45783385/normalizing-the-edit-distance
Expand All @@ -79,15 +79,22 @@ def calculate(self, text1: str, text2: str, normalize="none", **kwargs):

for x in range(1, size_x):
    for y in range(1, size_y):
        if text1[x - 1] == text2[y - 1]:
            matrix[x, y] = min(
                matrix[x - 1, y] + 1, matrix[x - 1, y - 1], matrix[x, y - 1] + 1
            )
        else:
            matrix[x, y] = min(
                matrix[x - 1, y] + 1,
                matrix[x - 1, y - 1] + 1,
                matrix[x, y - 1] + 1,
            )
        cost_substitution = 0 if text1[x - 1] == text2[y - 1] else 1
        matrix[x, y] = min(
            matrix[x - 1, y] + 1,
            matrix[x - 1, y - 1] + cost_substitution,
            matrix[x, y - 1] + 1,
        )

        if (
            osa
            and x > 1
            and y > 1
            and text1[x - 1] == text2[y - 2]
            and text1[x - 2] == text2[y - 1]
        ):
            matrix[x, y] = min(
                matrix[x, y], matrix[x - 2, y - 2] + cost_substitution
            )
distance = matrix[size_x - 1, size_y - 1]
if normalize == "sum":
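For reference, a minimal usage sketch of the updated Levenshtein.calculate signature in this diff (the import path is assumed to match the package layout exercised by the tests below; printed values mirror the docstring examples, and the osa case reflects the new transposition handling):

# Minimal sketch, assuming decepticonlp is installed and exposes metrics.char_metrics.
from decepticonlp.metrics import char_metrics

lev = char_metrics.Levenshtein()
print(lev.calculate("Hey", "HEY"))                    # 2.0    - two substitutions
print(lev.calculate("Hey", "HEY", normalize="sum"))   # ~0.333 - 2 / (3 + 3)
print(lev.calculate("HeyS", "HEY", normalize="lcs"))  # 0.75   - 3 / max(4, 3)
print(lev.calculate("Word", "Wrod", osa=True))        # 1.0    - one adjacent transposition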
5 changes: 5 additions & 0 deletions decepticonlp/preprocessing/__init__.py
@@ -0,0 +1,5 @@
"""Preprocessing subpackage for decepticonlp."""

__author__ = """Rajaswa Ravindra Patil"""
__email__ = "rajp4480@gmail.com"
__version__ = "0.1.0"
66 changes: 66 additions & 0 deletions decepticonlp/preprocessing/preprocessing.py
@@ -0,0 +1,66 @@
import abc
import random
from nltk.stem import *
from nltk.tokenize import sent_tokenize, word_tokenize


class Preprocessing(metaclass=abc.ABCMeta):
"""
An abstract class used to preprocess textual data from the user. Subclasses implement the apply method.
Methods
-------
apply(self, text1: str, **kwargs)
- applies the respective filter and returns the new string.
"""

@abc.abstractmethod
def apply(self, text1: str, **kwargs): # pragma: no cover
"""applies filter and returns new string"""
raise NotImplementedError

def get_ignore_default_value(self):
return True


class Stem(Preprocessing):
"""
A class used to stem the words in a string.
Methods
-------
apply(self, text1:str, **kwargs)
- applies the stem filter and returns the string

NOTE: The PorterStemmer is used as the default here.

Example:
st = Stem()
print(st.apply("Dinosaurs were killed by asteroids"))
dinosaur were kill by asteroid

print(st.apply("Dinosaurs were killed by asteroids".type="lancaster"))
dinosa wer kil by asteroid

:params
:text1 : String to be stemmed
:type: The kind of stemmer to be used
:type text1: String
:type type: String

## MORE FUNCTIONALITY NEEDS TO BE ADDED
"""

def apply(self, text1: str, type="default", **kwargs):
"""
"""
stemmer = None
if type == "default":
stemmer = PorterStemmer()
elif type == "lancaster":
stemmer = LancasterStemmer()

word_tokens = text1.split()
stem_word_tokens = []
for word in word_tokens:
stem_word_tokens.append(stemmer.stem(word))
stem_word_tokens.append(" ")

return "".join(stem_word_tokens)
4 changes: 3 additions & 1 deletion requirements.txt
@@ -9,9 +9,11 @@ Sphinx==1.8.5
twine==1.14.0
tensorflow==2.2.0
tensorflow-hub==0.8.0
nltk==3.4.5
numpy==1.18.1

setuptools
pytest==5.4.2
pytest-runner==5.2
pytest-cov==2.8.1
pre-commit
pre-commit
8 changes: 8 additions & 0 deletions tests/test_char_metrics.py
@@ -18,6 +18,14 @@ def test_levenshtein(text1, text2, expected_result):
assert levenshtein_distance.calculate(text1, text2) == expected_result


@pytest.mark.parametrize(
"text1, text2, expected_result", [("Word", "Wrod", 1)],
)
def test_levenshtein_osa(text1, text2, expected_result):
levenshtein_distance = char_metrics.Levenshtein()
assert levenshtein_distance.calculate(text1, text2, osa=True) == expected_result


@pytest.mark.parametrize(
"text1, text2, expected_result",
[("Word", "Wordy", 0.1111111111111111), ("Word", "Wrod", 0.25), ("H", "H", 0)],
31 changes: 31 additions & 0 deletions tests/test_preprocessing.py
@@ -0,0 +1,31 @@
import random

import pytest

from decepticonlp.preprocessing import preprocessing

import math


@pytest.mark.parametrize(
"text1, expected_result",
[
("Dinosaurs were killed by asteroids", "dinosaur were kill by asteroid "),
("Adversarial Library", "adversari librari "),
],
)
def test_stemmer_default(text1, expected_result):
stem = preprocessing.Stem()
assert stem.apply(text1) == expected_result


@pytest.mark.parametrize(
"text1, expected_result",
[
("Dinosaurs were killed by asteroids", "dinosa wer kil by asteroid "),
("Adversarial Library", "advers libr "),
],
)
def test_stemmer_lancaster(text1, expected_result):
stem = preprocessing.Stem()
assert stem.apply(text1, type="lancaster") == expected_result