In [3]:
# import re

# class Extractor:

#     def __init__(self):
#         self.start_rules = []
#         self.end_rules = []

#     def add_start_rule(self, rule, is_regex=False):
#         self.start_rules.append((rule, is_regex))

#     def add_end_rule(self, rule, is_regex=False):
#         self.end_rules.append((rule, is_regex))

#     def extract(self, text):
#         start_index = None
#         end_index = None

#         for rule, is_regex in self.start_rules:
#             if is_regex:
#                 match = re.search(rule, text)
#                 if match is not None:
#                     start_index = match.end()  # We want the index after the start rule
#                     break  # If we've found a match, we can break
#             else:
#                 start_index = text.find(rule)
#                 if start_index != -1:
#                     start_index += len(rule)  # We want the index after the start rule
#                     break  # If we've found a match, we can break

#         if start_index is None:  # If we haven't found a match, return an empty string
#             return ''

#         for rule, is_regex in self.end_rules:
#             if is_regex:
#                 match = re.search(rule, text[start_index:])
#                 if match is not None:
#                     end_index = start_index + match.start()  # We want the index before the end rule
#                     break  # If we've found a match, we can break
#             else:
#                 end_index = text.find(rule, start_index)  # We search after the start index
#                 if end_index != -1:
#                     break  # If we've found a match, we can break

#         if end_index is None:  # If we haven't found a match, return an empty string
#             return ''

#         return text[start_index:end_index]


In [None]:
import re

class Extractor:
    """Cut out the part of a text that lies between a start rule and an end rule.

    A rule is either a plain string or, when registered with ``is_regex=True``,
    a regular expression understood by Python's ``re`` module.  Rules are tried
    in the order they were added and the first one that matches is used.

    When asking someone for a rule, ask for the string or pattern itself,
    whether it is a regex, and whether extraction should fall back to the
    start or end of the text when no rule matches (see the ``extract`` flags).
    """

    def __init__(self):
        # Both lists hold ``(rule, is_regex)`` tuples in insertion order.
        self.start_rules = []
        self.end_rules = []

    def add_start_rule(self, rule, is_regex=False):
        """Register a rule marking where the extracted text begins."""
        self.start_rules.append((rule, is_regex))

    def add_end_rule(self, rule, is_regex=False):
        """Register a rule marking where the extracted text stops."""
        self.end_rules.append((rule, is_regex))

    def extract(self, text, extract_if_no_start=False, extract_if_no_end=False):
        """Return the text after the first matching start rule and before the
        first matching end rule.

        Args:
            text: The string to search.
            extract_if_no_start: If true, begin at the start of ``text`` when
                no start rule matches.
            extract_if_no_end: If true, run to the end of ``text`` when no end
                rule matches.

        Returns:
            The extracted substring, or ``''`` when a required boundary could
            not be determined.
        """
        # A side with no rules (or with its fallback flag set) defaults to the
        # text boundary; otherwise it stays unresolved until a rule matches.
        if len(self.start_rules) == 0 or extract_if_no_start:
            begin = 0
        else:
            begin = None

        if len(self.end_rules) == 0 or extract_if_no_end:
            finish = len(text)
        else:
            finish = None

        for pattern, use_regex in self.start_rules:
            if use_regex:
                found = re.search(pattern, text)
                if found is None:
                    continue
                begin = found.end()  # extraction starts right after the match
            else:
                at = text.find(pattern)
                if at == -1:
                    continue
                begin = at + len(pattern)  # extraction starts right after the marker
            break  # first matching rule wins

        # End rules are only looked for at or after the start position.
        search_from = 0 if begin is None else begin

        for pattern, use_regex in self.end_rules:
            if use_regex:
                found = re.search(pattern, text[search_from:])
                if found is None:
                    continue
                # The match offset is relative to the slice; shift it back.
                finish = search_from + found.start()
            else:
                at = text.find(pattern, search_from)
                if at == -1:
                    continue
                finish = at
            break  # first matching rule wins

        if begin is None or finish is None:
            return ''

        return text[begin:finish]


In [12]:
import re

class Extractor:
    """Extract the text found between a start rule and an end rule.

    A rule is either a plain string or, when added with ``is_regex=True``, a
    regular expression for Python's ``re`` module.  Rules are tried in the
    order they were added; the first one that matches is used.
    """

    def __init__(self):
        # Both lists hold ``(rule, is_regex)`` tuples in insertion order.
        self.start_rules = []
        self.end_rules = []

    def add_start_rule(self, rule, is_regex=False):
        """Register a rule; extraction begins right after its match."""
        self.start_rules.append((rule, is_regex))

    def add_end_rule(self, rule, is_regex=False):
        """Register a rule; extraction stops right before its match."""
        self.end_rules.append((rule, is_regex))

    def extract(self, text):
        """Return the part of ``text`` between the start and end rules.

        With no start rules the extraction begins at the start of ``text``;
        with no end rules it runs to the end of ``text``.

        Args:
            text: The string to search.

        Returns:
            The extracted substring, or ``''`` when start rules exist but none
            matches, or end rules exist but none matches after the start.
        """
        # A side without rules defaults to the text boundary.  A side with
        # rules stays ``None`` until one matches, so a miss is detected
        # explicitly instead of hitting an unbound local variable.
        start_index = 0 if len(self.start_rules) == 0 else None
        end_index = len(text) if len(self.end_rules) == 0 else None

        for rule, is_regex in self.start_rules:
            if is_regex:
                match = re.search(rule, text)
                if match is not None:
                    start_index = match.end()  # We want the index after the start rule
                    break  # First matching rule wins
            else:
                idx = text.find(rule)
                if idx != -1:
                    start_index = idx + len(rule)  # We want the index after the start rule
                    break  # First matching rule wins

        if start_index is None:  # Start rules were given but none matched
            return ''

        for rule, is_regex in self.end_rules:
            if is_regex:
                match = re.search(rule, text[start_index:])
                if match is not None:
                    # The match offset is relative to the slice; shift it back.
                    end_index = start_index + match.start()  # We want the index before the end rule
                    break  # First matching rule wins
            else:
                idx = text.find(rule, start_index)  # We search after the start index
                if idx != -1:
                    end_index = idx
                    break  # First matching rule wins

        if end_index is None:  # End rules were given but none matched
            return ''

        return text[start_index:end_index]


In [4]:
# Demo: pull out the text that sits between 'my name is ' and the word "and".
extractor = Extractor()

extractor.add_start_rule('my name is ')
extractor.add_end_rule(r'\band\b', is_regex=True)  # \b denotes a word boundary in regex

# The end rule stops right before "and", so the space preceding it is kept.
print(extractor.extract('Hello, my name is John Doe and I am a Data Scientist.'))  # Outputs 'John Doe ' (note the trailing space)


John Doe 


....
----------------------------------------------------------------------
Ran 4 tests in 0.004s

OK


<unittest.runner.TextTestResult run=4 errors=0 failures=0>

In [16]:
import unittest  # imported here so this cell also runs on a fresh kernel


class TestExtractor(unittest.TestCase):
    """Checks for ``Extractor.extract`` with literal and regex rules."""

    def setUp(self):
        # A fresh extractor per test so rules never leak between cases.
        self.extractor = Extractor()

    def test_simple_regex(self):
        # Literal start rule plus regex end rule: the text in between is
        # returned, including the space that precedes "and".
        self.extractor.add_start_rule('Hello, my name is ')
        self.extractor.add_end_rule(r'\band\b', is_regex=True)
        result = self.extractor.extract('Hello, my name is John Doe and I am a Data Scientist.')
        self.assertEqual(result, 'John Doe ')

    def test_complex_regex(self):
        # Regex start rule with a lookbehind and no end rule: everything
        # after the first matched amount ("$100") is returned.
        self.extractor.add_start_rule(r'(?<=have )\$[0-9]+', is_regex=True)
        result = self.extractor.extract('I have $100 and you have $50.')
        self.assertEqual(result, ' and you have $50.')
        
#     def test_complex_regex(self):
#         self.extractor.add_start_rule(r'(?<=name is ).*?(?= and)', is_regex=True)
#         result = self.extractor.extract('Hello, my name is John Doe and I am a Data Scientist.')
#         self.assertEqual(result, 'John Doe')

#     def test_complex_regex(self):
#         self.extractor.add_start_rule(r'(?<=name is ).*?(?= and)', is_regex=True)
#         result = self.extractor.extract('Hello, my name is John Doe and I am a Data Scientist.')
#         self.assertEqual(result, 'John Doe')

#     def test_non_matching_regex(self):
#         self.extractor.add_start_rule('Hello, my name is ')
#         self.extractor.add_end_rule(r'\bcat\b', is_regex=True)  # 'cat' does not exist in the string
#         result = self.extractor.extract('Hello, my name is John Doe and I am a Data Scientist.')
#         self.assertEqual(result, '')

#     def test_numeric_regex(self):
#         self.extractor.add_start_rule(r'(?<=Page ).*?(?= of)', is_regex=True)
#         result = self.extractor.extract('Page 23 of 42')
#         self.assertEqual(result, '23')

# Build the suite from the names of the currently enabled test cases.
# Disabled for now: 'test_non_matching_regex', 'test_numeric_regex'
suite = unittest.TestSuite(
    TestExtractor(test_name)
    for test_name in ('test_simple_regex', 'test_complex_regex')
)

# Execute the suite; the returned result object is shown as the cell output.
runner = unittest.TextTestRunner()
runner.run(suite)


..
----------------------------------------------------------------------
Ran 2 tests in 0.001s

OK


<unittest.runner.TextTestResult run=2 errors=0 failures=0>