In [None]:
import pandas as pd
import re
import time

# BIRADS extraction: compiled patterns, small helpers and the extraction function

# Optional sub-category letter (a-d). It is only accepted when it ends the token,
# so the first letter of a following word ("4 am ...") is not taken as a suffix.
_BIRADS_SUFFIX = r"(?:[a-d](?=\s|$|[.,;:)?]))?"

# A category is an arabic digit 0-6 or a roman numeral, each with an optional
# suffix. The lookahead after the roman numeral rejects ordinary words that
# merely start with i/v/x ("in", "vorbefund", "verlaufskontrolle"); without it
# "bi-rads 2, verlaufskontrolle" would yield a bogus second score "v".
_BIRADS_CATEGORY = (
    r"[0-6]\s*" + _BIRADS_SUFFIX
    + r"|[ivx]+(?=[a-d](?=\s|$|[.,;:)?])|[^a-zäöüß]|$)\s*" + _BIRADS_SUFFIX
)

# group 1: first category, group 2: side word directly after it (unused),
# group 3: optional second category after a comma
_BIRADS_RE = re.compile(
    r"(?:bi[ -]?rads|level)[ :\-]?(?:kategorie )?"
    r"(" + _BIRADS_CATEGORY + r")"
    r"(?:\s*(links|rechts))?"
    r"(?:,\s*(" + _BIRADS_CATEGORY + r"))?"
)

# German side keywords, matched as whole words on the lower-cased report.
# NOTE(review): "rechtsseitiger" is listed under "left" and "linksseitiger"
# under "right". This looks inverted, but the swap is symmetric and may be a
# deliberate heuristic (e.g. "Z.n. rechtsseitiger Mastektomie" implying that
# the examined breast is the left one) - kept as found, confirm with the author.
_SIDE_RES = {
    "left": re.compile(r"\b(?:links|linke|linken|li|linksseitig|rechtsseitiger)\b"),
    "right": re.compile(r"\b(?:rechts|rechte|rechten|re|rechtsseitig|linksseitiger)\b"),
    "both": re.compile(r"\b(?:beidseits|beiden|beidseitig|bds|beiderseits|bilateral)\b"),
}

# a side keyword starting within this many characters of the report start is
# treated as applying to the final score (see the mixed-sides branch below)
_REPORT_HEAD_CHARS = 20


def _distance_to_nearest(positions, target):
    """Return the smallest absolute distance between target and any position."""
    return min(abs(pos - target) for pos in positions)


def _nearest(positions, target):
    """Return the position closest to target; the earliest entry wins ties."""
    return min(positions, key=lambda pos: abs(pos - target))


def extract_birads_scores(report):
    """Extract BIRADS categories from a report and attribute them to breast sides.

    The report is lower-cased and scanned for "birads"/"bi-rads"/"level"
    mentions followed by a category (0-6 or a roman numeral, optionally with
    a sub-category letter a-d). Only the last two categories found are
    considered. They are attributed to a side by comparing their character
    positions with the positions of German side keywords in the text.

    Args:
        report: Report text. Anything that is not a ``str`` (e.g. the NaN
            pandas produces for an empty cell) is treated as "nothing found".

    Returns:
        A tuple ``(left_score, right_score, unclear_score)``. Each entry is
        the category as returned by ``roman_to_int`` (a string) or ``None``.
        ``unclear_score`` is only set when a category was found but the
        report contains no side keyword at all.
    """
    if not isinstance(report, str):
        return None, None, None

    report = report.lower()
    left_score, right_score, unclear_score = None, None, None

    # collect (score text, character position) pairs in order of appearance;
    # the first score of a match is located at the start of the whole mention,
    # a second score (after a comma) at its own start
    birads_scores = []
    for match in _BIRADS_RE.finditer(report):
        birads_scores.append((match.group(1).strip(), match.start()))
        if match.group(3):
            birads_scores.append((match.group(3).strip(), match.start(3)))

    if not birads_scores:
        return left_score, right_score, unclear_score

    # only the last two scores are considered
    birads_scores = birads_scores[-2:]
    last_score, last_position = birads_scores[-1]

    # start positions of every side keyword, per side
    sides = {
        side: [match.start() for match in pattern.finditer(report)]
        for side, pattern in _SIDE_RES.items()
    }
    left_pos, right_pos, both_pos = sides["left"], sides["right"], sides["both"]

    if both_pos and not left_pos and not right_pos:
        # only "both sides" keywords: use the score nearest to the last of them
        anchor = max(both_pos)
        nearest_score, _ = min(birads_scores, key=lambda item: abs(item[1] - anchor))
        left_score = right_score = roman_to_int(nearest_score)

    elif len(birads_scores) == 2 and left_pos and right_pos:
        # two scores and both sides mentioned: choose the pairing with the
        # smaller total keyword-to-score distance
        (first, first_at), (second, second_at) = birads_scores
        left_first_cost = (_distance_to_nearest(left_pos, first_at)
                           + _distance_to_nearest(right_pos, second_at))
        right_first_cost = (_distance_to_nearest(right_pos, first_at)
                            + _distance_to_nearest(left_pos, second_at))

        if left_first_cost <= right_first_cost:
            left_score, right_score = roman_to_int(first), roman_to_int(second)
        else:
            left_score, right_score = roman_to_int(second), roman_to_int(first)

        # a "both sides" keyword that is the closest keyword to the final
        # score overrides the pairing
        if both_pos and _nearest(left_pos + right_pos + both_pos, last_position) in both_pos:
            left_score = right_score = roman_to_int(last_score)

    elif left_pos and not right_pos and not both_pos:
        # only "left" keywords
        left_score = roman_to_int(last_score)

    elif right_pos and not left_pos and not both_pos:
        # only "right" keywords
        right_score = roman_to_int(last_score)

    elif (left_pos or right_pos) and both_pos:
        # single-side and "both sides" keywords mixed: decide by the keyword
        # closest to the final score
        score = roman_to_int(last_score)

        if _nearest(left_pos + right_pos + both_pos, last_position) in both_pos:
            left_score = right_score = score
        else:
            if right_pos:
                right_score = score
            if left_pos:
                left_score = score

        # a side keyword at the very beginning of the report also receives the score
        if any(pos <= _REPORT_HEAD_CHARS for pos in left_pos + both_pos):
            left_score = score
        if any(pos <= _REPORT_HEAD_CHARS for pos in right_pos + both_pos):
            right_score = score

    elif len(birads_scores) == 1 and left_pos and right_pos:
        # a single score with both sides mentioned is assumed to apply to both
        left_score = right_score = roman_to_int(last_score)

    else:
        # no side keyword at all: report the final score as unclear
        unclear_score = roman_to_int(last_score)

    return left_score, right_score, unclear_score

# helper function to normalise a BIRADS category (roman or arabic) to "<digit><suffix>"
_ROMAN_TO_ARABIC = {'i': 1, 'ii': 2, 'iii': 3, 'iv': 4, 'v': 5, 'vi': 6}

# category (arabic digit 0-6 or roman numeral), optional whitespace, optional a-d suffix
_SCORE_RE = re.compile(r"([0-6]|[ivx]+)\s*([a-d]?)")


def roman_to_int(birads):
    """Normalise a BIRADS category string to arabic notation.

    Despite the name, the result is a string: the arabic digit followed by
    the optional lower-case sub-category letter, e.g. ``"iv"`` -> ``"4"``,
    ``"IVb"`` -> ``"4b"``, ``"iv a"`` -> ``"4a"``, ``"4 a"`` -> ``"4a"``.

    The whole input has to be a category; whitespace between the numeral and
    the suffix is accepted so that the suffix is not lost and roman and
    arabic inputs produce the same format.

    Args:
        birads: Category text, in roman or arabic notation.

    Returns:
        The normalised category, or ``birads`` unchanged when it is not a
        recognised category (e.g. ``"vii"`` or ``"ix"``).
    """
    match = _SCORE_RE.fullmatch(birads.strip().lower())
    if match is None:
        return birads

    numeral, suffix = match.groups()

    # arabic input only needs its spacing normalised
    if numeral.isdigit():
        return f"{numeral}{suffix}"

    # roman input: only i-vi are valid categories
    if numeral in _ROMAN_TO_ARABIC:
        return f"{_ROMAN_TO_ARABIC[numeral]}{suffix}"

    return birads


if __name__ == "__main__":
    # Script entry point: read one report per CSV row, extract the scores
    # and write the reports together with the extracted columns.
    started = time.time()

    # the input file has no header row; its single column holds the report text
    reports = pd.read_csv('birads.csv', header=None)
    reports.columns = ["Report"]

    # one (left, right, unclear) tuple per report, expanded into three columns
    extracted = reports['Report'].apply(extract_birads_scores)
    reports[['links', 'rechts', 'unklar']] = pd.DataFrame(
        extracted.tolist(), index=reports.index
    )

    reports.to_csv('regex(result).csv', index=False)

    finished = time.time()
    print(f"Time taken: {finished - started:.2f} seconds")
    print('results saved')

Time taken: 0.10 seconds
results saved
