**Segmentation Overlap Processing**

The code below was used in our data science competition entry to resolve overlapping discourse segments, so that the resulting segments no longer overlap one another.

In [None]:
# Load the predictions to clean up from "prediction_file.csv" under DATA_PATH.
df_sub = pd.read_csv(DATA_PATH/"prediction_file.csv")

def modified(pred_string, diff_words):
    """Tell whether overlap removal shortened a segment.

    Args:
        pred_string: The segment's words before overlap removal (any sized
            collection; ``remove_overlaps`` passes a list of word indices).
        diff_words: The words left once previously seen words are removed.

    Returns:
        True when fewer words remain than the segment started with.
    """
    words_before = len(pred_string)
    words_after = len(diff_words)
    return words_after < words_before

def get_original(modified, pred_string):
    """Pick the value recorded as a segment's original prediction.

    Args:
        modified: Truthy when the segment was changed by overlap removal.
            (The name shadows the module-level ``modified`` function inside
            this body only.)
        pred_string: The segment's prediction before overlap removal.

    Returns:
        ``pred_string`` unchanged for a modified segment, otherwise the
        placeholder ``"--"``.
    """
    return pred_string if modified else "--"

def _parse_word_indices(pred_string):
    """Turn a whitespace-separated predictionstring into a list of ints."""
    # str.split() without an argument tolerates repeated/trailing whitespace
    # and yields an empty list for an empty string.
    return [int(token) for token in pred_string.split()]


def _strip_seen_words(word_lists):
    """Drop from each segment every word already used by an earlier segment.

    ``word_lists`` must be in priority order (earliest segment first). The
    result holds, for each segment, the *sorted* surviving word indices.
    Sorting is required because set iteration order is arbitrary: without it
    the emitted predictionstring can be scrambled and the contiguity check
    can miss gaps.
    """
    seen = set()
    survivors = []
    for words in word_lists:
        survivors.append(sorted(set(words) - seen))
        # Every word of the segment counts as seen, whether or not the
        # segment itself ends up being kept.
        seen.update(words)
    return survivors


def _has_gap(sorted_words):
    """Return True when neighbouring sorted word indices differ by more than 1."""
    return any(nxt - cur > 1 for cur, nxt in zip(sorted_words, sorted_words[1:]))


def remove_overlaps(pred_df, by_discourse=False, verbose=False, *,
                    processed_path="processed.csv",
                    removed_path="removed.csv",
                    show_progress=True):
    """Make predicted segments disjoint by trimming words claimed earlier.

    Within each group, segments are ordered by first word index and then by
    length. Each segment loses every word index that appears in a segment
    ordered before it. A segment is discarded when nothing is left of it or
    when its remaining word indices are not contiguous.

    The input frame is not modified; all work happens on a copy.

    Args:
        pred_df: DataFrame with the columns ``id``, ``discourse_type``,
            ``score_discourse_effectiveness_0``,
            ``score_discourse_effectiveness_1`` and ``predictionstring``
            (whitespace-separated integer word indices).
        by_discourse: If True, overlaps are only resolved between segments
            sharing both ``id`` and ``discourse_type``; otherwise between all
            segments sharing an ``id``.
        verbose: If True, print every group's discarded segments and a final
            count of modified/removed segments.
        processed_path: CSV destination for the kept segments, or None to
            skip writing.
        removed_path: CSV destination for the discarded segments, or None to
            skip writing.
        show_progress: If True, report progress per group through ``tqdm``.

    Returns:
        A ``(processed, removed)`` tuple of DataFrames. Both carry the input
        columns listed above plus ``modified`` (True when words were trimmed)
        and ``original_predstring`` (the untouched input string for modified
        segments, ``"--"`` otherwise); ``predictionstring`` holds the trimmed
        words. Original index labels are preserved.

    Raises:
        KeyError: If a required column is missing from ``pred_df``.
        ValueError: If a predictionstring contains a non-integer token.
    """
    output_columns = [
        "id",
        "discourse_type",
        "score_discourse_effectiveness_0",
        "score_discourse_effectiveness_1",
        "predictionstring",
        "modified",
        "original_predstring",
    ]

    # Work on a copy so the caller's frame keeps its string predictionstring
    # and the function can safely be called again on the same input.
    work_df = pred_df.copy()
    parsed = [_parse_word_indices(pred) for pred in work_df["predictionstring"]]
    work_df["_words"] = pd.Series(parsed, index=work_df.index, dtype=object)
    work_df["nb_words"] = [len(words) for words in parsed]
    # Recomputed rather than trusted from the input, which may not carry it.
    work_df["start_word"] = [min(words, default=-1) for words in parsed]

    group_keys = ["id", "discourse_type"] if by_discourse else "id"
    grouped = work_df.groupby(group_keys)

    kept_frames = []
    dropped_frames = []
    segments_affected = 0
    pbar = tqdm(total=len(grouped)) if show_progress else None
    try:
        for _, grp in grouped:
            if pbar is not None:
                pbar.update(1)
            ordered = grp.sort_values(["start_word", "nb_words"], ascending=True)
            before = list(ordered["_words"])
            after = _strip_seen_words(before)

            changed = np.array(
                [len(rest) < len(words) for words, rest in zip(before, after)],
                dtype=bool,
            )
            # Keep only non-empty, gap-free remainders.
            keep_mask = np.array(
                [bool(rest) and not _has_gap(rest) for rest in after],
                dtype=bool,
            )
            segments_affected += int((changed | ~keep_mask).sum())

            # Capture the original strings before predictionstring is replaced.
            ordered["original_predstring"] = [
                raw if was_changed else "--"
                for raw, was_changed in zip(ordered["predictionstring"], changed)
            ]
            ordered["predictionstring"] = [
                " ".join(str(word) for word in rest) for rest in after
            ]
            ordered["modified"] = changed

            kept = ordered[keep_mask]
            dropped = ordered[~keep_mask]
            # Skip empty pieces so the final concat only sees real rows.
            if len(kept) > 0:
                kept_frames.append(kept)
            if len(dropped) > 0:
                dropped_frames.append(dropped)
                if verbose:
                    print("Discarded segments")
                    print(dropped)
    finally:
        # Close the bar even if a group fails part-way through.
        if pbar is not None:
            pbar.close()

    if kept_frames:
        processed = pd.concat(kept_frames)[output_columns]
    else:
        processed = pd.DataFrame(columns=output_columns)
    if dropped_frames:
        removed = pd.concat(dropped_frames)[output_columns]
    else:
        removed = pd.DataFrame(columns=output_columns)

    if processed_path is not None:
        processed.to_csv(processed_path)
    if removed_path is not None:
        removed.to_csv(removed_path)

    if verbose:
        print(str(segments_affected) + " segments have been modified/removed by pre-processing")
    return processed, removed

# Resolve overlaps across all segments of each essay (by_discourse=False groups by "id" only).
remove_overlaps(df_sub, by_discourse=False)