### Extract comments

In [None]:
import re

def extract_all_line_comments(file_name):
    """Collect the text of line comments found in the markup file *file_name*.

    Every line is stripped of surrounding whitespace and searched for the
    first ``<comment type="line">`` element (plain or ``format="doxygen"``).

    * A comment that opens its (stripped) line is added to the current run;
      runs of such lines are merged into one newline-joined entry.
    * A comment that follows other content on its line ends the current run
      and is stored as an entry of its own.
    * A line without a comment ends the current run.

    Entries that are exactly ``"//"`` are removed from the result.

    Returns:
        list[str]: the comment texts, in the order they were found.
    """
    line_comment_re = re.compile(r'(<comment type=\"line\">(.*?)<\/comment>|<comment type=\"line\" format="doxygen">(.*?)<\/comment>)')

    with open(file_name, 'r', encoding="utf8") as handle:
        lines = handle.read().splitlines()

    collected = []
    run = []  # texts of the comment-only lines in the current consecutive run

    def flush_run():
        # Emit the pending run (if any) as a single entry and start afresh.
        if run:
            collected.append("\n".join(run).rstrip())
            run.clear()

    for raw_line in lines:
        found = line_comment_re.search(raw_line.strip())
        if found is None:
            flush_run()
            continue

        # Group 2 holds a plain line comment, group 3 a doxygen one.
        text = found.group(3) if found.group(2) is None else found.group(2)
        if found.start() == 0:
            run.append(text)
        else:
            flush_run()
            collected.append(text)

    flush_run()
    return [entry for entry in collected if entry != "//"]

def extract_all_block_comments(file_name):
    """Return the text of every block comment element in *file_name*.

    The file is read as UTF-8 and searched for ``<comment type="block">`` and
    ``<comment type="block" format="doxygen">`` elements; comments may span
    several lines (``re.S``).

    Returns:
        list[str]: all plain block comments first (in file order), followed
        by all doxygen block comments (in file order).
    """
    with open(file_name, 'r', encoding="utf8") as f:
        xmlstr = f.read()

    # Raw strings: the previous non-raw literals contained the invalid escape
    # sequence "\/", which newer Python versions warn about. The regexes built
    # here match exactly the same text.
    plain_pattern = r'<comment type="block">(.*?)</comment>'
    doxygen_pattern = r'<comment type="block" format="doxygen">(.*?)</comment>'

    plain_comments = re.findall(plain_pattern, xmlstr, re.S)
    doxygen_comments = re.findall(doxygen_pattern, xmlstr, re.S)
    return plain_comments + doxygen_comments

def extract_all_comments(file_name):
    """Return the line comments of *file_name* followed by its block comments."""
    line_part = extract_all_line_comments(file_name)
    block_part = extract_all_block_comments(file_name)
    return line_part + block_part

In [None]:
# Location of the file to scan; 'path_to_xml_file' looks like a placeholder
# that has to be replaced with a real path before running.
file_path = 'path_to_xml_file'
line_comments = extract_all_line_comments(file_path)
print("Num of line cmts: ", len(line_comments))
# Bare expression: presumably shown as the notebook cell's output.
line_comments

In [None]:
# Collect the block comments from the same file used for the line comments.
block_comments = extract_all_block_comments(file_path)
print("Num of block cmts: ", len(block_comments))
# Bare expression: presumably shown as the notebook cell's output.
block_comments

### Extract preceding code - line comments

In [None]:
def make_unique_list(lst):
    """Return the items of *lst* without repeats, keeping first-occurrence order.

    Items must be hashable.
    """
    already_seen = set()
    unique_items = []
    for item in lst:
        if item in already_seen:
            continue
        already_seen.add(item)
        unique_items.append(item)
    return unique_items

# Drop repeated comment texts while keeping the order of first appearance.
unique_line_comments = make_unique_list(line_comments)
unique_block_comments = make_unique_list(block_comments)

print("Num of unique line cmts: ", len(unique_line_comments))
print("Num of unique block cmts: ", len(unique_block_comments))

In [None]:
def escape_special_chars_except_s(input_str):
    r"""Backslash-escape regex metacharacters while keeping ``\s*`` tokens intact.

    Every occurrence of the two-token sequence ``\s*`` is shielded behind a
    placeholder, the characters ``[ ] ( ) { } . * + ? ^ $ | \`` are each
    prefixed with a backslash, and the placeholder is then turned back into
    ``\s*``.  Any other character is left untouched.

    Note: text that already contains the placeholder word itself is also
    turned into ``\s*`` by the final step.
    """
    shield = "PLACEHOLDER_FOR_S"
    escape_table = str.maketrans({ch: "\\" + ch for ch in "[](){}.*+?^$|\\"})

    shielded = input_str.replace(r'\s*', shield)
    escaped = shielded.translate(escape_table)
    return escaped.replace(shield, r'\s*')

In [None]:
def is_a_pre_line_cmt(xmlstr, i):
    """Check whether the line ending at index *i* is a ``//`` line comment.

    The text is scanned backwards from *i* up to (not including) the previous
    newline, or to the start of *xmlstr*.

    Returns:
        ``(line_text, index)`` when the line, with leading whitespace removed,
        starts with ``//``; ``index`` is the position of the newline that
        precedes the line, or ``-1`` when the line is the first one.
        ``False`` otherwise.
    """
    line_break = i
    while line_break >= 0 and xmlstr[line_break] != "\n":
        line_break -= 1

    line_text = xmlstr[line_break + 1:i + 1].lstrip()
    if line_text.startswith("//"):
        return (line_text, line_break)
    return False

In [None]:
def is_a_pre_block_cmt(xmlstr, i):
    """Check whether a ``/* ... */`` block comment ends exactly at index *i*.

    Args:
        xmlstr: the text being scanned.
        i: index expected to hold the closing ``/`` of ``*/``.

    Returns:
        ``(comment_text, index)`` when ``xmlstr[i-1:i+1] == "*/"``;
        ``comment_text`` runs from the nearest ``/*`` found before *i* through
        index *i*, and ``index`` is the position just before that ``/*``
        (``-1`` when the comment starts the text).  When no ``/*`` is found,
        everything up to *i* (leading whitespace removed) is returned with
        index ``-1``, as before.
        ``False`` when no ``*/`` ends at *i*.
    """
    # Two real characters are needed at i-1 and i. Without this guard a
    # negative index silently wraps around to the END of the text, so a file
    # that merely ends with "*/" was reported as a comment at i == -1.
    if i < 1:
        return False
    if xmlstr[i - 1] + xmlstr[i] != "*/":
        return False

    # Nearest opener whose two characters both lie before index i.
    opener = xmlstr.rfind("/*", 0, i)
    if opener == -1:
        return (xmlstr[:i + 1].lstrip(), -1)
    return (xmlstr[opener:i + 1].lstrip(), opener - 1)

In [None]:
def extract_preceding_code(unique_comments, file_name):
    """Locate each line comment in *file_name* and collect the text preceding it.

    For every comment a regular expression is built:

    * comments containing two or more ``//`` are cut at every ``//``, each
      piece is stripped, and the pieces are joined with ``\\s*`` so that the
      whitespace between consecutive comment lines may differ;
    * any other comment is matched via ``re.escape`` followed by a newline.

    From the start of every match the text is walked backwards one character
    at a time.  Comments found directly before the match (``//`` lines or
    ``/* ... */`` blocks, before any code was reached) are prepended to the
    stored comment; other text is accumulated as preceding code.  The walk
    stops at two newline characters in a row once code was reached, or when a
    further comment is met after code was reached.

    Args:
        unique_comments: iterable of comment strings to look for.
        file_name: path of the file to search (read as UTF-8, undecodable
            bytes are ignored).

    Returns:
        dict with three parallel lists, one entry per match:
        ``"comment"`` (the comment, possibly extended by earlier comments),
        ``"preceding_code"`` and ``"is_first_comment_of_block_level"`` (True
        when a ``{`` was met before any other non-space character).
    """
    preceding_code = ""
    count_new_line = 0
    comments_and_preceding_code = {"comment": [], "preceding_code": [], "is_first_comment_of_block_level": []}
    is_first_comment_of_block_level = False
    have_reached_normal_chars = False
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # Raw string: "\&" is not a valid escape sequence in a normal literal.
        # Turns the escaped entity "\&amp;" into a plain "&".
        escape_comment = re.escape(comment).replace(r"\&amp;", "&")
        initial_comment = comment

        if comment.count("//") >= 2:
            # Several "//" markers: build a pattern tolerant to the whitespace
            # between the individual pieces.
            first_comment_index = comment.find("//")
            second_comment_index = 0
            regex_consecutive_line_cmts = ""

            for j in range(comment.count("//")):
                if first_comment_index == comment.find("//"):
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"
                    first_comment_index += 2
                elif j == comment.count("//") - 1:
                    first_comment_index = second_comment_index
                    regex_consecutive_line_cmts += comment[first_comment_index:].strip()
                else:
                    first_comment_index = second_comment_index
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"

            regex_consecutive_line_cmts = escape_special_chars_except_s(regex_consecutive_line_cmts)
            start_index = [m.start() for m in re.finditer(regex_consecutive_line_cmts, xmlstr)]
        else:
            start_index = [m.start() for m in re.finditer(escape_comment + "\n", xmlstr)]

        for s_idx in start_index:
            # cur_code collects the current line in REVERSED order.
            cur_code = ""
            i = s_idx - 1
            comment = initial_comment

            while i >= 0:
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Two newlines in a row before any code: keep going
                        # only if another comment sits right before them.
                        if is_a_pre_line_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_line_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        elif is_a_pre_block_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        break
                    else:
                        # End of a line: classify what was just walked over.
                        if cur_code[::-1].lstrip().find("//") == 0:
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        else:
                            if cur_code.isspace() == False and cur_code != "":
                                have_reached_code = True
                            preceding_code += cur_code
                            cur_code = xmlstr[i]
                        if is_a_pre_block_cmt(xmlstr, i-1) != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                                comment = cur_str + "\n" + comment
                                cur_code = xmlstr[i]
                                if i == -1:
                                    break
                            else:
                                break
                elif xmlstr[i] == "{" and have_reached_normal_chars == False:
                    count_new_line = 0
                    have_reached_normal_chars = True
                    is_first_comment_of_block_level = True
                    cur_code += xmlstr[i]
                else:
                    count_new_line = 0
                    if xmlstr[i] != " ":
                        have_reached_normal_chars = True
                    cur_code += xmlstr[i]
                i -= 1
                if i == -1:
                    # Start of the text reached: flush the pending line.
                    if cur_code[::-1].lstrip().find("//") == 0:
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    else:
                        preceding_code += cur_code
            comments_and_preceding_code["comment"].append(comment)
            # The code was gathered back to front, so reverse it once here.
            comments_and_preceding_code["preceding_code"].append(preceding_code[::-1])
            comments_and_preceding_code["is_first_comment_of_block_level"].append(is_first_comment_of_block_level)
            # Reset the per-match state.
            preceding_code = ""
            count_new_line = 0
            is_first_comment_of_block_level = False
            have_reached_normal_chars = False
            have_reached_code = False
    return comments_and_preceding_code

### Extract preceding code - block comments

In [None]:
def extract_preceding_code_block_cmts(unique_comments, file_name):
    """Locate each block comment in *file_name* and collect the text preceding it.

    Every comment is searched for literally (``re.escape``).  From the start
    of every match the text is walked backwards one character at a time.
    Comments found directly before the match (``//`` lines or ``/* ... */``
    blocks, before any code was reached) are prepended to the stored comment;
    other text is accumulated as preceding code.  The walk stops at two
    newline characters in a row once code was reached, or when a further
    comment is met after code was reached.

    Args:
        unique_comments: iterable of comment strings to look for.
        file_name: path of the file to search (read as UTF-8, undecodable
            bytes are ignored).

    Returns:
        dict with three parallel lists, one entry per match:
        ``"comment"`` (the comment, possibly extended by earlier comments),
        ``"preceding_code"`` and ``"is_first_comment_of_block_level"`` (True
        when a ``{`` was met before any other non-space character).
    """
    preceding_code = ""
    count_new_line = 0
    comments_and_preceding_code = {"comment": [], "preceding_code": [], "is_first_comment_of_block_level": []}
    is_first_comment_of_block_level = False
    have_reached_normal_chars = False
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # Raw string: "\&" is not a valid escape sequence in a normal literal.
        # Turns the escaped entity "\&amp;" into a plain "&".
        escape_comment = re.escape(comment).replace(r"\&amp;", "&")
        initial_comment = comment
        start_index = [m.start() for m in re.finditer(escape_comment, xmlstr)]

        for s_idx in start_index:
            # cur_code collects the current line in REVERSED order.
            cur_code = ""
            i = s_idx - 1
            comment = initial_comment

            while i >= 0:
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Two newlines in a row before any code: keep going
                        # only if another comment sits right before them.
                        if is_a_pre_line_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_line_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        elif is_a_pre_block_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        break
                    else:
                        # End of a line: classify what was just walked over.
                        if cur_code[::-1].lstrip().find("//") == 0:
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        else:
                            if cur_code.isspace() == False and cur_code != "":
                                have_reached_code = True
                            preceding_code += cur_code
                            cur_code = xmlstr[i]
                        if is_a_pre_block_cmt(xmlstr, i-1) != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                                comment = cur_str + "\n" + comment
                                cur_code = xmlstr[i]
                                if i == -1:
                                    break
                            else:
                                break
                elif xmlstr[i] == "{" and have_reached_normal_chars == False:
                    count_new_line = 0
                    have_reached_normal_chars = True
                    is_first_comment_of_block_level = True
                    cur_code += xmlstr[i]
                else:
                    count_new_line = 0
                    if xmlstr[i] != " ":
                        have_reached_normal_chars = True
                    cur_code += xmlstr[i]
                i -= 1
                if i == -1:
                    # Start of the text reached: flush the pending line.
                    if cur_code[::-1].lstrip().find("//") == 0:
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    else:
                        preceding_code += cur_code
            comments_and_preceding_code["comment"].append(comment)
            # The code was gathered back to front, so reverse it once here.
            comments_and_preceding_code["preceding_code"].append(preceding_code[::-1])
            comments_and_preceding_code["is_first_comment_of_block_level"].append(is_first_comment_of_block_level)
            # Reset the per-match state.
            preceding_code = ""
            count_new_line = 0
            is_first_comment_of_block_level = False
            have_reached_normal_chars = False
            have_reached_code = False
    return comments_and_preceding_code

### Extract preceding code - handles inconsistent indentation of consecutive line comments

In [None]:
def extract_preceding_code_solved_indentation(unique_comments, file_name):
    """Locate each line comment in *file_name* and collect the text preceding it.

    For every comment a regular expression is built:

    * comments containing two or more ``//`` are cut at every ``//``, each
      piece is stripped, and the pieces are joined with ``\\s*`` so that the
      whitespace between consecutive comment lines may differ;
    * any other comment is matched via ``re.escape`` followed by a newline.

    The stored comment starts out as the text actually matched in the file
    (stripped), not the input string, so it keeps the file's own whitespace
    between comment lines.  From the start of every match the text is walked
    backwards one character at a time.  Comments found directly before the
    match (before any code was reached) are prepended to the stored comment;
    other text is accumulated as preceding code.  The walk stops at two
    newline characters in a row once code was reached, or when a further
    comment is met after code was reached.

    Args:
        unique_comments: iterable of comment strings to look for.
        file_name: path of the file to search (read as UTF-8, undecodable
            bytes are ignored).

    Returns:
        dict with three parallel lists, one entry per match:
        ``"comment"``, ``"preceding_code"`` and
        ``"is_first_comment_of_block_level"`` (True when a ``{`` was met
        before any other non-space character).
    """
    preceding_code = ""
    count_new_line = 0
    comments_and_preceding_code = {"comment": [], "preceding_code": [], "is_first_comment_of_block_level": []}
    is_first_comment_of_block_level = False
    have_reached_normal_chars = False
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # Raw string: "\&" is not a valid escape sequence in a normal literal.
        # Turns the escaped entity "\&amp;" into a plain "&".
        escape_comment = re.escape(comment).replace(r"\&amp;", "&")

        if comment.count("//") >= 2:
            # Several "//" markers: build a pattern tolerant to the whitespace
            # between the individual pieces.
            first_comment_index = comment.find("//")
            second_comment_index = 0
            regex_consecutive_line_cmts = ""

            for j in range(comment.count("//")):
                if first_comment_index == comment.find("//"):
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"
                    first_comment_index += 2
                elif j == comment.count("//") - 1:
                    first_comment_index = second_comment_index
                    regex_consecutive_line_cmts += comment[first_comment_index:].strip()
                else:
                    first_comment_index = second_comment_index
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"

            regex_consecutive_line_cmts = escape_special_chars_except_s(regex_consecutive_line_cmts)
            start_index = [(m.start(), m.end()) for m in re.finditer(regex_consecutive_line_cmts, xmlstr)]
        else:
            start_index = [(m.start(), m.end()) for m in re.finditer(escape_comment + "\n", xmlstr)]

        for (s_idx, e_idx) in start_index:
            # cur_code collects the current line in REVERSED order.
            cur_code = ""
            i = s_idx - 1
            # Use the text as it appears in the file.
            comment = xmlstr[s_idx:e_idx].strip()
            while i >= 0:
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Two newlines in a row before any code: keep going
                        # only if another comment sits right before them.
                        if is_a_pre_line_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_line_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        elif is_a_pre_block_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        break
                    else:
                        # End of a line: classify what was just walked over.
                        if cur_code[::-1].lstrip().find("//") == 0:
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        else:
                            if cur_code.isspace() == False and cur_code != "":
                                have_reached_code = True
                            preceding_code += cur_code
                            cur_code = xmlstr[i]
                        if is_a_pre_block_cmt(xmlstr, i-1) != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                                comment = cur_str + "\n" + comment
                                cur_code = xmlstr[i]
                                if i == -1:
                                    break
                            else:
                                break
                elif xmlstr[i] == "{" and have_reached_normal_chars == False:
                    count_new_line = 0
                    have_reached_normal_chars = True
                    is_first_comment_of_block_level = True
                    cur_code += xmlstr[i]
                else:
                    count_new_line = 0
                    if xmlstr[i] != " ":
                        have_reached_normal_chars = True
                    cur_code += xmlstr[i]
                i -= 1
                if i == -1:
                    # Start of the text reached: flush the pending line.
                    if cur_code[::-1].lstrip().find("//") == 0:
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    else:
                        preceding_code += cur_code
            comments_and_preceding_code["comment"].append(comment)
            # The code was gathered back to front, so reverse it once here.
            comments_and_preceding_code["preceding_code"].append(preceding_code[::-1])
            comments_and_preceding_code["is_first_comment_of_block_level"].append(is_first_comment_of_block_level)
            # Reset the per-match state.
            preceding_code = ""
            count_new_line = 0
            is_first_comment_of_block_level = False
            have_reached_normal_chars = False
            have_reached_code = False
    return comments_and_preceding_code

In [None]:
def extract_preceding_code_block_cmts_solved_indentation(unique_comments, file_name):
    """Locate each block comment in *file_name* and collect the text preceding it.

    Every comment is searched for literally (``re.escape``).  The stored
    comment starts out as the text actually matched in the file (stripped),
    not the input string.  From the start of every match the text is walked
    backwards one character at a time.  Comments found directly before the
    match (before any code was reached) are prepended to the stored comment;
    other text is accumulated as preceding code.  The walk stops at two
    newline characters in a row once code was reached, or when a further
    comment is met after code was reached.

    Args:
        unique_comments: iterable of comment strings to look for.
        file_name: path of the file to search (read as UTF-8, undecodable
            bytes are ignored).

    Returns:
        dict with three parallel lists, one entry per match:
        ``"comment"``, ``"preceding_code"`` and
        ``"is_first_comment_of_block_level"`` (True when a ``{`` was met
        before any other non-space character).
    """
    preceding_code = ""
    count_new_line = 0
    comments_and_preceding_code = {"comment": [], "preceding_code": [], "is_first_comment_of_block_level": []}
    is_first_comment_of_block_level = False
    have_reached_normal_chars = False
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # Raw string: "\&" is not a valid escape sequence in a normal literal.
        # Turns the escaped entity "\&amp;" into a plain "&".
        escape_comment = re.escape(comment).replace(r"\&amp;", "&")
        start_index = [(m.start(), m.end()) for m in re.finditer(escape_comment, xmlstr)]

        for (s_idx, e_idx) in start_index:
            # cur_code collects the current line in REVERSED order.
            cur_code = ""
            i = s_idx - 1
            # Use the text as it appears in the file.
            comment = xmlstr[s_idx:e_idx].strip()
            while i >= 0:
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Two newlines in a row before any code: keep going
                        # only if another comment sits right before them.
                        if is_a_pre_line_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_line_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        elif is_a_pre_block_cmt(xmlstr, i-1) != False:
                            cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                            comment = cur_str + "\n\n" + comment
                            count_new_line = 1
                            if i == -1:
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        break
                    else:
                        # End of a line: classify what was just walked over.
                        if cur_code[::-1].lstrip().find("//") == 0:
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                            if have_reached_code == False:
                                comment = cur_code[::-1].lstrip() + comment
                                cur_code = xmlstr[i]
                            else:
                                break
                        else:
                            if cur_code.isspace() == False and cur_code != "":
                                have_reached_code = True
                            preceding_code += cur_code
                            cur_code = xmlstr[i]
                        if is_a_pre_block_cmt(xmlstr, i-1) != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_pre_block_cmt(xmlstr, i-1)
                                comment = cur_str + "\n" + comment
                                cur_code = xmlstr[i]
                                if i == -1:
                                    break
                            else:
                                break
                elif xmlstr[i] == "{" and have_reached_normal_chars == False:
                    count_new_line = 0
                    have_reached_normal_chars = True
                    is_first_comment_of_block_level = True
                    cur_code += xmlstr[i]
                else:
                    count_new_line = 0
                    if xmlstr[i] != " ":
                        have_reached_normal_chars = True
                    cur_code += xmlstr[i]
                i -= 1
                if i == -1:
                    # Start of the text reached: flush the pending line.
                    if cur_code[::-1].lstrip().find("//") == 0:
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    elif cur_code.strip()[-2:][::-1] == "/*" and cur_code.strip()[:2][::-1] == "*/":
                        if have_reached_code == False:
                            comment = cur_code[::-1].lstrip() + comment
                        else:
                            break
                    else:
                        preceding_code += cur_code
            comments_and_preceding_code["comment"].append(comment)
            # The code was gathered back to front, so reverse it once here.
            comments_and_preceding_code["preceding_code"].append(preceding_code[::-1])
            comments_and_preceding_code["is_first_comment_of_block_level"].append(is_first_comment_of_block_level)
            # Reset the per-match state.
            preceding_code = ""
            count_new_line = 0
            is_first_comment_of_block_level = False
            have_reached_normal_chars = False
            have_reached_code = False
    return comments_and_preceding_code

### Extract succeeding code - line comments

In [None]:
def is_a_post_line_cmt(xmlstr, i):
    """Classify the text from index *i* up to the end of its line.

    Returns one of:
        * ``(comment_text, end)`` when the text, with leading whitespace
          removed, starts with ``//``;
        * ``(False, code_text)`` when ``//`` appears later on the line;
          ``code_text`` is the part of the line before the ``//``;
        * ``(False, end)`` when the line contains no ``//`` at all.

    ``end`` is the index of the line's newline, or the index where the scan
    stopped when the text ends first.
    """
    size = len(xmlstr)
    line_end = i
    while line_end < size and xmlstr[line_end] != "\n":
        line_end += 1

    rest_of_line = xmlstr[i:line_end]
    stripped = rest_of_line.lstrip()
    marker_pos = stripped.find("//")

    if marker_pos == 0:
        return (stripped, line_end)
    if marker_pos > 0:
        # Code first, comment afterwards: hand back the code part only.
        return (False, rest_of_line[:rest_of_line.find("//")])
    return (False, line_end)

In [None]:
def is_a_post_block_cmt(xmlstr, i):
    """Classify the text at index *i*, looking for a ``/* ... */`` comment.

    Space characters starting at *i* are skipped first.

    Returns one of:
        * ``(comment_text, end)`` when ``/*`` follows the skipped spaces;
          the text runs up to and including the first ``*/`` found after the
          opening ``/`` (or to the end of *xmlstr*), ``end`` is the index
          right after it;
        * ``(False, code_text)`` when ``/*`` appears later on the same line;
          ``code_text`` is the part of the line (from *i*) before the ``/*``;
        * ``(False, index)`` otherwise, where ``index`` is the line's newline
          position, or the position reached when fewer than two characters
          remain after the spaces.
    """
    size = len(xmlstr)
    begin = i
    while i < size and xmlstr[i] == " ":
        i += 1

    # Fewer than two characters left: no room for a "/*".
    if i >= size - 1:
        return (False, i)

    if xmlstr[i] + xmlstr[i + 1] == "/*":
        # The closing "*/" may share its "*" with the opener ("/*/").
        close = xmlstr.find("*/", i + 1)
        end = size if close == -1 else close + 2
        return (xmlstr[begin:end].lstrip(), end)

    line_end = i
    while line_end < size and xmlstr[line_end] != "\n":
        line_end += 1
    line_text = xmlstr[begin:line_end]
    if line_text.lstrip().find("/*") > 0:
        return (False, line_text[:line_text.find("/*")])
    return (False, line_end)

In [None]:
def extract_succeeding_code(unique_comments, file_name):
    """Locate each line comment in *file_name* and collect the text following it.

    For every comment a regular expression is built:

    * comments containing two or more ``//`` are cut at every ``//``, each
      piece is stripped, and the pieces are joined with ``\\s*`` so that the
      whitespace between consecutive comment lines may differ;
    * any other comment is matched via ``re.escape`` followed by a newline
      (the walk then starts on that newline).

    From the end of every match the text is walked forwards.  Comments met
    before any code was reached are appended to the "succeeding comment";
    other text is accumulated as succeeding code.  The walk stops at two
    newline characters in a row once code was reached, when a comment is met
    after code was reached, or after a line that carries code followed by a
    comment (only the code part of such a line is kept).

    Args:
        unique_comments: iterable of comment strings to look for.
        file_name: path of the file to search (read as UTF-8, undecodable
            bytes are ignored).

    Returns:
        dict with two parallel lists, one entry per match:
        ``"succeeding_comment"`` and ``"succeeding_code"``.
    """
    succeeding_code = ""
    count_new_line = 0
    comments_and_succeeding_code = {"succeeding_comment": [], "succeeding_code": []}
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # Raw string: "\&" is not a valid escape sequence in a normal literal.
        # Turns the escaped entity "\&amp;" into a plain "&".
        escape_comment = re.escape(comment).replace(r"\&amp;", "&") + "\n"

        if comment.count("//") >= 2:
            # Several "//" markers: build a pattern tolerant to the whitespace
            # between the individual pieces.
            first_comment_index = comment.find("//")
            second_comment_index = 0
            regex_consecutive_line_cmts = ""

            for j in range(comment.count("//")):
                if first_comment_index == comment.find("//"):
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"
                    first_comment_index += 2
                elif j == comment.count("//") - 1:
                    first_comment_index = second_comment_index
                    regex_consecutive_line_cmts += comment[first_comment_index:].strip()
                else:
                    first_comment_index = second_comment_index
                    second_comment_index = comment.find("//", first_comment_index + 2)
                    regex_consecutive_line_cmts += comment[first_comment_index:second_comment_index].strip() + r"\s*"

            regex_consecutive_line_cmts = escape_special_chars_except_s(regex_consecutive_line_cmts)
            end_index = [m.end() for m in re.finditer(regex_consecutive_line_cmts, xmlstr)]
        else:
            # -1 so that the walk starts on the newline matched by the pattern.
            end_index = [m.end()-1 for m in re.finditer(escape_comment, xmlstr)]
        for e_idx in end_index:
            cur_code = ""
            i = e_idx
            comment = ""
            while i < len(xmlstr):
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Two newlines in a row before any code: keep going
                        # only if another comment follows immediately.
                        if is_a_post_line_cmt(xmlstr, i+1)[0] != False:
                            cur_str, i = is_a_post_line_cmt(xmlstr, i+1)
                            comment = comment + "\n\n" + cur_str
                            count_new_line = 1
                            if i == len(xmlstr):
                                break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] != False:
                            cur_str, i = is_a_post_block_cmt(xmlstr, i+1)
                            comment = comment + "\n\n" + cur_str
                            count_new_line = 1
                            if i == len(xmlstr):
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        break
                    else:
                        if cur_code.isspace() == False and cur_code != "":
                            have_reached_code = True
                        succeeding_code += cur_code
                        cur_code = xmlstr[i]

                        if is_a_post_line_cmt(xmlstr, i+1)[0] != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_post_line_cmt(xmlstr, i+1)
                                comment = comment + "\n" + cur_str
                                # Check for end of text BEFORE indexing: the
                                # helper returns len(xmlstr) when the comment
                                # is the last thing in the file.
                                if i == len(xmlstr):
                                    break
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_post_block_cmt(xmlstr, i+1)
                                comment = comment + "\n" + cur_str
                                if i == len(xmlstr):
                                    break
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif is_a_post_line_cmt(xmlstr, i+1)[0] == False and type(is_a_post_line_cmt(xmlstr, i+1)[1]) == str:
                            # Next line is "code // comment": keep the code part and stop.
                            succeeding_code = succeeding_code + "\n" + is_a_post_line_cmt(xmlstr, i+1)[1]
                            break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] == False and type(is_a_post_block_cmt(xmlstr, i+1)[1]) == str:
                            # Next line is "code /* comment": keep the code part and stop.
                            succeeding_code = succeeding_code + "\n" + is_a_post_block_cmt(xmlstr, i+1)[1]
                            break
                else:
                    if is_a_post_line_cmt(xmlstr, i)[0] != False:
                        if have_reached_code == False:
                            cur_str, i = is_a_post_line_cmt(xmlstr, i)
                            comment = comment + "\n" + cur_str
                            if i == len(xmlstr):
                                break
                            cur_code = xmlstr[i]
                        else:
                            break
                    elif is_a_post_block_cmt(xmlstr, i)[0] != False:
                        if have_reached_code == False:
                            cur_str, i = is_a_post_block_cmt(xmlstr, i)
                            comment = comment + "\n" + cur_str
                            if i == len(xmlstr):
                                break
                            cur_code = xmlstr[i]
                        else:
                            break
                    else:
                        count_new_line = 0
                        cur_code += xmlstr[i]
                i += 1
                if i == len(xmlstr):
                    # End of the text reached: flush the pending line.
                    succeeding_code += cur_code
            comments_and_succeeding_code["succeeding_comment"].append(comment)
            comments_and_succeeding_code["succeeding_code"].append(succeeding_code)
            # Reset the per-match state.
            succeeding_code = ""
            count_new_line = 0
            have_reached_code = False
    return comments_and_succeeding_code

### Extract succeeding code - block comments

In [None]:
def extract_succeeding_code_block_cmts(unique_comments, file_name):
    """Collect the text that follows every occurrence of each block comment.

    The file is read as UTF-8 (undecodable bytes are ignored). Each comment is
    searched for literally, and from the end of every match the text is
    scanned forward one character at a time:

    * Completed lines are accumulated as the succeeding code.
    * Comments met before any non-whitespace code has been seen are appended
      to the "succeeding comment" instead, and scanning resumes after them.
    * Scanning stops at two consecutive newlines (unless a comment follows and
      no code has been seen yet), at a comment that follows code, or at the
      end of the text.

    Relies on ``is_a_post_line_cmt`` / ``is_a_post_block_cmt`` defined
    elsewhere in this notebook. From the way they are used here they return a
    2-tuple: ``(comment_text, next_index)`` when a comment starts at the given
    position, otherwise ``(False, x)`` where a ``str`` ``x`` is appended to the
    succeeding code -- TODO confirm against the helper definitions.

    Args:
        unique_comments: Iterable of comment strings to look for.
        file_name: Path of the text file to scan.

    Returns:
        dict with two parallel lists, ``"succeeding_comment"`` and
        ``"succeeding_code"``, holding one entry per match, ordered by comment
        and then by match position.
    """
    succeeding_code = ""
    count_new_line = 0
    comments_and_succeeding_code = {"succeeding_comment": [], "succeeding_code": []}
    have_reached_code = False

    with open(file_name, 'r', encoding="utf8", errors='ignore') as f:
        xmlstr = f.read()

    for comment in unique_comments:
        # re.escape() backslash-escapes "&", so an XML-encoded "&amp;" shows up as
        # "\&amp;"; map it to a bare "&" so the pattern matches the raw file text
        # (presumably the comment strings come from the XML export -- verify).
        escape_comment = re.escape(comment).replace(r"\&amp;", "&")
        end_index = [m.end() for m in re.finditer(escape_comment, xmlstr)]

        for e_idx in end_index:
            cur_code = ""  # text of the line currently being read
            i = e_idx
            # Comments found after the match; a separate name keeps the loop
            # variable `comment` intact.
            trailing_comment = ""
            while i < len(xmlstr):
                if xmlstr[i] == "\n":
                    count_new_line += 1
                    if count_new_line == 2 and have_reached_code == False:
                        # Blank line before any code: keep going only if a comment follows.
                        if is_a_post_line_cmt(xmlstr, i+1)[0] != False:
                            cur_str, i = is_a_post_line_cmt(xmlstr, i+1)
                            trailing_comment = trailing_comment + "\n\n" + cur_str
                            count_new_line = 1
                            if i >= len(xmlstr):
                                break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] != False:
                            cur_str, i = is_a_post_block_cmt(xmlstr, i+1)
                            trailing_comment = trailing_comment + "\n\n" + cur_str
                            count_new_line = 1
                            if i >= len(xmlstr):
                                break
                        else:
                            break
                        cur_code = xmlstr[i]
                    elif count_new_line == 2 and have_reached_code == True:
                        # Blank line after code ends the succeeding-code snippet.
                        break
                    else:
                        # End of a line: flush it and note whether it held real code.
                        if cur_code.isspace() == False and cur_code != "":
                            have_reached_code = True
                        succeeding_code += cur_code
                        cur_code = xmlstr[i]

                        # `!= False` (not truthiness) is deliberate: an empty comment
                        # string must still count as "comment found".
                        if is_a_post_line_cmt(xmlstr, i+1)[0] != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_post_line_cmt(xmlstr, i+1)
                                trailing_comment = trailing_comment + "\n" + cur_str
                                # Check the bound before indexing: the helper may
                                # hand back the end-of-text position.
                                if i >= len(xmlstr):
                                    break
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] != False:
                            if have_reached_code == False:
                                cur_str, i = is_a_post_block_cmt(xmlstr, i+1)
                                trailing_comment = trailing_comment + "\n" + cur_str
                                if i >= len(xmlstr):
                                    break
                                cur_code = xmlstr[i]
                            else:
                                break
                        elif is_a_post_line_cmt(xmlstr, i+1)[0] == False and type(is_a_post_line_cmt(xmlstr, i+1)[1]) == str:
                            succeeding_code = succeeding_code + "\n" + is_a_post_line_cmt(xmlstr, i+1)[1]
                            break
                        elif is_a_post_block_cmt(xmlstr, i+1)[0] == False and type(is_a_post_block_cmt(xmlstr, i+1)[1]) == str:
                            succeeding_code = succeeding_code + "\n" + is_a_post_block_cmt(xmlstr, i+1)[1]
                            break
                else:
                    if is_a_post_line_cmt(xmlstr, i)[0] != False:
                        if have_reached_code == False:
                            cur_str, i = is_a_post_line_cmt(xmlstr, i)
                            trailing_comment = trailing_comment + "\n" + cur_str
                            if i >= len(xmlstr):
                                break
                            cur_code = xmlstr[i]
                        else:
                            break
                    elif is_a_post_block_cmt(xmlstr, i)[0] != False:
                        if have_reached_code == False:
                            cur_str, i = is_a_post_block_cmt(xmlstr, i)
                            trailing_comment = trailing_comment + "\n" + cur_str
                            if i >= len(xmlstr):
                                break
                            cur_code = xmlstr[i]
                        else:
                            break
                    else:
                        # Ordinary character: it breaks any run of newlines.
                        count_new_line = 0
                        cur_code += xmlstr[i]
                i += 1
                if i == len(xmlstr):
                    # Ran off the end of the text: keep the unfinished last line.
                    succeeding_code += cur_code
            comments_and_succeeding_code["succeeding_comment"].append(trailing_comment)
            comments_and_succeeding_code["succeeding_code"].append(succeeding_code)
            # Reset the per-match scan state.
            succeeding_code = ""
            count_new_line = 0
            have_reached_code = False
    return comments_and_succeeding_code

### Process and merge DataFrames

In [None]:
import pandas as pd

# Placeholder: set this to the path of the file that the extractors should scan.
cpp_file_path = "path_to_cpp_file"

# Code found before / after each comment, one DataFrame per extractor.
df_preceding_code = pd.DataFrame.from_dict(
    extract_preceding_code(unique_line_comments, cpp_file_path)
)
df_preceding_code_block_cmts = pd.DataFrame.from_dict(
    extract_preceding_code_block_cmts(unique_block_comments, cpp_file_path)
)
df_succeeding_code = pd.DataFrame.from_dict(
    extract_succeeding_code(unique_line_comments, cpp_file_path)
)
df_succeeding_code_block_cmts = pd.DataFrame.from_dict(
    extract_succeeding_code_block_cmts(unique_block_comments, cpp_file_path)
)

# Same preceding-code extraction, using the "solved indentation" variants.
df_preceding_code_solved_indentation = pd.DataFrame.from_dict(
    extract_preceding_code_solved_indentation(unique_line_comments, cpp_file_path)
)
df_preceding_code_block_cmts_solved_indentation = pd.DataFrame.from_dict(
    extract_preceding_code_block_cmts_solved_indentation(unique_block_comments, cpp_file_path)
)

In [None]:
# From the "solved indentation" frames, rename 'comment' to a distinct column
# name and drop 'preceding_code' / 'is_first_comment_of_block_level'
# (presumably so they do not collide with the same-named columns of the frames
# they are concatenated with later -- verify).
df_preceding_code_solved_indentation = (
    df_preceding_code_solved_indentation
    .rename(columns={'comment': 'comment_solved_indentation'})
    .drop(columns=['preceding_code', 'is_first_comment_of_block_level'])
)
df_preceding_code_block_cmts_solved_indentation = (
    df_preceding_code_block_cmts_solved_indentation
    .rename(columns={'comment': 'comment_solved_indentation'})
    .drop(columns=['preceding_code', 'is_first_comment_of_block_level'])
)

In [None]:
# Column-wise join of the three line-comment frames (rows are aligned on the index).
line_comment_frames = [
    df_preceding_code,
    df_succeeding_code,
    df_preceding_code_solved_indentation,
]
df_line_cmts = pd.concat(line_comment_frames, axis="columns")
df_line_cmts

In [None]:
# Column-wise join of the three block-comment frames (rows are aligned on the index).
block_comment_frames = [
    df_preceding_code_block_cmts,
    df_succeeding_code_block_cmts,
    df_preceding_code_block_cmts_solved_indentation,
]
df_block_cmts = pd.concat(block_comment_frames, axis="columns")
df_block_cmts

In [None]:
# Append the comment text found after each comment to both comment columns,
# for the line-comment and the block-comment frame alike.
for _frame in (df_line_cmts, df_block_cmts):
    for _column in ("comment", "comment_solved_indentation"):
        _frame[_column] = _frame[_column] + _frame["succeeding_comment"]

In [None]:
# 'succeeding_comment' has been folded into the comment columns; remove it.
df_line_cmts = df_line_cmts.drop(columns='succeeding_comment')
df_block_cmts = df_block_cmts.drop(columns='succeeding_comment')

In [None]:
# Display the current column labels (and their order) of the line-comment frame.
df_line_cmts.columns

In [None]:
# Final column order shared by the line-comment and block-comment frames.
final_columns = [
    'comment',
    'preceding_code',
    'succeeding_code',
    'is_first_comment_of_block_level',
    'comment_solved_indentation',
]
# .copy() makes each selection an independent frame, so the later
# `.loc[...] = ...` assignments do not raise SettingWithCopyWarning.
df_line_cmts = df_line_cmts[final_columns].copy()
df_block_cmts = df_block_cmts[final_columns].copy()

In [None]:
# For rows flagged as the first comment of their block level, replace
# 'preceding_code' with preceding code + indentation-resolved comment +
# succeeding code. The full series is assigned; .loc aligns it to the masked rows.
line_first_of_level = df_line_cmts['is_first_comment_of_block_level']
line_full_context = (
    df_line_cmts['preceding_code']
    + df_line_cmts['comment_solved_indentation']
    + df_line_cmts['succeeding_code']
)
df_line_cmts.loc[line_first_of_level, 'preceding_code'] = line_full_context
df_line_cmts = df_line_cmts.drop(columns='comment_solved_indentation')
df_line_cmts

In [None]:
# For rows flagged as the first comment of their block level, replace
# 'preceding_code' with preceding code + indentation-resolved comment +
# succeeding code. The full series is assigned; .loc aligns it to the masked rows.
block_first_of_level = df_block_cmts['is_first_comment_of_block_level']
block_full_context = (
    df_block_cmts['preceding_code']
    + df_block_cmts['comment_solved_indentation']
    + df_block_cmts['succeeding_code']
)
df_block_cmts.loc[block_first_of_level, 'preceding_code'] = block_full_context
df_block_cmts = df_block_cmts.drop(columns='comment_solved_indentation')
df_block_cmts

In [None]:
# Stack line-comment rows on top of block-comment rows. Each frame keeps its
# own index labels, so labels can repeat in the result.
final_df = pd.concat([df_line_cmts, df_block_cmts], axis="index")

In [None]:
# Remove rows that are identical in every column, keeping the first occurrence.
final_df = final_df.drop_duplicates(keep='first')
final_df

### Extract ETF SATD

In [None]:
def check_etf_satd(comments):
    """Flag comments that contain a known SATD keyword or phrase.

    Matching is case-insensitive (each comment is lower-cased once). A comment
    is flagged when it contains any of the fixed keywords as a substring, or
    matches one of the phrase patterns "temp... fix", "temp... patch",
    "temp... sol" or "work...around". The phrase patterns do not span line
    breaks because "." does not match a newline.

    Args:
        comments: Iterable of comment strings.

    Returns:
        List of ints, one per comment in input order: 1 if flagged, else 0.
    """
    keywords = (
        "todo", "fixme", "to-do", "to do", "fix me", "optimize:",
        "future:", "revisit", "xx", "kludge", "hack", "refactor",
        "bug:", "issue:",
    )
    # One compiled pattern covers all phrase checks. With re.search, a leading
    # or trailing lazy ".*?" never changes whether a match exists, so the
    # "<anything> temp..." variants collapse into the plain "temp..." ones.
    phrase_pattern = re.compile(r"temp.*? (?:fix|patch|sol)|work.*?around")

    etf_satd_comments = []
    for c in comments:
        lowered = c.lower()
        is_satd = (
            any(keyword in lowered for keyword in keywords)
            or phrase_pattern.search(lowered) is not None
        )
        etf_satd_comments.append(1 if is_satd else 0)
    return etf_satd_comments

In [None]:
# Add a 0/1 'etf_satd' column with the result of check_etf_satd for each row's comment.
comment_texts = final_df['comment'].values
final_df['etf_satd'] = check_etf_satd(comment_texts)
final_df

In [None]:
# ETF SATD comments: only the rows whose 'etf_satd' flag is 1.
final_df.loc[final_df['etf_satd'] == 1]