# In [None]:

#author: Roufa
#input: PDF
#Extract titles, sub-titles, sections, sub-sections, headings and sub-headings from the book,
#along with the page and line numbers of the extracted text.

#Check whether chapter names are in title case; any that are not are reported as inconsistent.
#Check whether sections and headings are in title case, sentence case or some other case.
#Output: for headings and sections, report the minority case as inconsistent, together with any other case found.

import sys
sys.path.append('/data/copy_assessment_tool/modules_final/')

from collections import Counter
import pdfplumber
from fuzzywuzzy import fuzz
import numpy as np

from programmatic.TOC.extract_TOC import TOC_num
from utils import *
from programmatic.TOC.title_sentence_case_consistency import *
from programmatic.json_output import return_json_result




def coordinate_approach(coordinate_dict,unique_cords):
    '''
    Group TOC entries by the level name assigned to their coordinate.

    The i-th value of ``unique_cords`` is paired with the i-th predefined level
    name ("Part", "Chapters", ...), so the caller decides the ordering (in this
    file the coordinates are passed sorted ascending, i.e. the smallest
    coordinate becomes "Part").

    Args:
        coordinate_dict: mapping of TOC entry text -> coordinate value.
        unique_cords: the distinct coordinate values, in level order.

    Returns:
        dict mapping level name -> list of entry texts, in the iteration order
        of ``coordinate_dict``. Only levels that received an entry are present.
    '''
    predefined_chapter_map = ["Part", "Chapters", "sections", "subsections", "Headings","others"]
    # Catch-all level for coordinates that have no predefined name.
    fallback_group = predefined_chapter_map[-1]

    # zip() stops at the shorter input, so at most six coordinates get a name.
    cord_map = dict(zip(unique_cords, predefined_chapter_map))

    result_map = {}
    for text, cord in coordinate_dict.items():
        # A seventh (or deeper) indentation level, or a coordinate missing from
        # unique_cords, used to raise KeyError here; file it under "others".
        groupname = cord_map.get(cord, fallback_group)
        result_map.setdefault(groupname, []).append(text)

    return result_map


def check_if_part_in_chapter(result_map):
    '''
    Re-classify TOC entries by their leading keyword.

    Only applied when ``result_map`` has at most two groups (i.e. the
    indentation gave at most two levels); with more groups every list in the
    result stays empty.

    Returns:
        dict with the keys 'chapters', 'parts' and 'sections', each a list of
        the original entry strings.
    '''
    preprocessed_data = {'chapters': [], 'parts': [], 'sections': []}

    # More than two indentation levels: keyword classification is not used.
    if len(result_map) > 2:
        return preprocessed_data

    for titles in result_map.values():
        for item in titles:
            lowered = item.lower()
            if lowered.startswith('chapter'):
                bucket = 'chapters'
            # NOTE(review): entries starting with "sections" are filed under
            # 'parts', exactly as before - confirm this is intended and not a
            # copy-paste slip for 'sections'.
            elif lowered.startswith(('part', 'sections')):
                bucket = 'parts'
            else:
                bucket = 'sections'
            preprocessed_data[bucket].append(item)

    return preprocessed_data

def check_number_sequencing(preprocessed_cleaned_dict):
    '''
    Classify TOC entries by their leading numbering.

    Rules, applied to every entry of every list in the input:
      * entry contains a '.':
          - starts with "part"/"Part"            -> "part"
          - "<digits>.<digits> ..." (e.g. 1.1)   -> "sections"
          - "<digits>. <word> ..."  (e.g. 1. X)  -> "chapters"
          - anything else is dropped
      * entry has no '.' and at least two space-separated tokens:
          - "<digits> <word> ..."                -> "chapters"
          - starts with "part"/"Part"            -> "part"
          - starts with "chapter"/"Chapter"      -> "chapters"
          - otherwise                            -> "sections"
      * single-token entries without a '.' are dropped.

    Returns:
        dict with the keys "chapters", "sections" and "part" (lists of entries).
    '''
    new_dictionary = {"chapters":[],"sections":[],"part":[]}

    for values in preprocessed_cleaned_dict.values():
        for val in values:
            result_split = val.split('.')

            if len(result_split) > 1:
                if val.startswith(("part","Part")):
                    new_dictionary["part"].append(val)
                    continue

                leading = result_split[0].strip()
                # First token after the first dot, e.g. "1" in "1.1 Intro".
                following = result_split[1].strip().split(' ')[0]
                if leading.isdigit() and following.isdigit():
                    new_dictionary["sections"].append(val)
                elif leading.isdigit() and following.isalpha():
                    new_dictionary["chapters"].append(val)
                continue

            space_spliting = val.split(' ')[:2]
            if len(space_spliting) <= 1:
                continue

            first_token = space_spliting[0].strip()
            second_token = space_spliting[1].strip().split(' ')[0]
            if first_token.isdigit() and second_token.isalpha():
                # Fixed: this used to *assign* the string to the "chapters" key,
                # replacing the list and breaking every later .append().
                new_dictionary["chapters"].append(val)
            elif val.startswith(("part","Part")):
                new_dictionary["part"].append(val)
            elif val.startswith(("chapter","Chapter")):
                new_dictionary["chapters"].append(val)
            else:
                new_dictionary["sections"].append(val)

    return new_dictionary
            
        
def check_final_call(cleaned_data, unique_cords):
    '''
    Run the three TOC classification strategies and return their results.

    Exactly one of the three returned dicts can be non-empty:
      * ``coo`` - grouping by indentation (coordinate_approach), used when it
        yields more than two non-empty groups;
      * ``cp``  - grouping by leading keyword (check_if_part_in_chapter), used
        when indentation yields at most two groups;
      * ``seq`` - grouping by numbering (check_number_sequencing), used when
        the keyword pass left only a "sections" group.

    Returns:
        tuple ``(coo, cp, seq)``; the strategies not selected are empty dicts.
    '''
    grouped = coordinate_approach(cleaned_data, unique_cords)
    by_indent = {name: titles for name, titles in grouped.items() if titles}

    # Three or more indentation levels: trust the indentation as-is.
    if len(by_indent) > 2:
        return by_indent, {}, {}

    by_keyword = {
        name: titles
        for name, titles in check_if_part_in_chapter(by_indent).items()
        if titles
    }

    # Keyword pass found chapters/parts (or nothing at all): stop here.
    if len(by_keyword) > 1 or "sections" not in by_keyword:
        return {}, by_keyword, {}

    # Only "sections" survived, so try to split them by their numbering.
    by_numbering = {
        name: titles
        for name, titles in check_number_sequencing(by_keyword).items()
        if titles
    }
    return {}, {}, by_numbering

def chapters_page_no(pdf,num,result_set):
    '''
    Locate TOC titles in the body of the book.

    Every page whose 0-based index is greater than ``num`` is cropped to the
    box (0, 0, 550, 770), its text lines are cleaned with ``clean_textlines``
    and fuzzy-matched (ratio >= 93) against the reference titles.

    Args:
        pdf: opened pdfplumber PDF.
        num: 0-based page index; only later pages are scanned (presumably the
            last table-of-contents page - confirm against TOC_num).
        result_set: dict of title lists. The "sections" list is used when the
            key exists, otherwise the "chapters" list.

    Returns:
        three parallel lists ``(chapters, page_nos, line_nos)``: the matched
        reference title and the 1-based page and line number of each match.
    '''
    chapters = []
    page_nos = []
    line_nos = []

    # The reference list does not depend on the page, so resolve it once.
    reference_titles = result_set.get("sections")
    if reference_titles is None:
        reference_titles = result_set.get("chapters")
    # Neither key present used to crash in enumerate(None); there is simply
    # nothing to match, so skip the (expensive) page extraction entirely.
    if not reference_titles:
        return chapters,page_nos,line_nos

    for page_no, page in enumerate(pdf.pages):
        # only pages after the table of contents
        if page_no <= num:
            continue

        text = page.filter(filter_superscript).within_bbox((0, 0, 550, 770)).extract_text_simple()
        # clean_textlines returns one cleaned line per input line, so the
        # enumerate() index is still the line position on the page.
        for line_no, text_line in enumerate(clean_textlines(text.split("\n"))):
            for res_val in reference_titles:
                if fuzz.ratio(text_line, res_val) >= 93:
                    chapters.append(res_val)
                    page_nos.append(page_no+1)
                    line_nos.append(line_no+1)

    return chapters,page_nos,line_nos


def append_chapters(chapters, page_nos, line_nos):
    '''
    Drop repeated (chapter, page, line) matches, keeping the first occurrence.

    Returns:
        ``(new_chapters, new_page_nos, new_line_nos, filtered)`` where the
        first three are the de-duplicated parallel lists and ``filtered`` holds
        the "chapter+page+line" key built for every kept match.
    '''
    already_seen = set()
    filtered = []
    new_chapters, new_page_nos, new_line_nos = [], [], []

    for title, page, line in zip(chapters, page_nos, line_nos):
        signature = f"{title}+{page}+{line}"
        if signature in already_seen:
            continue
        already_seen.add(signature)
        filtered.append(signature)
        new_chapters.append(title)
        new_page_nos.append(page)
        new_line_nos.append(line)

    return new_chapters,new_page_nos,new_line_nos,filtered


def bold_extraction_heading(pdf, new_chapters, new_page_nos, new_line_nos):
    '''
    Collect bold lines (candidate headings) from the pages between matched titles.

    A page is scanned when its index lies inside at least one window
    ``[new_page_nos[i], new_page_nos[i + 1]]`` of consecutive matched titles.
    A full text line counts as bold when it also appears among the lines of the
    bold-only extraction of the same page.

    Args:
        pdf: opened pdfplumber PDF.
        new_chapters: matched titles; only its length is used (number of windows).
        new_page_nos: page number of each matched title.
        new_line_nos: accepted for interface compatibility; not used.

    Returns:
        dict mapping the heading text (after ``test_remove_Toc``) to a list of
        1-based ``(page, line)`` tuples. Entries with an empty key are dropped.
    '''
    page_windows = [
        (new_page_nos[index], new_page_nos[index + 1])
        for index in range(len(new_chapters) - 1)
    ]

    dict_bold = {}

    # Each page is visited once. Previously the pages were re-extracted for
    # every window, and the "already added" test compared a 0-based tuple with
    # the stored 1-based tuples, so shared pages were recorded twice.
    for page_no, page in enumerate(pdf.pages):
        # NOTE(review): page_no is a 0-based index while chapters_page_no
        # reports 1-based page numbers, so every window looks shifted by one
        # page. Left unchanged - confirm the intended range.
        if not any(first <= page_no <= last for first, last in page_windows):
            continue

        bold_text = page.filter(filter_boldtext).filter(filter_superscript).extract_text()
        bold_lines = strip_word_in_line(bold_text.split("\n"))
        if bold_lines is None:
            continue

        textlines = page.extract_text().split("\n")
        bold_lines_content = set(bold_lines) & set(textlines)

        for line_number, content in enumerate(textlines):
            if content not in bold_lines_content:
                continue
            # Remove table of contents elements
            re_content = test_remove_Toc(content)
            dict_bold.setdefault(re_content, []).append((page_no+1, line_number+1))

    new_cleann_dict1 = {k: v for k, v in dict_bold.items() if k}
    return new_cleann_dict1


#extracting italic headings
def italic_content(pdf, new_chapters, new_page_nos):
    '''
    Collect short italic lines (candidate headings) from the pages between matched titles.

    A page is scanned when its index lies inside at least one window
    ``[new_page_nos[i], new_page_nos[i + 1]]``. On each page an italic line is
    kept when it is more than 30 characters shorter than the mean italic line
    length of that page and also appears verbatim in the full page text.

    Args:
        pdf: opened pdfplumber PDF.
        new_chapters: matched titles; only its length is used (number of windows).
        new_page_nos: page number of each matched title.

    Returns:
        dict mapping the heading text (after ``test_remove_Toc``) to a list of
        1-based ``(page, line)`` tuples. Entries with an empty key are dropped.
    '''
    page_windows = [
        (new_page_nos[index], new_page_nos[index + 1])
        for index in range(len(new_chapters) - 1)
    ]

    dict_italic = {}

    # Each page is visited once. Previously the pages were re-extracted for
    # every window, and the "already added" test compared a 0-based tuple with
    # the stored 1-based tuples, so shared pages were recorded twice.
    for page_no, page in enumerate(pdf.pages):
        # NOTE(review): page_no is a 0-based index while chapters_page_no
        # reports 1-based page numbers, so every window looks shifted by one
        # page. Left unchanged - confirm the intended range.
        if not any(first <= page_no <= last for first, last in page_windows):
            continue

        italic_text = page.filter(filter_Italictext).filter(filter_superscript).extract_text()
        italic_lines = strip_word_in_line(italic_text.split("\n"))
        # None or empty: nothing to keep (and np.mean([]) would only warn).
        if not italic_lines:
            continue

        italic_mean = np.mean([len(line) for line in italic_lines])
        # The old sort (keyed on the length of the whole list) was a no-op and
        # the order is irrelevant for a membership test, so a set is enough.
        processed_lines = {line for line in italic_lines if len(line) < italic_mean-30}
        if not processed_lines:
            continue

        textlines = page.extract_text().split("\n")
        for line_number, content in enumerate(textlines):
            if content not in processed_lines:
                continue
            # Remove table of contents elements
            re_content = test_remove_Toc(content)
            dict_italic.setdefault(re_content, []).append((page_no+1, line_number+1))

    new_cleann_italic = {k: v for k, v in dict_italic.items() if k}
    return new_cleann_italic

#extracting headings that are not bold and italic
def match_lines_heading(pdf, new_chapters, new_page_nos):
    '''
    Collect short text lines (candidate headings) regardless of font style.

    A page is scanned when its index lies inside at least one window
    ``[new_page_nos[i], new_page_nos[i + 1]]``. Within the box (0, 0, 550, 770)
    a line is kept when it is more than 30 characters shorter than the mean
    line length of the page and does not end with ``. ? ! ' "``.

    Args:
        pdf: opened pdfplumber PDF.
        new_chapters: matched titles; only its length is used (number of windows).
        new_page_nos: page number of each matched title.

    Returns:
        dict mapping the heading text (after ``test_remove_Toc``) to a list of
        1-based ``(page, line)`` tuples. Entries with an empty key are dropped.
    '''
    sentence_endings = ('.','?','!',"'",'"')
    page_windows = [
        (new_page_nos[index], new_page_nos[index + 1])
        for index in range(len(new_chapters) - 1)
    ]

    dict_lines_heading = {}

    # Each page is visited once; scanning per window re-extracted the page
    # shared by two adjacent windows and appended its coordinates twice.
    for page_no, page in enumerate(pdf.pages):
        # NOTE(review): page_no is a 0-based index while chapters_page_no
        # reports 1-based page numbers, so every window looks shifted by one
        # page. Left unchanged - confirm the intended range.
        if not any(first <= page_no <= last for first, last in page_windows):
            continue

        text = page.filter(filter_superscript).within_bbox((0, 0, 550, 770)).extract_text()
        text_lines = text.split("\n")
        line_mean = np.mean([len(line) for line in text_lines])
        short_lines = {
            line for line in text_lines
            if len(line) < line_mean - 30 and not line.endswith(sentence_endings)
        }

        for line_number, content in enumerate(text_lines):
            if content not in short_lines:
                continue
            re_content = test_remove_Toc(content)
            dict_lines_heading.setdefault(re_content, []).append((page_no+1, line_number+1))

    dict_lines_heading_final = {k: v for k, v in dict_lines_heading.items() if k}
    return dict_lines_heading_final
    

def collection_cords(new_cleaned_Data):
    '''
    Derive the distinct TOC coordinates and name them by rank.

    Args:
        new_cleaned_Data: mapping of TOC entry text -> coordinate value.

    Returns:
        ``(unique_cords, cord_map)``: the distinct coordinates sorted
        ascending, and a dict assigning the first six of them the level names
        "Part", "Chapters", "sections", "subsections", "Titles", "others".
    '''
    predefined_chapter_map = ["Part", "Chapters", "sections", "subsections", "Titles","others"]

    # overall groups/spaces
    unique_cords = sorted(set(new_cleaned_Data.values()))

    # zip() stops at the shorter input: coordinates past the sixth get no name.
    cord_map = dict(zip(unique_cords, predefined_chapter_map))

    return unique_cords,cord_map


def dict_list_contents(dict_list):
    '''
    Return the first dictionary in ``dict_list`` that has at least one truthy key.

    Intended for the ``[coo, cp, seq]`` triple returned by check_final_call,
    where at most one of the dictionaries carries data.

    Returns:
        the first qualifying dict (the same object, not a copy), or an empty
        dict when none qualifies. The old implementation raised IndexError in
        that case.
    '''
    for candidate in dict_list:
        # any() over a dict tests its keys.
        if any(candidate):
            return candidate

    print("No contens are extracted",dict_list)
    return {}


def strip_dict(result_set):
    '''Return a new dict holding only the entries of ``result_set`` whose value is truthy.'''
    kept = {}
    for name, content in result_set.items():
        if content:
            kept[name] = content
    return kept

def chapter_remove(data_dict):
    '''
    Remove empty-string titles from the "chapters" and "sections" lists.

    The dictionary is modified in place and also returned. Both lists are now
    cleaned; previously "sections" was skipped whenever "chapters" was present
    and non-empty. Other keys are left untouched.
    '''
    for key in ("chapters", "sections"):
        if data_dict.get(key):
            data_dict[key] = [title for title in data_dict[key] if title != '']
    return data_dict

def cleaning(result_set_data):
    '''
    Strip one leading '.' from every title.

    Args:
        result_set_data: dict mapping a group name to a list of title strings.

    Returns:
        a new dict with the same keys and the cleaned title lists.
    '''
    return {
        group: [re.sub(r'^\.', '', title) for title in titles]
        for group, titles in result_set_data.items()
    }

def compare_remove(dict_2,result_set):
    '''
    Keep only the headings that do not correspond to a TOC title.

    Args:
        dict_2: candidate headings, mapping heading text -> list of locations.
        result_set: TOC titles, mapping group name -> list of title strings.

    Returns:
        a new dict with the entries of ``dict_2`` whose text has a fuzzy ratio
        below 75 against *every* TOC title. With no TOC titles there is nothing
        to remove and all headings are kept.
    '''
    threshold = 75
    list_complete = [title for titles in result_set.values() for title in titles]

    # Fixed: a heading used to be kept as soon as it differed from *any* TOC
    # title, so with two or more distinct titles nothing was ever removed.
    return {
        heading: locations
        for heading, locations in dict_2.items()
        if all(fuzz.ratio(heading, title) < threshold for title in list_complete)
    }
    
## Extract line number of chapters and sections inside the book
def clean_textlines(textlines):
    '''
    Normalise page text lines so they can be compared with cleaned TOC titles.

    Each line goes through the regex substitutions below (in order), then
    ``remove_chapter_or_part_text``, and finally has its whitespace collapsed.

    Returns:
        a list with one cleaned string per input line, in the same order.
    '''
    substitutions = (
        # keep only letters, digits, spaces, colons and newlines
        (r"[^a-zA-Z0-9 :\n]", ""),
        # NOTE(review): this pattern requires a space after the end-of-string
        # anchor, so it does not appear able to match anything.
        (r"[:?]+$ ", ""),
        # drop colons that are followed by whitespace or end the line
        (r":(?=\s|$)", ""),
        # NOTE(review): "Part"/"Chapter" are removed even inside longer words.
        (r'(Chapter|chapter|Part|part|\b\d+(\.\d+)?\b)', ""),
        (r'^\d+', ''),
    )

    new_text_lines = []
    for raw_line in textlines:
        cleaned = raw_line
        for pattern, replacement in substitutions:
            cleaned = re.sub(pattern, replacement, cleaned)
        cleaned = remove_chapter_or_part_text([cleaned])[0]
        new_text_lines.append(' '.join(cleaned.strip().split()))
    return new_text_lines

def get_match(text_lines, refrence_list):
    '''
    Return the text lines that fuzzy-match (ratio >= 92) a reference title.

    A line is emitted once per reference title it matches, so it can appear
    more than once; the order of ``text_lines`` is preserved.
    '''
    return [
        candidate
        for candidate in text_lines
        for reference in refrence_list
        if fuzz.ratio(candidate, reference) >= 92
    ]
            

def extract_page_line_numbers(pdf, num, result_set, keys_to_extract):
    '''
    Find where the TOC titles of each requested group occur in the book.

    Every page whose 0-based index is greater than ``num`` is cropped to the
    box (0, 0, 550, 770); its lines are cleaned with ``clean_textlines`` and
    matched against ``result_set[key]`` with ``get_match``.

    Args:
        pdf: opened pdfplumber PDF.
        num: 0-based page index; only later pages are scanned.
        result_set: dict mapping group name -> list of TOC titles.
        keys_to_extract: group names to look up; missing groups match nothing.

    Returns:
        dict with one entry per requested key, each mapping the matched
        (cleaned) line text to a list of 1-based ``(page, line)`` tuples, one
        per occurrence.
    '''
    matched_data = {key: {} for key in keys_to_extract}

    for page_no, page in enumerate(pdf.pages):
        if page_no <= num:
            continue

        text = page.filter(filter_superscript).within_bbox((0, 0, 550, 770)).extract_text_simple()
        textlines = clean_textlines(text.split("\n"))

        for key in keys_to_extract:
            matched_lines = set(get_match(textlines, result_set.get(key, [])))
            # Lines that are empty after cleaning are never reported.
            matched_lines.discard('')
            if not matched_lines:
                continue

            for line_no, line_content in enumerate(textlines):
                if line_content in matched_lines:
                    # Fixed: every matched title used to be reset and given the
                    # coordinates of whichever matching line was seen last.
                    # Each line now records its own position.
                    matched_data[key].setdefault(line_content, []).append((page_no + 1, line_no + 1))

    return matched_data

def remove_duplicates(input_dict):
    '''
    De-duplicate the coordinate lists of a two-level dictionary.

    Args:
        input_dict: ``{category: {entry: [coordinate, ...]}}``.

    Returns:
        a new dictionary of the same shape in which every coordinate list
        contains each coordinate once. The lists go through a ``set``, so their
        order is not preserved.
    '''
    return {
        category: {
            entry: list(set(coordinates))
            for entry, coordinates in entries.items()
        }
        for category, entries in input_dict.items()
    }


#extract headings, subheadings, sections, subsections from all chapters in the book.
def extract_all_titles(pdf):
    '''
    Extract chapter/section titles and in-text headings with their locations.

    Steps:
      1. read the table of contents (``TOC_num``) and drop front/back-matter
         entries and entries recognised as person names;
      2. classify the remaining TOC entries (``check_final_call``) and clean
         numbering, roman numerals, empty strings and leading dots;
      3. locate the titles in the body of the book;
      4. collect bold and italic lines between the located titles as headings,
         excluding those that match a TOC title;
      5. if no bold or italic heading was found, fall back to short lines
         (``match_lines_heading``).

    Changes from the previous version: the fallback of step 5 was computed but
    then unconditionally overwritten, so it never reached the result; and a few
    intermediate values that were never used are no longer computed.

    Args:
        pdf: opened pdfplumber PDF.

    Returns:
        dict with the keys 'heading', 'chapters' and 'sections', each mapping a
        text to its de-duplicated list of 1-based ``(page, line)`` tuples.
    '''
    entries_to_remove = ['Contents', 'Foreword', 'Preface','List of figures','Acknowledgment','List of contributors','List of abbreviations','Conventions','List of tables','About the Authors','References','appendix','index','Conclusions',"Devil Is An Ass","Bibliography"]
    entity_labels = ["person"]
    keys_to_extract = ["chapters", "sections", "part"]

    # 1. table of contents
    page_content_dict, page_num = TOC_num(pdf)
    cleaned_data = remove_entries(page_content_dict, entries_to_remove)
    new_cleaned_data = filter_entries_by_named_entities(cleaned_data, entity_labels)

    # 2. classification and cleaning of the TOC entries
    unique_cords, _ = collection_cords(new_cleaned_data)
    coo, cp, seq = check_final_call(new_cleaned_data, unique_cords)
    final_segregation_ch_section_part = dict_list_contents([coo, cp, seq])
    cleaned_result = remove_numbers_Toc(final_segregation_ch_section_part)
    result_set = clean_roman_numbers_Toc(strip_dict(cleaned_result))
    result_set_data = chapter_remove(strip_dict(result_set))
    result_set_new_clean = strip_dict(cleaning(result_set_data))

    # 3. where the titles occur in the book
    chapters, page_nos, line_nos = chapters_page_no(pdf, page_num, result_set_new_clean)
    new_chapters, new_page_nos, new_line_nos, _ = append_chapters(chapters, page_nos, line_nos)

    # 4. bold and italic headings that are not TOC titles
    dict_bold = bold_extraction_heading(pdf, new_chapters, new_page_nos, new_line_nos)
    new_clean_bold = compare_remove(dict_bold, result_set_new_clean)
    dict_italic = italic_content(pdf, new_chapters, new_page_nos)
    new_clean_italic = compare_remove(dict_italic, result_set_new_clean)

    # Italic entries win when the same text was found in both styles.
    headings = new_clean_bold.copy()
    headings.update(new_clean_italic)

    # 5. fallback: headings recognised only by their short line length
    if not headings:
        dict_lines = match_lines_heading(pdf, new_chapters, new_page_nos)
        headings = compare_remove(dict_lines, result_set_new_clean)

    # extract_page_line_numbers always returns every requested key, so
    # 'chapters' and 'sections' are guaranteed to be present here.
    extract_page_line = extract_page_line_numbers(pdf, page_num, result_set_new_clean, keys_to_extract)
    combined_dict = {
        'heading': headings,
        'chapters': extract_page_line['chapters'],
        'sections': extract_page_line['sections']
    }

    return remove_duplicates(combined_dict)
    
#checking style consistency in overall book
def check_text_title_sent_case_inconsistency(pdf):
    '''
    Count title/sentence-case inconsistencies across chapters, sections and headings.

    Args:
        pdf: opened pdfplumber PDF.

    Returns:
        tuple ``(inconsistent_text_count, inconsistent_title_case_chapters,
        inconsistent_sec_details, inconsistent_heading_details)`` where the
        count is the sum over chapters, sections and headings, and the two
        ``*_details`` values are single-element lists holding the merged
        details dictionary of the sections and of the headings respectively.
    '''
    combined_dict = extract_all_titles(pdf)
    chapters, sections, headings, _ = process_text_data(combined_dict)
    (count_non_title_case_chapters,
     inconsistent_title_case_chapters,
     inconsistent_sections,
     inconsistent_headings) = get_inconsisent_details(chapters, sections, headings)

    inconsistent_sec_counts, inconsistent_sec_details = _summarise_inconsistencies(inconsistent_sections)
    # Fixed: heading details used to be merged into the *section* details,
    # leaving inconsistent_heading_details permanently empty.
    inconsistent_heading_counts, inconsistent_heading_details = _summarise_inconsistencies(inconsistent_headings)

    inconsistent_text_count = count_non_title_case_chapters + inconsistent_sec_counts + inconsistent_heading_counts

    return inconsistent_text_count, inconsistent_title_case_chapters, inconsistent_sec_details, inconsistent_heading_details


def _summarise_inconsistencies(records):
    '''
    Total the counts and merge the details of a list of inconsistency records.

    Each record is a dict; values under keys starting with 'count' are summed
    and values under keys ending with 'details' (dicts) are merged, later
    records overriding earlier ones on equal keys.

    Returns:
        ``(total_count, [merged_details])``; ``(0, [{}])`` for an empty or
        missing list.
    '''
    total = 0
    merged_details = {}
    for record in records or []:
        for key, value in record.items():
            if key.startswith('count'):
                total += value
            if key.endswith('details'):
                merged_details.update(value)
    return total, [merged_details]

    

def get_inconsistent_text_bboxes(file_path):
    '''
    Run the case-consistency check on a PDF file and build the JSON outputs.

    Args:
        file_path: path of the PDF to analyse.

    Returns:
        ``(text_json, final_result_json)``: the value produced by
        ``final_consistent_text_json`` and the result of passing it to
        ``return_json_result``.
    '''
    # The context manager closes the PDF (and its file handle) even when one of
    # the processing steps raises; it was previously left open.
    with pdfplumber.open(file_path) as pdf:
        (inconsistent_text_count,
         inconsistent_title_case_chapters,
         inconsistent_sec_details,
         inconsistent_heading_details) = check_text_title_sent_case_inconsistency(pdf)

        # One combined list: heading details, section details, chapter details.
        inconsistent_heading_details += [inconsistent_sec_details[0], inconsistent_title_case_chapters]
        text_json, _ = final_consistent_text_json(pdf, inconsistent_heading_details)

    final_result_json = return_json_result(text_json)
    return text_json,final_result_json



# Alternative sample input, kept for reference:
# file_path = '/data/copy_assessment_tool/modules/data/15032-5196-FullBook.pdf'
# Hard-coded sample PDF analysed by the driver call below.
file_path ='/data/copy_assessment_tool/modules/data/15031-4988-FullBook.pdf'

# Driver: executed whenever this cell/module runs; the results stay in module-level variables.
text_json,final_result_json = get_inconsistent_text_bboxes(file_path)
