In [None]:
from operator import itemgetter
import fitz
import json
import re 
from gensim.models import Word2Vec
import gensim.downloader
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from sklearn.neighbors import NearestNeighbors
import os
from bs4 import BeautifulSoup
import numpy as np
from collections import defaultdict

In [None]:
# Natural-language queries, one per SRS section we want to locate in a document.
PURPOSE_GOAL = "What is the goal or aim or purpose of the project"
DESCRIPTION_OVERVIEW = "What is the description or overview of the system"
ASSUMPTIONS_DEPENDENCIES = "What are the assumptions and dependencies"
SCOPE = "What is the scope"
REQUIREMENTS_FUNCTIONAL_NON = "What are the requirements functional non functional"
SYSTEM_ARCHITECTURE = "What is the system architecture or what is the architecture of the software"
USERS_AUDIENCE = "Who are the users or audience"

# Order matters: results are produced (and keyed) in this order downstream.
QUESTIONS_LIST = [
    PURPOSE_GOAL,
    DESCRIPTION_OVERVIEW,
    ASSUMPTIONS_DEPENDENCIES,
    SCOPE,
    REQUIREMENTS_FUNCTIONAL_NON,
    SYSTEM_ARCHITECTURE,
    USERS_AUDIENCE,
]

In [None]:
def fonts(doc, granularity=False):
    """Collect every font style used in a PDF and count how often each occurs.

    :param doc: PDF document to iterate through (iterable of pages)
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: when True, spans are told apart by size, flags, font
        and color; when False, by size only
    :type granularity: bool
    :rtype: [(identifier, count), (identifier, count)], dict
    :return: style identifiers sorted by usage (most used first), and a dict
        mapping each identifier to its style attributes
    :raises ValueError: if the document contains no text spans at all
    """
    usage = {}
    styles = {}

    for page in doc:
        for block in page.get_text("dict")["blocks"]:
            if block['type'] != 0:  # only type 0 blocks carry text
                continue
            for line in block["lines"]:
                for span in line["spans"]:
                    if granularity:
                        key = "{0}_{1}_{2}_{3}".format(span['size'], span['flags'], span['font'], span['color'])
                        styles[key] = {'size': span['size'], 'flags': span['flags'], 'font': span['font'],
                                       'color': span['color']}
                    else:
                        key = "{0}".format(span['size'])
                        # a later span of the same size overwrites the recorded font
                        styles[key] = {'size': span['size'], 'font': span['font']}

                    usage[key] = usage.get(key, 0) + 1

    if not usage:
        raise ValueError("Zero discriminating fonts found!")

    ranked = sorted(usage.items(), key=itemgetter(1), reverse=True)
    return ranked, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.

    The most used style is treated as the paragraph style (``<p>``). Sizes
    larger than it become ``<h1>``, ``<h2>``, ... (largest first) and smaller
    sizes become ``<s1>``, ``<s2>``, ... (largest first).

    Sizes are read from ``styles`` rather than parsed out of the identifier,
    so this also works with the composite identifiers produced by
    ``fonts(doc, granularity=True)``; several styles sharing one size are
    collapsed into a single tag.

    :param font_counts: (identifier, count) for all fonts occurring in document,
        most used first
    :type font_counts: list
    :param styles: all styles found in the document, keyed by identifier
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes (keys are float sizes)
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # distinct sizes, high to low, so that we can append the right integer to each tag
    font_sizes = sorted(
        {float(styles[identifier]['size']) for identifier, _count in font_counts},
        reverse=True,
    )

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0  # restart numbering for the sizes below the paragraph size
            size_tag[size] = '<p>'
        elif size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        else:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.

    Every text block is walked span by span. Consecutive spans of the same
    size are concatenated; a change of size flushes the text gathered so far
    and starts a new string prefixed with the tag for the new size. A ``|`` is
    appended after every line so callers can split the result on pipes.

    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-pended element tags
    :raises KeyError: if a span's size has no entry in ``size_tag``
    """
    header_para = []  # list with headers and paragraphs
    first = True  # True until the first non-blank span of the document is seen
    previous_s = {}  # previous non-blank span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] != 0:  # not a text block
                continue

            # REMEMBER: multiple fonts and sizes are possible IN one block

            block_string = ""  # text found in block
            for l in b["lines"]:  # iterate through the text lines
                for s in l["spans"]:  # iterate through the text spans
                    if not s['text'].strip():  # skip whitespace-only spans
                        continue

                    if first:
                        first = False
                        block_string = size_tag[s['size']] + s['text']
                    elif s['size'] != previous_s['size']:
                        # size changed: flush what we have and start a new tagged string
                        header_para.append(block_string)
                        block_string = size_tag[s['size']] + s['text']
                    elif all(c == "|" for c in block_string):
                        # block_string is empty or holds only line pipes: this is the
                        # first real text of the block, so start it with its tag.
                        # (These cases must be exclusive with the concatenation
                        # below, otherwise the span text is emitted twice.)
                        block_string = size_tag[s['size']] + s['text']
                    else:  # in the same block, so concatenate strings
                        block_string += " " + s['text']

                    previous_s = s

                # end of line, indicating with a pipe
                block_string += "|"

            header_para.append(block_string)

    return header_para

In [None]:
def n_gram_creator(list_sentences,n_gram):
    """Join every run of ``n_gram`` consecutive items into one string.

    A window of size ``n_gram`` slides over ``list_sentences`` one item at a
    time, and the items of each window are joined with single spaces. The
    final window (ending on the last item) is included, so a list of exactly
    ``n_gram`` items yields one string.

    :param list_sentences: text fragments in document order
    :type list_sentences: list
    :param n_gram: number of consecutive fragments per window (>= 1)
    :type n_gram: int
    :rtype: list
    :return: one joined string per window; empty if the list is shorter than ``n_gram``
    :raises ValueError: if ``n_gram`` is smaller than 1
    """
    if n_gram < 1:
        raise ValueError("n_gram must be a positive integer")

    window_count = len(list_sentences) - n_gram + 1  # <= 0 when the list is too short
    return [
        ' '.join(list_sentences[start:start + n_gram])
        for start in range(window_count)
    ]

In [None]:
def transform_tags(sentence):
    """Strip header/small-text tags and normalise spacing in a tagged fragment.

    Removes ``<hN>`` and ``<sN>`` tags (any number of digits, so ``<h10>`` and
    above are handled too), collapses runs of spaces into one and trims the
    ends. ``<p>`` tags are left in place.

    :param sentence: text fragment, possibly containing size tags
    :type sentence: str
    :rtype: str
    :return: the cleaned fragment (may be empty)
    """
    sentence = re.sub(r'<h[0-9]+>|<s[0-9]*>', "", sentence)
    sentence = re.sub(r' +', ' ', sentence)
    return sentence.strip()

In [None]:
def make_the_paras(filepath,n_grams):
    """Extract the text of a PDF and return it as paragraph n-grams.

    The document is tagged by font size (``fonts`` -> ``font_tags`` ->
    ``headers_para``), header/small tags are stripped, the text is split on
    the remaining ``<p>`` tags and the resulting pieces are combined into
    n-grams of every size listed in ``n_grams``.

    :param filepath: path of the PDF file to read
    :type filepath: str
    :param n_grams: window sizes to build, e.g. ``[3]``
    :type n_grams: list
    :rtype: list
    :return: flat list of joined paragraph n-grams
    """
    # Context manager guarantees the PDF handle is released even if tagging fails.
    with fitz.open(filepath) as doc:
        font_counts, styles = fonts(doc, granularity=False)
        size_tag = font_tags(font_counts, styles)
        elements = headers_para(doc, size_tag)

    # headers_para marks every line end with a pipe; split on it, clean each
    # fragment and drop the ones that end up empty
    tagged_list = " ".join(elements).split("|")
    paras = [cleaned for cleaned in map(transform_tags, tagged_list) if cleaned]

    splitted_paras = ' '.join(paras).split('<p>')
    return n_combined_grams(splitted_paras, n_grams)

In [None]:
def n_combined_grams(splitted_paras,list_grams):
    """Build n-grams of several sizes and return them as one flat list.

    :param splitted_paras: text fragments in document order
    :type splitted_paras: list
    :param list_grams: window sizes to build, processed in the given order
    :type list_grams: list
    :rtype: list
    :return: all n-grams for the first size, then all for the second, and so on
    """
    return [
        gram
        for window_size in list_grams
        for gram in n_gram_creator(splitted_paras, window_size)
    ]


In [None]:
from rank_bm25 import *

In [None]:
# Build the corpus here so this cell works on a fresh kernel (it used to rely on
# a `corpus` left over from a later cell). Uses the sample PDF inspected elsewhere
# in this notebook.
corpus = make_the_paras(r"../../Data/SRS Dataset Clean/SRS20_removed.pdf", [3])

# BM25Okapi expects each document as a list of tokens; passing raw strings makes
# it index single characters, so word queries would never match.
tokenized_corpus = [para.lower().split() for para in corpus]
bm25 = BM25Okapi(tokenized_corpus)

# Query tokens are matched against the tokenised corpus; the original paragraphs
# are returned for display.
top_list=bm25.get_top_n("purpose of the document".split(" "),corpus, n=5)

In [None]:
# Show the BM25 hits, best first (this cell used to appear twice in a row).
for rank, val in enumerate(top_list, start=1):
    print(rank, ':', val, "\n\n\n")

## Word2Vec

In [None]:
def transform_text2(vectorizer, text, verbose=False):
    '''
    Transform the text into a single vector by averaging its word vectors.

    :param vectorizer: word -> vector lookup supporting ``in`` and ``[]``
        (here the ``.wv`` of a trained Word2Vec model)
    :param text: raw text to embed
    :type text: str
    :param verbose: also print the text and the tokens that were found in the
        vocabulary; the vector is returned either way
    :type verbose: bool
    :return: mean of the vectors of all known tokens, or a float32 zero vector
        when no token is known. The zero vector has ``vectorizer.vector_size``
        entries when that attribute exists, otherwise 300.
    '''
    tokens = preprocess_string(text)
    known_tokens = [w for w in tokens if w in vectorizer]

    if verbose:
        print('Text:', text)
        print('Vector:', known_tokens)

    if known_tokens:
        return np.mean([vectorizer[w] for w in known_tokens], axis=0)

    # no token in the vocabulary: fall back to an all-zero embedding
    return np.zeros(getattr(vectorizer, 'vector_size', 300), dtype=np.float32)

In [None]:
def word2vec_para_summarizer(corpus,questions_list):
    """Rank the paragraphs of one document against each question.

    A Word2Vec model is trained on the document's own paragraphs, every
    paragraph and every question is embedded as the mean of its word vectors
    (``transform_text2``), and for each question the nearest paragraphs by
    cosine distance are returned.

    :param corpus: paragraphs (strings) of a single document
    :type corpus: list
    :param questions_list: questions to answer
    :type questions_list: list
    :rtype: dict
    :return: ``{question: {rank: paragraph}}`` with ranks starting at 1; at
        most 5 paragraphs per question, fewer if the corpus is smaller
    :raises ValueError: if ``corpus`` is empty
    """
    corpus_list = list(corpus)
    if not corpus_list:
        raise ValueError("corpus is empty; nothing to rank")

    corpus_token = [preprocess_string(t) for t in corpus_list]
    vectorizer = Word2Vec(sentences=corpus_token, vector_size=300, window=5, min_count=1, workers=4).wv

    # kneighbors() rejects n_neighbors larger than the number of fitted samples,
    # so clamp it for short documents
    retriever = NearestNeighbors(n_neighbors=min(5, len(corpus_list)), metric='cosine')

    # vectorize the documents, fit the retriever
    paragraph_vectors = [transform_text2(vectorizer, para) for para in corpus_list]
    retriever.fit(paragraph_vectors)

    question_vectors = [transform_text2(vectorizer, question) for question in questions_list]
    neighbour_ids = retriever.kneighbors(question_vectors, return_distance=False)

    json_output = {}
    for question, ids in zip(questions_list, neighbour_ids):
        json_output[question] = {
            rank: corpus_list[para_id] for rank, para_id in enumerate(ids, start=1)
        }
    return json_output

## Word2Vec results on all documents

In [None]:
# Run the Word2Vec ranking on every PDF of the dataset folder.
# Files that fail are reported and recorded with an empty result.
folder_path=r"../../Data/SRS Dataset Clean/"
files=sorted(os.listdir(folder_path))  # sorted for a reproducible processing order
output_dict=dict()
for file in files:
    if not file.lower().endswith('.pdf'):
        continue
    filepath=os.path.join(folder_path, file)

    filepath_result={}
    try:
        corpus=make_the_paras(filepath,[3])
        filepath_result=word2vec_para_summarizer(corpus,QUESTIONS_LIST)
    except Exception as exc:
        # keep going, but say why the file failed (and let Ctrl-C through)
        print(filepath, '->', type(exc).__name__, exc)
    output_dict[filepath]=filepath_result
   
    
    

In [None]:
# Inspect the raw results: {filepath: {question: {rank: paragraph}}}
output_dict

In [None]:
# Same results as a table: one column per file, one row per question
pd.DataFrame.from_dict(output_dict)

In [None]:
# Persist the per-file results. `json` is already imported at the top of the
# notebook; the context manager closes the file even if serialisation fails.
with open("word2vec_output.json", "w") as out_file:
    json.dump(output_dict, out_file, indent = 6)

In [None]:
# Spot-check: run the whole pipeline on one sample PDF, using 3-fragment n-grams
word2vec_para_summarizer(make_the_paras(r"../../Data/SRS Dataset Clean/SRS20_removed.pdf",[3]),QUESTIONS_LIST)

In [None]:
# Spot-check: the n-gram corpus extracted from the same sample PDF
make_the_paras(r"../../Data/SRS Dataset Clean/SRS20_removed.pdf",[3])

In [None]:
# Collapse the ranked paragraphs of every question into a single text per
# (file, question). Paragraphs are joined with a space so that the last word of
# one paragraph and the first word of the next are not fused together.
combined_ranks_output=dict()
for filepath in output_dict:
    combined_ranks_output[filepath]={
        question: ' '.join(ranked_paras.values())
        for question, ranked_paras in output_dict[filepath].items()
    }

In [None]:
CSV_COLUMNS=['purpose','description','scope','requirements','assumptions and dependencies','architecture','users']

# The frame's columns are the question strings, in QUESTIONS_LIST order, which
# differs from the CSV_COLUMNS order. Rename by question (not by position) so
# each column gets the label that matches its content, then put the columns in
# the CSV order.
QUESTION_TO_COLUMN={
    PURPOSE_GOAL:'purpose',
    DESCRIPTION_OVERVIEW:'description',
    SCOPE:'scope',
    REQUIREMENTS_FUNCTIONAL_NON:'requirements',
    ASSUMPTIONS_DEPENDENCIES:'assumptions and dependencies',
    SYSTEM_ARCHITECTURE:'architecture',
    USERS_AUDIENCE:'users',
}
output_dataframe=(
    pd.DataFrame.from_dict(combined_ranks_output,orient='index')
    .rename(columns=QUESTION_TO_COLUMN)
    .reindex(columns=CSV_COLUMNS)  # also yields empty columns if every file failed
)
display(output_dataframe)
output_dataframe.to_csv('labelled_data.csv',encoding='utf-8')

In [None]:
# Inspect the combined text per file and question: {filepath: {question: text}}
combined_ranks_output

In [None]:
# Read the exported CSV back to check what was written to disk
pd.read_csv('labelled_data.csv')