In [49]:
from operator import itemgetter
import fitz
import json
import re 
from gensim.models import Word2Vec
import gensim.downloader
import pandas as pd
from gensim.parsing.preprocessing import preprocess_string
from sklearn.neighbors import NearestNeighbors
import os
from bs4 import BeautifulSoup
import numpy as np
from collections import defaultdict

In [39]:
PURPOSE_GOAL='What is the goal or aim or purpose of the project'
DESCRIPTION_OVERVIEW='What is the description or overview of the system'
ASSUMPTIONS_DEPENDENCIES='What are the assumptions and dependencies'
SCOPE='What is the scope'
REQUIREMENTS_FUNCTIONAL_NON= 'What are the requirements functional non functional'
SYSTEM_ARCHITECTURE='What is the system architecture or what is the architecture of the software'
USERS_AUDIENCE='Who are the users or audience'
QUESTIONS_LIST=[PURPOSE_GOAL,DESCRIPTION_OVERVIEW,ASSUMPTIONS_DEPENDENCIES, SCOPE, REQUIREMENTS_FUNCTIONAL_NON, SYSTEM_ARCHITECTURE, USERS_AUDIENCE]

In [2]:
def fonts(doc, granularity=False):
    """Extracts fonts and their usage in PDF documents.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param granularity: also use 'font', 'flags' and 'color' to discriminate text
    :type granularity: bool
    :rtype: [(font_size, count), (font_size, count}], dict
    :return: most used fonts sorted by count, font style information
    """
    styles = {}
    font_counts = {}

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # block contains text
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if granularity:
                            identifier = "{0}_{1}_{2}_{3}".format(s['size'], s['flags'], s['font'], s['color'])
                            styles[identifier] = {'size': s['size'], 'flags': s['flags'], 'font': s['font'],
                                                  'color': s['color']}
                        else:
                            identifier = "{0}".format(s['size'])
                            styles[identifier] = {'size': s['size'], 'font': s['font']}

                        font_counts[identifier] = font_counts.get(identifier, 0) + 1  # count the fonts usage

    font_counts = sorted(font_counts.items(), key=itemgetter(1), reverse=True)

    if len(font_counts) < 1:
        raise ValueError("Zero discriminating fonts found!")

    return font_counts, styles


def font_tags(font_counts, styles):
    """Returns dictionary with font sizes as keys and tags as value.
    :param font_counts: (font_size, count) for all fonts occuring in document
    :type font_counts: list
    :param styles: all styles found in the document
    :type styles: dict
    :rtype: dict
    :return: all element tags based on font-sizes
    """
    p_style = styles[font_counts[0][0]]  # get style for most used font by count (paragraph)
    p_size = p_style['size']  # get the paragraph's size

    # sorting the font sizes high to low, so that we can append the right integer to each tag
    font_sizes = []
    for (font_size, count) in font_counts:
        font_sizes.append(float(font_size))
    font_sizes.sort(reverse=True)

    # aggregating the tags for each font size
    idx = 0
    size_tag = {}
    for size in font_sizes:
        idx += 1
        if size == p_size:
            idx = 0
            size_tag[size] = '<p>'
        if size > p_size:
            size_tag[size] = '<h{0}>'.format(idx)
        elif size < p_size:
            size_tag[size] = '<s{0}>'.format(idx)

    return size_tag


def headers_para(doc, size_tag):
    """Scrapes headers & paragraphs from PDF and return texts with element tags.
    :param doc: PDF document to iterate through
    :type doc: <class 'fitz.fitz.Document'>
    :param size_tag: textual element tags for each size
    :type size_tag: dict
    :rtype: list
    :return: texts with pre-prended element tags
    """
    header_para = []  # list with headers and paragraphs
    first = True  # boolean operator for first header
    previous_s = {}  # previous span

    for page in doc:
        blocks = page.get_text("dict")["blocks"]
        for b in blocks:  # iterate through the text blocks
            if b['type'] == 0:  # this block contains text

                # REMEMBER: multiple fonts and sizes are possible IN one block

                block_string = ""  # text found in block
                for l in b["lines"]:  # iterate through the text lines
                    for s in l["spans"]:  # iterate through the text spans
                        if s['text'].strip():  # removing whitespaces:
                            if first:
                                previous_s = s
                                first = False
                                block_string = size_tag[s['size']] + s['text']
                            else:
                                if s['size'] == previous_s['size']:

                                    if block_string and all((c == "|") for c in block_string):
                                        # block_string only contains pipes
                                        block_string = size_tag[s['size']] + s['text']
                                    if block_string == "":
                                        # new block has started, so append size tag
                                        block_string = size_tag[s['size']] + s['text']
                                    else:  # in the same block, so concatenate strings
                                        block_string += " " + s['text']

                                else:
                                    header_para.append(block_string)
                                    block_string = size_tag[s['size']] + s['text']

                                previous_s = s

                    # new block started, indicating with a pipe
                    block_string += "|"

                header_para.append(block_string)

    return header_para

In [3]:
def n_gram_creator(list_sentences,n_gram):
    paras=[]
    start,end=0,n_gram
    while( end<len(list_sentences)):
        combined=list_sentences[start:end]
        start+=1
        end+=1
        paras.append(' '.join(combined))
    return paras

In [4]:
def transform_tags(sentence):
   
    sentence=re.sub('<h[0-9]>|<s[0-9]*>',"",sentence)
    sentence=re.sub(' +', ' ',sentence)
    sentence=sentence.strip()
    return sentence

In [5]:
def make_the_paras(filepath,n_grams):
    document = filepath
    doc = fitz.open(document)

    font_counts, styles = fonts(doc, granularity=False)

    size_tag = font_tags(font_counts, styles)

    elements = headers_para(doc, size_tag)

    # with open("doc.json", 'w') as json_out:
    #     json.dump(elements, json_out)
    tagged_list=(" ".join(elements).split("|"))
    Soup = BeautifulSoup(" ".join(elements), 'lxml')
    # heading_tags = ["h1", "h2", "h3", "h4", "h5", "h6"]
    heading_tags = ["h3"]
    #for tags in Soup.find_all(heading_tags):
        #print(tags.name + ' -> ' + tags.text.strip())
    paras=[]
    for tag_sentence in tagged_list:
        transformed_temp=transform_tags(tag_sentence)
        if len(transformed_temp)==0:
            continue
        paras.append(transformed_temp)
    splitted_paras=(' '.join(paras)).split('<p>')
    corpus=n_combined_grams(splitted_paras,n_grams)
    return corpus

In [6]:
def n_combined_grams(splitted_paras,list_grams):
    all_grams=[]
    for n_gram in list_grams:
        all_grams.append(n_gram_creator(splitted_paras,n_gram))
    flat_list = [item for sublist in all_grams for item in sublist]
    return flat_list


In [19]:
from rank_bm25 import *

In [27]:
bm25 = BM25Okapi(corpus)
top_list=bm25.get_top_n("purpose of the document".split(" "),corpus, n=5)

In [28]:
for rank,val in enumerate(top_list):
    print(rank+1,':',val,"\n\n\n")

1 : 20, 21, 22, 23, 26, 27  Security, 27, 28 Status, 11, 12, 13, 14, 17, 21, 22, 23, 27 update, 9, 11, 20, 21 Update, 8, 9, 10, 11, 12, 13, 14, 15, 17,  19, 20, 21, 22  



2 : Grid, 9, 11, 12, 19, 20, 21 Historical Society, 1, 5, 9, 11, 16, 17, 19,  



3 : Online Journal, 4, 5, 6, 7, 15, 16, 17, 18,  



4 : 24, 27, 28  



5 : Reader, 4, 5, 6, 16, 17, 18 Review, 1, 7, 11, 12, 18, 21, 23, 26, 27 Reviewer, 1, 4, 5, 6, 7, 9, 11, 16, 17, 19,  





In [29]:
for rank,val in enumerate(top_list):
    print(rank+1,':',val,"\n\n\n")

1 : 20, 21, 22, 23, 26, 27  Security, 27, 28 Status, 11, 12, 13, 14, 17, 21, 22, 23, 27 update, 9, 11, 20, 21 Update, 8, 9, 10, 11, 12, 13, 14, 15, 17,  19, 20, 21, 22  



2 : Grid, 9, 11, 12, 19, 20, 21 Historical Society, 1, 5, 9, 11, 16, 17, 19,  



3 : Online Journal, 4, 5, 6, 7, 15, 16, 17, 18,  



4 : 24, 27, 28  



5 : Reader, 4, 5, 6, 16, 17, 18 Review, 1, 7, 11, 12, 18, 21, 23, 26, 27 Reviewer, 1, 4, 5, 6, 7, 9, 11, 16, 17, 19,  





## Word2Vec

In [7]:
def transform_text2(vectorizer, text, verbose=False):
    '''
    Transform the text in a vector[Word2Vec]
    vectorizer: sklearn.vectorizer
    text: str
    '''
    tokens = preprocess_string(text)
    words = [vectorizer[w] for w in tokens if w in vectorizer]
    if verbose:
        print('Text:', text)
        print('Vector:', [w for w in tokens if w in vectorizer])
    elif len(words):
        return np.mean(words, axis=0)
    else:
        return np.zeros((300), dtype=np.float32)

In [42]:
def word2vec_para_summarizer(corpus,questions_list):
    corpus_df =  pd.DataFrame({'paras':corpus})
    corpus_df['id_'] = range(0, len(corpus_df))
    
    corpus_list = corpus_df['paras'].tolist()
    corpus_token = [preprocess_string(t) for t in corpus_list]
    vectorizer = Word2Vec(sentences=corpus_token, vector_size=300, window=5, min_count=1, workers=4).wv
    retriever_configs = {
    'n_neighbors': 5,
    'metric': 'cosine'
    }
    retriever = NearestNeighbors(**retriever_configs)

    # vectorizer the documents, fit the retriever
    X = corpus_df['paras'].apply(lambda x: transform_text2(vectorizer, x)).tolist()
    retriever.fit(X, corpus_df['id_'])
    
    #print(questions_list)
    #questions_list=['What is the goal or aim or purpose of the project', 'What is the description or overview of the system','What are the assumptions and dependencies', 'What is the scope', 'What are the requirements functional non functional', 'What is the system architecture or what is the architecture of the software', 'Who are the users or audience']
    X=[transform_text2(vectorizer,question) for question in questions_list]
    #X = questions_list.apply(lambda x: transform_text2(vectorizer, x))
    # y_test = data['c_id']
    y_pred = retriever.kneighbors(X, return_distance=False)
    json_output={}
    for question,index in enumerate(y_pred):
        
        #print(questions_list[question],"?:")
        inner_json={}
        for rank,i_ in enumerate(index):
            inner_json[rank+1]=corpus_df.iloc[i_,0]
            #print(rank+1,":",corpus_df.iloc[i_,0],"\n")
        json_output[questions_list[question]]=inner_json
        #print("\n")
    return json_output

## Word2Vec results on all documents

In [43]:
folder_path=r"../../Data/SRS Dataset Clean/"
files=os.listdir(folder_path)
output_dict=dict()
for file in files:
    
    if '.pdf' not in file:
        continue
    filepath=folder_path+file
    #print(filepath)
    filepath_result={}
    try:
        corpus=make_the_paras(filepath,[3])
        
        filepath_result=word2vec_para_summarizer(corpus,QUESTIONS_LIST)
    except:
        print(filepath)
    finally:
        output_dict[filepath]=filepath_result
   
    
    

../../Data/SRS Dataset Clean/SRS21_removed.pdf
../../Data/SRS Dataset Clean/SRS23.pdf
../../Data/SRS Dataset Clean/SRS24_removed.pdf


In [44]:
output_dict

{'../../Data/SRS Dataset Clean/SRS1.pdf': {'What is the goal or aim or purpose of the project': {1: 'This section will cover general information about the project perspective,  functions, and various requirements and constraints. More detailed descriptions of the project requirements and functions will be covered in latter sections. 2.1 Product Perspective  This project is designed to be used by employees of several automotive plants to ',
   2: 'Prototype V1 will have little functionality, and its main purpose will be to display  the user interface as well as to show our interpretation of the requirements. 5.2 Sample Scenarios  An employee, Bob, was given the unique four-digit-code 7924 by their ',
   3: 'The system is a desktop application that requires a computer running Windows  Vista SP2 or higher version of Windows operating system. The computer must have Visual C++ Redistributable 2010, as well as .NET Framework 4.0 installed.  Prototype V1 will have little functionality, and it

In [45]:
pd.DataFrame.from_dict(output_dict)

Unnamed: 0,../../Data/SRS Dataset Clean/SRS1.pdf,../../Data/SRS Dataset Clean/SRS10_removed.pdf,../../Data/SRS Dataset Clean/SRS11.pdf,../../Data/SRS Dataset Clean/SRS12.pdf,../../Data/SRS Dataset Clean/SRS13_removed (1).pdf,../../Data/SRS Dataset Clean/SRS13_removed.pdf,../../Data/SRS Dataset Clean/SRS14_removed.pdf,../../Data/SRS Dataset Clean/SRS15_removed.pdf,../../Data/SRS Dataset Clean/SRS16_removed.pdf,../../Data/SRS Dataset Clean/SRS17_removed.pdf,...,../../Data/SRS Dataset Clean/SRS48.pdf,../../Data/SRS Dataset Clean/SRS49.pdf,../../Data/SRS Dataset Clean/SRS4_removed (1).pdf,../../Data/SRS Dataset Clean/SRS50.pdf,../../Data/SRS Dataset Clean/SRS53_removed.pdf,../../Data/SRS Dataset Clean/SRS5_removed.pdf,../../Data/SRS Dataset Clean/SRS6_removed.pdf,../../Data/SRS Dataset Clean/SRS7_removed.pdf,../../Data/SRS Dataset Clean/SRS8_removed.pdf,../../Data/SRS Dataset Clean/SRS9_removed.pdf
What is the goal or aim or purpose of the project,{1: 'This section will cover general informati...,{1: 'Software Requirements Specification for D...,{1: 'This product aimed toward a person who do...,{1: 'Use-Cases: None Class Diagram The purpos...,{1: 'Basic Flow: User enables the desired noti...,{1: 'Basic Flow: User enables the desired noti...,{1: ' Close Project: Closes the currently ope...,"{1: 'familiar, in-house environment. Most peop...",{1: ' Bowtie Code 3 1. Introduction 1.1 Purpos...,{1: ' Acumen Developers 3 1. Introduction 1.1 ...,...,{1: 'Element Name Description Radar Determine...,{1: 'satisfaction from 0 to 1) of the goal. Fi...,{1: '1.1 Purpose The purpose of this SRS docu...,"{1: '13. “MISRA Safety Analysis.” MISRA , Moto...",{1: 'This section gives a scope description an...,{1: '1.1. Purpose The purpose of this documen...,{1: 'Lab Lab is a place where tests are usuall...,{1: '1 Introduction 1.1 Purpose The purpose o...,{1: 'C. Project: TradeSim 1 Introduction 1.1...,{1: 'This table is populated with a single rec...
What is the description or overview of the system,"{1: 'a. This describes the attributes, operati...",{1: '4.8.3 Functional Requirements REQ-1: Sys...,{1: '3.1 Functional Requirements: This section...,{1: 'Type: Primary Description: Lets the Mana...,"{1: '5. Detlor, B. (2000). ""The corporate port...","{1: '5. Detlor, B. (2000). ""The corporate port...",{1: 'version and other info. Main Pages:  O...,{1: 'SSL Syntax/Semantic Language. UI User in...,{1: '19. The user shall be able to set the des...,{1: '1. Description Before being allowed acc...,...,{1: 'UML Extensions Element Name Description ...,{1: 'Table 4: Use case description for Initiat...,"{1: '[4] World Wide Web Consortium (2011, Last...","{1: 'Cross-refs: 1.8, 1.11 (1.11.1, 1.11.2) U...",{1: ' 1 1. Introduction This section gives a ...,{1: '1.5. Overview of Document The next chapt...,{1: 'Sr No System Environment for development...,{1: 'The main intention of ours is to provide ...,{1: '1.4. Overview: The document follows the ...,{1: 'Time (of departure) Time (of departure) T...
What are the assumptions and dependencies,{1: 'The system will have two separate sets of...,{1: 'System should be compatible and will smoo...,{1: '2 Principle Actors are Customer and Admin...,"{1: 'Variables - TimeDate:Time, Username:Strin...",{1: 'The component will be adapted to the over...,{1: 'The component will be adapted to the over...,"{1: 'above, non-English tutorials etc. Additi...",{1: 'Displayed lines depend on Faculty and Dep...,{1: 'd. Risks - Low risk - simple implementati...,{1: 'Acumen Developers 6 5. Dependencies with...,...,{1: 'Software Requirements Specification (SRS)...,{1: 'model the operation of the system in deta...,{1: '2.7 Assumptions and Dependencies No spec...,{1: 'Section 6 provides credit to the resource...,"{1: '1 * QR8 QR22, QR23, FR14, FR15, FR16, FR...",{1: 'developed here assumes the use of a tool ...,{1: '1. Installation Guide 2. User Manual for...,{1: 'There is only one user at a time in this ...,{1: 'their money in the future. 2.3 Assumptio...,{1: 'The system will provide a restricted “Bac...
What is the scope,"{1: 'Authors: Lisa Doan, Alex Besinger, Patric...",{1: 'The system gives ability to the admin to ...,{1: '3.1.6 Report Generation After ordering f...,{1: '(or any item) in a single transaction. •...,{1: 'privacy. 2. The user's IP will be logged...,{1: 'privacy. 2. The user's IP will be logged...,{1: 'Gephi for more demanding graph analysis. ...,"{1: 'role User’s designation like Student, Dea...",{1: 'Koofers is an online database of class in...,{1: 'none 15. The CMS shall provide a functio...,...,{1: 'Element Name Description Radar Determine...,{1: 'Several components of the BCAS system wil...,{1: '1.2 Disclaimer The work presented in thi...,"{1: 'To ensure maximum safety, we define globa...",{1: 'Term Definition User Someone who interact...,{1: 'stakeholders and the developers of the sy...,{1: 'The document refers to the following assi...,{1: 'UI User Interface Gamer A person who pla...,"{1: 'is doing, and use this to decide where th...",{1: 'CA5 5.00 John Primary Contact Name: Phone...
What are the requirements functional non functional,{1: 'This section will cover general informati...,{1: '5.4.9 Reusability: Current version can be...,{1: '3.2 Non-Functional Requirements: Followin...,"{1: 'Manager Functions - None Class ', 2: 'F...",{1: '1. Introduction 1.1 Purpose The purpose ...,{1: '1. Introduction 1.1 Purpose The purpose ...,{1: 'also every supported graph format (such a...,{1: 'This function allows the administrator to...,{1: 'device must provide network connectivity ...,{1: '5. Dependencies with other requirements ...,...,{1: 'Software Requirements Specification (SRS)...,{1: 'model the operation of the system in deta...,{1: '5. Non-functional Requirements This sect...,{1: '● More specific warning messages such as ...,{1: '26 3.5.4 Maintainability ID: QR19 TITLE...,{1: 'overview of the functionality of the prod...,{1: 'illustrated to give clear idea of priorit...,{1: 'The ﬁrst chapter contains the Introductio...,"{1: '1.3 Acronyms, Abbreviations and Definitio...",{1: 'Ferry Ticketing System – Software Require...
What is the system architecture or what is the architecture of the software,"{1: 'To clarify the unique terminology, acrony...",{1: '5.4.2 Availability: The system is up and ...,{1: '(iii) Better component design to get bett...,{1: 'Customer: Mr. Borzoo Bonakdarpour Instru...,{1: 'Monitor screen – the software shall displ...,{1: 'Monitor screen – the software shall displ...,{1: 'software Gephi. It will explain the purpo...,"{1: 'This estimation was done, by using online...",{1: 'The development of this software system i...,{1: '4.1.4 Diagnostics or ROM N/A 4.2 Hardware...,...,{1: 'Software Requirements Specification (SRS)...,"{1: '[1] Texas Department of Transportation, “...",{1: 'The software system deﬁned in this SRS mu...,"{1: 'To ensure maximum safety, we define globa...",{1: ' [1] IEEE Software Engineering Standards ...,{1: 'Historical Society for its approval. 1.2...,{1: 'Hardware Requirements for hosting: ● Min...,{1: '1. https://en.wikipedia.org/wiki/Software...,{1: 'Architectural Styles • Pipe and filters ...,{1: '250 / 150 / 400 Cargo-A 150 / 100 / 300...
Who are the users or audience,{1: 'A user directory system is a feature that...,{1: '2.5.2 2.5.2 Device Constraint DineOut’s ...,{1: ' 2.3 User Characeristics: User should be ...,{1: 'The quantity of any particular item the i...,{1: 'User Any user of the academic portal. In...,{1: 'User Any user of the academic portal. In...,{1: ' Refresh: Applies changes made by the us...,{1: '4. User selects: 4.1. User manually sele...,"{1: 'e. Dependencies - 13, 22. 4. Interface Re...",{1: '1. Description Before being allowed acc...,...,{1: 'GUI Buttons and Screens with which the us...,{1: 'The prototype of the BCAS can be accessed...,{1: 'The system must have at least a Super-Use...,{1: 'to prevent backup collision injury. 2.3 U...,"{1: '2 3 49 users. ', 2: 'ID: FR27 Feature: ...",{1: 'functionality of the product. Both secti...,{1: 'Sl No User User Interface Name/ Number ...,{1: '2.1.1 User Interface • Every game must h...,{1: '– The user has an internet connection. 4...,{1: ' Listing of existing Users is displayed ...


In [46]:
import json
out_file = open("word2vec_output.json", "w")
  
json.dump(output_dict, out_file, indent = 6)
  
out_file.close()

In [69]:
word2vec_para_summarizer(make_the_paras(r"../../Data/SRS Dataset Clean/SRS20_removed.pdf",[3]),QUESTIONS_LIST)

{'What is the goal or aim or purpose of the project': {1: '*When\xa0new\xa0interface\xa0or\xa0component\xa0is\xa0wanted\xa0to\xa0add\xa0the\xa0system,\xa0any\xa0problem should\xa0not\xa0occur.System\xa0should\xa0be\xa0implemented\xa0in\xa0this\xa0way. 3.3.3.4\xa0RELIABILITY *If\xa0any\xa0interface\xa0or\xa0component\xa0of\xa0system\xa0does\xa0not\xa0work\xa0properly,\xa0informative message\xa0about\xa0error\xa0should\xa0be\xa0displayed\xa0to\xa0the\xa0users. *There\xa0should\xa0be\xa0a\xa0backup\xa0system\xa0for\xa0holding\xa0all\xa0stored\xa0data\xa0of\xa0system\xa0such\xa0as users,\xa0events\xa0or\xa0friendships\xa0in\xa0case\xa0of\xa0failure\xa0of\xa0the\xa0system. *This\xa0system\xa0should\xa0keep\xa0the\xa0database\xa0updated. 3.3.3.5\xa0PORTABILITY *Since\xa0the\xa0application\xa0is\xa0Android\xa0application,\xa0the\xa0system\xa0will\xa0run\xa0on\xa0any platform\xa0that\xa0has\xa0Android\xa0OS. *The\xa0version\xa0of\xa0Android\xa0Operating\xa0system\xa0on\xa0device\xa0should\xa0b

In [70]:
make_the_paras(r"../../Data/SRS Dataset Clean/SRS20_removed.pdf",[3])

['5 \u200b .PLANNING 5 \u200b .1\xa0TEAM\xa0STRUCTURE 5 \u200b .2\xa0ESTIMATED\xa0SCHEDULE 5 \u200b .3\xa0PROCESS\xa0MODEL 6 \u200b .\xa0CONCLUSION 1.\xa0INTRODUCTION This\xa0document\xa0is\xa0a\xa0Software\xa0Requirement\xa0Specification\xa0for\xa0the\xa0Android\xa0Mobile\xa0Application named\xa0“NERS”.This\xa0document\xa0is\xa0\xa0prepared\xa0by\xa0the\xa0following\xa0IEEE\xa0conventions\xa0for\xa0software requirement\xa0specification.This\xa0document\xa0includes\xa0all\xa0the\xa0functions\xa0and\xa0specifications\xa0with their\xa0explanations\xa0to\xa0solve\xa0related\xa0problems\xa0as\xa0a\xa0project\xa0of\xa0METU\xa0CENG\xa0department. 1.1\xa0PROBLEM\xa0DEFINITION In\xa0the\xa0last\xa0two\xa0decade,\xa0Internet\xa0and\xa0mobile\xa0phones\xa0have\xa0increased\xa0rapidly.\xa0Nowadays,\xa0almost all\xa0people\xa0has\xa0a\xa0\xa0mobile\xa0phone\xa0and\xa0different\xa0kind\xa0of\xa0mobile\xa0applications.This\xa0leads\xa0many simplicities\xa0on\xa0people’s\xa0life\xa0in\xa0terms\xa0of\

In [101]:
combined_ranks_output=dict()
for filepath in output_dict:
    modified_filepath=((filepath.split('/')[-1]).split('.')[0]).split('_')[0]
    
    combined_ranks_output[filepath]=dict()
    
    for question in output_dict[filepath]:
        #combined_ranks_output[filepath][question]=
        #combined_ranks_output[filepath][question]
        all_paras=''
        for rank in output_dict[filepath][question]:
            all_paras+=output_dict[filepath][question][rank]
        combined_ranks_output[filepath][question]=all_paras

In [102]:
CSV_COLUMNS=['purpose','description','scope','requirements','assumptions and dependencies','architecture','users']
output_dataframe=pd.DataFrame.from_dict(combined_ranks_output,orient='index')
output_dataframe.columns=CSV_COLUMNS
display(output_dataframe)
output_dataframe.to_csv('labelled_data.csv',encoding='utf-8')

Unnamed: 0,purpose,description,scope,requirements,assumptions and dependencies,architecture,users
../../Data/SRS Dataset Clean/SRS1.pdf,This section will cover general information ab...,"a. This describes the attributes, operations, ...",The system will have two separate sets of hard...,"Authors: Lisa Doan, Alex Besinger, Patrick McC...",This section will cover general information ab...,"To clarify the unique terminology, acronyms, a...",A user directory system is a feature that coul...
../../Data/SRS Dataset Clean/SRS10_removed.pdf,Software Requirements Specification for DineOu...,4.8.3 Functional Requirements REQ-1: System m...,System should be compatible and will smoothly ...,"The system gives ability to the admin to add, ...",5.4.9 Reusability: Current version can be used...,5.4.2 Availability: The system is up and runni...,2.5.2 2.5.2 Device Constraint DineOut’s core ...
../../Data/SRS Dataset Clean/SRS11.pdf,This product aimed toward a person who don’t w...,3.1 Functional Requirements: This section prov...,2 Principle Actors are Customer and Administra...,3.1.6 Report Generation After ordering for th...,3.2 Non-Functional Requirements: Following Non...,(iii) Better component design to get better pe...,2.3 User Characeristics: User should be famil...
../../Data/SRS Dataset Clean/SRS12.pdf,Use-Cases: None Class Diagram The purpose of ...,Type: Primary Description: Lets the Manager e...,"Variables - TimeDate:Time, Username:String, It...",(or any item) in a single transaction. • The ...,Manager Functions - None Class Functions - N...,Customer: Mr. Borzoo Bonakdarpour Instructor:...,The quantity of any particular item the invent...
../../Data/SRS Dataset Clean/SRS13_removed (1).pdf,Basic Flow: User enables the desired notificat...,"5. Detlor, B. (2000). ""The corporate portal as...",The component will be adapted to the overarchi...,privacy. 2. The user's IP will be logged. 3. ...,1. Introduction 1.1 Purpose The purpose of th...,Monitor screen – the software shall display in...,User Any user of the academic portal. Input D...
../../Data/SRS Dataset Clean/SRS13_removed.pdf,Basic Flow: User enables the desired notificat...,"5. Detlor, B. (2000). ""The corporate portal as...",The component will be adapted to the overarchi...,privacy. 2. The user's IP will be logged. 3. ...,1. Introduction 1.1 Purpose The purpose of th...,Monitor screen – the software shall display in...,User Any user of the academic portal. Input D...
../../Data/SRS Dataset Clean/SRS14_removed.pdf, Close Project: Closes the currently open pro...,version and other info. Main Pages:  Overvi...,"above, non-English tutorials etc. Additional ...",Gephi for more demanding graph analysis.  Pr...,also every supported graph format (such as gex...,software Gephi. It will explain the purpose an..., Refresh: Applies changes made by the user. ...
../../Data/SRS Dataset Clean/SRS15_removed.pdf,"familiar, in-house environment. Most people co...",SSL Syntax/Semantic Language. UI User interfa...,Displayed lines depend on Faculty and Departme...,"role User’s designation like Student, Dean, Pr...",This function allows the administrator to chan...,"This estimation was done, by using online serv...",4. User selects: 4.1. User manually selects a...
../../Data/SRS Dataset Clean/SRS16_removed.pdf,Bowtie Code 3 1. Introduction 1.1 Purpose of ...,19. The user shall be able to set the descript...,d. Risks - Low risk - simple implementation an...,Koofers is an online database of class informa...,device must provide network connectivity (both...,The development of this software system is spo...,"e. Dependencies - 13, 22. 4. Interface Require..."
../../Data/SRS Dataset Clean/SRS17_removed.pdf,Acumen Developers 3 1. Introduction 1.1 Purpo...,1. Description Before being allowed access i...,Acumen Developers 6 5. Dependencies with othe...,none 15. The CMS shall provide a function to ...,5. Dependencies with other requirements none ...,4.1.4 Diagnostics or ROM N/A 4.2 Hardware Inte...,1. Description Before being allowed access i...


In [103]:
combined_ranks_output

{'../../Data/SRS Dataset Clean/SRS1.pdf': {'What is the goal or aim or purpose of the project': 'This section will cover general information about the project perspective,  functions, and various requirements and constraints. More detailed descriptions of the project requirements and functions will be covered in latter sections. 2.1 Product Perspective  This project is designed to be used by employees of several automotive plants to Prototype V1 will have little functionality, and its main purpose will be to display  the user interface as well as to show our interpretation of the requirements. 5.2 Sample Scenarios  An employee, Bob, was given the unique four-digit-code 7924 by their The system is a desktop application that requires a computer running Windows  Vista SP2 or higher version of Windows operating system. The computer must have Visual C++ Redistributable 2010, as well as .NET Framework 4.0 installed.  Prototype V1 will have little functionality, and its main purpose will be t

In [100]:
pd.read_csv('labelled_data.csv')

Unnamed: 0.1,Unnamed: 0,purpose,description,scope,requirements,assumptions and dependencies,architecture,users
0,SRS1,This section will cover general information ab...,"a. This describes the attributes, operations, ...",The system will have two separate sets of hard...,"Authors: Lisa Doan, Alex Besinger, Patrick McC...",This section will cover general information ab...,"To clarify the unique terminology, acronyms, a...",A user directory system is a feature that coul...
1,SRS10,Software Requirements Specification for DineOu...,4.8.3 Functional Requirements REQ-1: System m...,System should be compatible and will smoothly ...,"The system gives ability to the admin to add, ...",5.4.9 Reusability: Current version can be used...,5.4.2 Availability: The system is up and runni...,2.5.2 2.5.2 Device Constraint DineOut’s core ...
2,SRS11,This product aimed toward a person who don’t w...,3.1 Functional Requirements: This section prov...,2 Principle Actors are Customer and Administra...,3.1.6 Report Generation After ordering for th...,3.2 Non-Functional Requirements: Following Non...,(iii) Better component design to get better pe...,2.3 User Characeristics: User should be famil...
3,SRS12,Use-Cases: None Class Diagram The purpose of ...,Type: Primary Description: Lets the Manager e...,"Variables - TimeDate:Time, Username:String, It...",(or any item) in a single transaction. • The ...,Manager Functions - None Class Functions - N...,Customer: Mr. Borzoo Bonakdarpour Instructor:...,The quantity of any particular item the invent...
4,SRS13,Basic Flow: User enables the desired notificat...,"5. Detlor, B. (2000). ""The corporate portal as...",The component will be adapted to the overarchi...,privacy. 2. The user's IP will be logged. 3. ...,1. Introduction 1.1 Purpose The purpose of th...,Monitor screen – the software shall display in...,User Any user of the academic portal. Input D...
5,SRS14, Close Project: Closes the currently open pro...,version and other info. Main Pages:  Overvi...,"above, non-English tutorials etc. Additional ...",Gephi for more demanding graph analysis.  Pr...,also every supported graph format (such as gex...,software Gephi. It will explain the purpose an..., Refresh: Applies changes made by the user. ...
6,SRS15,"familiar, in-house environment. Most people co...",SSL Syntax/Semantic Language. UI User interfa...,Displayed lines depend on Faculty and Departme...,"role User’s designation like Student, Dean, Pr...",This function allows the administrator to chan...,"This estimation was done, by using online serv...",4. User selects: 4.1. User manually selects a...
7,SRS16,Bowtie Code 3 1. Introduction 1.1 Purpose of ...,19. The user shall be able to set the descript...,d. Risks - Low risk - simple implementation an...,Koofers is an online database of class informa...,device must provide network connectivity (both...,The development of this software system is spo...,"e. Dependencies - 13, 22. 4. Interface Require..."
8,SRS17,Acumen Developers 3 1. Introduction 1.1 Purpo...,1. Description Before being allowed access i...,Acumen Developers 6 5. Dependencies with othe...,none 15. The CMS shall provide a function to ...,5. Dependencies with other requirements none ...,4.1.4 Diagnostics or ROM N/A 4.2 Hardware Inte...,1. Description Before being allowed access i...
9,SRS18,Figure I1-4. Referential integrity for the che...,DOCNUM C 16 (See DOCS.DBF) Evaluation of Mate...,RPTNUM C 15 (See PROJRPTS.DBF) TITLE C 254 Tit...,Evaluation of Materials Released from SRS Soft...,The testing of a software product should inclu...,The test analyst should document the test resu...,RAC RAC reports are correctly listed in the da...
