1. Import Libraries

In [62]:
import spacy
import PyPDF2
from textblob import TextBlob
import pandas as pd

2. Load spaCy Language Model

In [63]:
nlp = spacy.load("en_core_web_sm")

3. PDF Text Extraction Function

In [64]:
# Cell 3
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; it is opened in binary mode.

    Returns:
        A single string made of each page's extracted text joined with no
        separator. A page whose extraction yields a falsy value (``None`` or
        ``""``) contributes nothing.
    """
    with open(pdf_path, 'rb') as file:
        pdf_reader = PyPDF2.PdfReader(file)
        # Extract each page exactly once: extraction is the costly step, and
        # `or ""` covers pages for which nothing is returned.
        return "".join(page.extract_text() or "" for page in pdf_reader.pages)

4. Text Processing and Sentiment Analysis

In [65]:
# Cell 4
def process_text_and_analyze_sentiment(text):
    """Split ``text`` into sentences and attach a sentiment label to each.

    The text is run through the module-level spaCy pipeline ``nlp``; every
    span yielded by ``doc.sents`` is scored with TextBlob's polarity.

    Args:
        text: The raw text to analyse.

    Returns:
        A list of ``(sentence_text, label)`` tuples in document order, where
        ``label`` is "Positive" for a polarity above zero, "Negative" for a
        polarity below zero, and "Neutral" otherwise.
    """
    def _label(polarity):
        # Map the signed polarity score onto one of the three labels.
        if polarity > 0:
            return "Positive"
        if polarity < 0:
            return "Negative"
        return "Neutral"

    return [
        (span.text, _label(TextBlob(span.text).sentiment.polarity))
        for span in nlp(text).sents
    ]

5. Saving Sentences and Sentiment Analysis to Excel

In [66]:
def save_to_excel(sentences, file_path):
    """Write sentence/sentiment pairs to an Excel file.

    Args:
        sentences: Rows of two values each; they become the 'Sentence' and
            'Sentiment' columns.
        file_path: Destination passed to ``DataFrame.to_excel``.

    The DataFrame index is not written to the file.
    """
    column_names = ['Sentence', 'Sentiment']
    table = pd.DataFrame(data=sentences, columns=column_names)
    table.to_excel(file_path, index=False)


6. Requirement Extraction Using Model
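`model` is assumed to be your pre-trained model.
The `predict_requirement` function takes a sentence, the model, and the fitted tokenizer, and returns a boolean indicating whether the sentence is a requirement (true when the highest prediction confidence exceeds a threshold).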

In [67]:
# Cell 6
import pickle
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model
import numpy as np

# Load the tokenizer
# The resulting module-level `tokenizer` is read by categorize_and_save when it
# calls predict_requirement.
# NOTE(review): pickle.load can execute arbitrary code during deserialization;
# only load a tokenizer file that comes from a trusted source.
# NOTE(review): this absolute, user-specific path will not resolve on another
# machine.
with open('/Users/PremGanesh/Developer/AI/CyVidia/Requirement_extraction/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

def predict_requirement(sentence, model, tokenizer, max_length=100, threshold=0.5):
    """Decide whether ``sentence`` counts as a requirement according to ``model``.

    Args:
        sentence: The sentence text to classify.
        model: Object whose ``predict`` result unpacks into exactly two
            outputs (named area and bucket predictions here).
        tokenizer: Object providing ``texts_to_sequences``.
        max_length: Sequence length passed to ``pad_sequences`` as ``maxlen``.
            NOTE(review): presumably this must match the length used when the
            model was trained -- confirm before changing it.
        threshold: Confidence the strongest prediction must exceed. The
            default of 0.5 is an example value; adjust it for your model and
            data.

    Returns:
        ``True`` when the largest value across both model outputs is strictly
        greater than ``threshold``, otherwise ``False``.
    """
    # Tokenize and pad the single sentence into a batch of one.
    sequence = tokenizer.texts_to_sequences([sentence])
    padded_sequence = pad_sequences(sequence, maxlen=max_length)

    # The model is expected to return two outputs; unpacking fails otherwise.
    area_pred, bucket_pred = model.predict(padded_sequence)

    # The highest confidence in either output decides the result. Convert the
    # NumPy boolean to a plain bool so callers receive a built-in type.
    top_confidence = max(np.max(area_pred), np.max(bucket_pred))
    return bool(top_confidence > threshold)


7. Categorizing and Saving Requirements

In [68]:
# Cell 7
def categorize_and_save(sentences, model, req_file_path, non_req_file_path):
    """Split labelled sentences into requirements and non-requirements and save both.

    Each sentence is classified with ``predict_requirement`` using ``model``
    and the module-level ``tokenizer``.

    Args:
        sentences: Iterable of ``(sentence, sentiment)`` pairs.
        model: Model forwarded to ``predict_requirement``.
        req_file_path: Excel path for sentences classified as requirements.
        non_req_file_path: Excel path for all remaining sentences.
    """
    matched, unmatched = [], []

    for text, label in sentences:
        # Route the pair to whichever list the classifier selects.
        bucket = matched if predict_requirement(text, model, tokenizer) else unmatched
        bucket.append((text, label))

    save_to_excel(matched, req_file_path)
    save_to_excel(unmatched, non_req_file_path)

# Usage:
# Runs the whole pipeline: extract the PDF text, label every sentence with a
# sentiment, load the trained model, then write the requirement and
# non-requirement sentences to two Excel files in the working directory.
# NOTE(review): the absolute, user-specific paths below will not resolve on
# another machine.
# NOTE(review): the "How to Use" cell further down repeats these same steps, so
# running both cells performs the pipeline twice and rewrites the same files.
pdf_text = extract_text_from_pdf("/Users/PremGanesh/Developer/AI/CyVidia/Requirement_extraction/Claro.pdf")
sentences_with_sentiment = process_text_and_analyze_sentiment(pdf_text)

model_path = '/Users/PremGanesh/Developer/AI/CyVidia/Requirement_extraction/trained_model_rbi_jll_nist_scf'
model = load_model(model_path)

categorize_and_save(sentences_with_sentiment, model, "requirements1.xlsx", "non_requirements1.xlsx")




How to Use:


In [69]:
# Step 1: extract the PDF text and label each sentence with a sentiment.
# NOTE(review): the absolute, user-specific paths below will not resolve on
# another machine.
pdf_text = extract_text_from_pdf("/Users/PremGanesh/Developer/AI/CyVidia/Requirement_extraction/Claro.pdf")
sentences_with_sentiment = process_text_and_analyze_sentiment(pdf_text)

# Step 2: load the trained model. `load_model` is already imported together
# with the other Keras utilities in the requirement-extraction cell, so it is
# not re-imported here.
model_path = '/Users/PremGanesh/Developer/AI/CyVidia/Requirement_extraction/trained_model_rbi_jll_nist_scf'
model = load_model(model_path)

# Step 3: classify every sentence and write the two Excel files.
categorize_and_save(sentences_with_sentiment, model, "requirements1.xlsx", "non_requirements1.xlsx")


