In [44]:
import spacy
# Load the "en_core_web_sm" pipeline and print a confirmation once loading has succeeded
nlp = spacy.load("en_core_web_sm") 
print("spacy loaded successfully!") 

spacy loaded successfully!


In [None]:
import csv
import os
import re

import spacy
from spacy.matcher import Matcher

# Load the spaCy pipeline used to tokenize the proposal text
nlp = spacy.load("en_core_web_sm")

# Create a token-pattern Matcher that shares the pipeline's vocabulary
matcher = Matcher(nlp.vocab)

# Token pattern: "contractor", then "shall" (both compared on the lowercased
# token text), then a run of non-punctuation tokens. The run cannot cross a
# punctuation token, so extract_contractor_shall_statements() extends each
# match up to the next '.', '!' or '?' itself.
pattern = [
    {"LOWER": "contractor"},
    {"LOWER": "shall"},
    {"IS_PUNCT": False, "OP": "*"},  # Zero or more tokens that are not punctuation
]

# Register the pattern under the key "CONTRACTOR_SHALL"
matcher.add("CONTRACTOR_SHALL", [pattern])

# Subparagraph label that does not directly follow a word character: either a
# dotted number such as "4.1" or "4.1.1", or a single lowercase letter followed
# by a period such as "a.". Both alternatives are capturing groups so that
# split() keeps the label text in its result instead of discarding it.
_SUBPARAGRAPH_LABEL = re.compile(
    r"(?<!\w)([0-9]+\.[0-9]+(?:\.[0-9]+)?)|(?<!\w)([a-z]\.)"
)


def clean_and_split(statement):
    """
    Capitalize the first character of a statement and split it at subparagraph labels.

    The statement is stripped of surrounding whitespace, its first character is
    upper-cased, and it is then split wherever a label such as '4.1', '4.1.1'
    or 'a.' occurs. Labels are kept and returned as their own list items,
    interleaved with the text between them.

    NOTE(review): the label pattern also matches decimals such as "3.5" and the
    letters inside abbreviations such as "e.g."; confirm that is acceptable
    for the input documents.

    Args:
        statement: The full text of the "Contractor shall" statement.

    Returns:
        A list of stripped, non-empty pieces (labels and text segments) in
        their original order. An empty or whitespace-only statement yields [].
    """
    statement = statement.strip()
    if not statement:
        return []

    # Only the first character is changed; the rest of the text is left as-is
    statement = statement[0].upper() + statement[1:]

    # With two capturing groups, split() emits None for the group that did not
    # take part in a match; those and empty/whitespace pieces are dropped.
    pieces = _SUBPARAGRAPH_LABEL.split(statement)
    return [piece.strip() for piece in pieces if piece and not piece.isspace()]

def extract_contractor_shall_statements(text):
    """
    Extracts "Contractor shall" statements from the given text.

    Every "CONTRACTOR_SHALL" match reported by the module-level matcher is
    extended forward through the next '.', '!' or '?' token (or to the end of
    the text when there is none). Duplicate statements are reported once.

    Args:
        text: The input text (e.g., the contract proposal).

    Returns:
        A list of unique "Contractor shall" statements, in the order in which
        they first appear in the text.
    """
    doc = nlp(text)
    terminators = (".", "!", "?")

    seen = set()
    statements = []  # Kept alongside `seen` so the document order is preserved

    # Sort by (start, end) so the output order does not depend on the order in
    # which the matcher happens to report overlapping matches.
    for match_id, start, end in sorted(matcher(doc), key=lambda m: (m[1], m[2])):
        if nlp.vocab.strings[match_id] != "CONTRACTOR_SHALL":
            continue

        # Walk forward from the end of the match to the sentence terminator
        stop = end
        while stop < len(doc) and doc[stop].text not in terminators:
            stop += 1

        # +1 includes the terminator token itself in the extracted text
        full_statement = doc[start:stop + 1].text.strip()
        if full_statement not in seen:
            seen.add(full_statement)
            statements.append(full_statement)

    return statements

def generate_csv(extracted_statements, filename="contractor_shall_statements.csv"):
    """
    Generates a CSV with the extracted Contractor Shall statements.

    Each statement is passed through clean_and_split() and every resulting
    piece becomes one row: the piece goes in the first column ("PWS/Scope") and
    the remaining columns are left blank. The file is written as UTF-8.

    Errors are reported with a printed message instead of being raised.

    Args:
        extracted_statements: List of extracted "Contractor shall" statements.
        filename: The path where the CSV will be saved. A relative path is
            resolved against the current working directory.
    """
    header = ["PWS/Scope", "Capabilities",
              "Program Understanding/Knowledge", "Notes and Mitigation",
              "Past Performance and Content Resources", "Possible Writers/SMEs",
              "Action Item", "Assigned to"]

    try:
        # Ensure the file is saved in the current working directory
        file_path = os.path.join(os.getcwd(), filename)

        # Build every row before opening the file, so a failure while
        # splitting a statement cannot leave a half-written CSV behind.
        blank_cells = [""] * (len(header) - 1)
        rows = []
        for statement in extracted_statements:
            for sub in clean_and_split(statement):
                rows.append([sub] + blank_cells)  # Subparagraph in Column A

        # Explicit UTF-8 so non-ASCII characters in the contract text do not
        # depend on the platform's default encoding.
        with open(file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)

        print(f"CSV file '{file_path}' generated successfully!")

    except Exception as e:
        # Best-effort boundary: report the problem and let the caller carry on
        print(f"An error occurred while generating the CSV: {e}")

# Main logic to ask for user input and generate the CSV
def main():
    """
    Prompt for proposal text in a loop and write the extracted statements to a CSV.

    Each entered text is processed with extract_contractor_shall_statements()
    and the result is written with generate_csv() using its default filename,
    so every round overwrites the previous file. The loop ends when the user
    enters 'quit' (any letter case, surrounding whitespace ignored) or when
    standard input is closed.
    """
    while True:
        # Ask the user for input text
        try:
            proposal_text = input("Please enter the proposal text or 'quit' to exit: ")
        except EOFError:
            # stdin closed (e.g. Ctrl-D or exhausted piped input): exit like 'quit'
            break

        if proposal_text.strip().lower() == 'quit':
            break

        # Extract "Contractor shall" statements
        extracted_statements = extract_contractor_shall_statements(proposal_text)

        # Generate the CSV with the extracted statements
        generate_csv(extracted_statements)

# Start the interactive prompt loop only when this module is run as the main program
if __name__ == "__main__":
    main()