In [None]:
!pip install PyMuPDF #PyMuPDF, imported as fitz, is used to extract text from PDF files
!pip install transformers #effective for generating summaries due to its strong contextual understanding.
!pip install spacy #provides a fast,accurate solution for identifying named entities, which is key for understanding roles,locations,org in documents
!pip install gensim #designed for handling large text data efficiently, which makes it suitable for topic modeling
!pip install keybert #leverages BERT embeddings to extract keywords from the document by finding words or phrases most similar to the overall text.



## Extract text from the PDF



In [None]:
import fitz  # PyMuPDF

pdf_path = '/content/HLRBO_Template_Lease_Document.pdf'

# Read every page in order and concatenate the extracted text into one string.
with fitz.open(pdf_path) as pdf:
    text_content = ''.join(page.get_text() for page in pdf)

# Sanity check: show the first 500 characters of the extracted text
preview = text_content[:500]
print(preview)


The t5-small model is a smaller version of T5 (Text-To-Text Transfer Transformer), developed by Google Research. T5 is a versatile Transformer model that treats all NLP tasks as text-to-text problems, meaning that both the input and output are in text format.

In [None]:
from transformers import pipeline  # factory for ready-to-use inference pipelines

# Summarization pipeline backed by the t5-small checkpoint
summarizer = pipeline("summarization", model="t5-small")

# Only the first 1000 characters are summarized to keep the demo short
excerpt = text_content[:1000]
summary = summarizer(excerpt, max_length=100, min_length=30, do_sample=False)

# Show the 'summary_text' field of the first (and only) result
summary_text = summary[0]['summary_text']
print("Summary:", summary_text, sep="\n")


In [None]:
# Topic detection with gensim (a library for topic modeling and document similarity).
from gensim import corpora  # corpora.Dictionary maps each unique token to an integer id
from gensim.models import LdaModel  # discovers topics from word co-occurrence patterns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk

# Download NLTK resources. Recent NLTK releases load the tokenizer data from
# 'punkt_tab' rather than 'punkt', so both are requested; an unknown resource
# name on an older release is reported as a failed download, not an exception.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource, quiet=True)

# Preprocess text: lowercase, keep purely alphabetic tokens, drop English stop words
stop_words = set(stopwords.words('english'))
tokens = [
    word.lower()
    for word in word_tokenize(text_content)
    if word.isalpha() and word.lower() not in stop_words
]

# Create dictionary and bag-of-words corpus for LDA. The whole PDF is treated
# as a single document: word frequencies are kept, word order is discarded.
dictionary = corpora.Dictionary([tokens])
corpus = [dictionary.doc2bow(tokens)]

# Build the LDA model. LDA training is randomized, so the seed is fixed to make
# the detected topics (and any interpretation written about them) reproducible
# across "Restart & Run All".
lda_model = LdaModel(
    corpus,
    num_topics=2,
    id2word=dictionary,
    passes=10,
    random_state=42,
)

# Print each topic as a weighted list of its most probable words
print("Topics Detected:")
for idx, topic in lda_model.print_topics(-1):
    print(f"Topic {idx + 1}: {topic}")


Topic 1 could be labeled "Lease Obligations and Parties" due to the high weights for words like "lessee," "lessor," and "lease."

Topic 2 might relate to "Agreement Conditions" or "Contractual Terms" as it contains terms like "agreement" and "premises."

The LDA model doesn’t provide a single-word label but rather a distribution of words that allows you to infer what the topic represents.

An n-gram is a sequence of n words:

- 1-gram (unigram): a single word (e.g., "agreement", "lease").
- 2-gram (bigram): a two-word phrase (e.g., "lease agreement", "landowner hereinafter").

In [None]:
# Keyword extraction
from keybert import KeyBERT  # ranks candidate words/phrases by relevance to the text

kw_model = KeyBERT()

# Candidates are single words and two-word phrases taken from the first 1000
# characters, with English stop words removed; the ten best are kept.
keywords = kw_model.extract_keywords(
    text_content[:1000],
    keyphrase_ngram_range=(1, 2),
    stop_words='english',
    top_n=10,
)

# Each result is a (phrase, score) pair; only the phrase is shown
print("Keywords:")
for phrase, _score in keywords:
    print(phrase)


In [None]:
import spacy  # used here for named-entity recognition on the unstructured document text

# Run the small English pipeline over the full document text
nlp = spacy.load("en_core_web_sm")
doc = nlp(text_content)

# One line per detected entity: "<text> - <label>"
print("Named Entities:")
for entity in doc.ents:
    print(entity.text, "-", entity.label_)


In [None]:
# Gather the results of the earlier cells into a single metadata record
metadata = {}
metadata["Summary"] = summary[0]['summary_text']
metadata["Topics"] = [description for _topic_id, description in lda_model.print_topics(-1)]
metadata["Keywords"] = [entry[0] for entry in keywords]
metadata["Named Entities"] = [(entity.text, entity.label_) for entity in doc.ents]

# Print every section under its own heading, followed by a blank line
print("\nExtracted Metadata:")
for section in metadata:
    print(f"{section}:\n{metadata[section]}\n")



In [None]:
from collections import Counter
from nltk.tokenize import word_tokenize
import nltk

# Tokenizer data: recent NLTK releases read 'punkt_tab', older ones 'punkt',
# so both are requested (an unknown name is reported as a failed download,
# not raised as an exception).
for resource in ('punkt', 'punkt_tab'):
    nltk.download(resource, quiet=True)

# Convert the keywords list to lowercase for consistency
keyword_list = [kw[0].lower() for kw in keywords]
print("Extracted Keywords:", keyword_list)  # Debugging output

# Tokenize the document content (alphabetic tokens only) and lowercase it
tokens = [word.lower() for word in word_tokenize(text_content) if word.isalpha()]
print("Tokens from Document:", tokens[:50])  # Show first 50 tokens for reference

# The keywords were extracted with keyphrase_ngram_range=(1, 2), so some of
# them are two-word phrases. Comparing single tokens against the keyword list
# can never match those, so every run of 1..max_phrase_len consecutive tokens
# is compared instead. Single-word keywords match exactly as before.
keyword_set = set(keyword_list)
max_phrase_len = max((len(phrase.split()) for phrase in keyword_set), default=1)

filtered_tokens = []
for start in range(len(tokens)):
    for size in range(1, max_phrase_len + 1):
        if start + size > len(tokens):
            break
        candidate = " ".join(tokens[start:start + size])
        if candidate in keyword_set:
            filtered_tokens.append(candidate)
print("Filtered Tokens (Matching Keywords):", filtered_tokens)  # Debugging output

# Count occurrences of each keyword / keyphrase among the matches
keyword_counts = Counter(filtered_tokens)

# Display keyword frequencies
print("Keyword Frequencies:")
for keyword, count in keyword_counts.items():
    print(f"{keyword}: {count}")

# Keywords that never occur as consecutive alphabetic tokens are not listed.
# NOTE(review): the keyword extractor was given stop_words='english', so a
# phrase may span a removed stop word and still be missed here — confirm.


In [None]:
# Function to count approximate occurrences of each keyword as a substring
def count_approximate_matches(text, keywords):
    """Count case-insensitive substring occurrences of each keyword in ``text``.

    Matching is by plain substring, so a keyword is also counted when it
    appears inside a longer word (e.g. "lease" inside "released").
    Occurrences are non-overlapping, as with ``str.count``.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings.

    Returns:
        dict mapping each keyword (as given) to its occurrence count. A
        keyword that is the empty string is reported as 0 instead of the
        meaningless ``len(text) + 1`` that ``str.count("")`` would yield.
    """
    # Lowercase the text once rather than once per keyword
    haystack = text.lower()
    counts = {}
    for keyword in keywords:
        needle = keyword.lower()
        counts[keyword] = haystack.count(needle) if needle else 0
    return counts

# Case-insensitive substring frequency of every extracted keyword
keyword_counts = count_approximate_matches(text_content, keyword_list)

# Report one "keyword: count" line per keyword
print("Approximate Keyword Frequencies:")
for term in keyword_counts:
    print(f"{term}: {keyword_counts[term]}")



In [None]:
import re

# Function to count approximate matches of each keyword by ignoring special characters and spacing differences
def count_approximate_matches_flexible(text, keywords):
    """Count keyword occurrences after normalizing punctuation, spacing and case.

    Both the text and each keyword are normalized the same way: every
    character that is not an ASCII letter, digit or whitespace is removed,
    each run of whitespace (spaces, tabs, line breaks) is collapsed to a
    single space, and the result is lowercased. Collapsing whitespace lets a
    multi-word keyword match even when the source text breaks it across
    lines or separates the words with several spaces.

    Args:
        text: The text to search.
        keywords: Iterable of keyword strings.

    Returns:
        dict mapping each keyword (as given) to its non-overlapping substring
        count in the normalized text. A keyword that is empty after
        normalization is reported as 0.
    """
    def _normalize(raw):
        # Strip special characters first, then unify whitespace and case
        without_specials = re.sub(r'[^a-zA-Z0-9\s]', '', raw)
        return re.sub(r'\s+', ' ', without_specials).strip().lower()

    text_cleaned = _normalize(text)
    counts = {}

    for keyword in keywords:
        keyword_cleaned = _normalize(keyword)
        # Guard: str.count("") would return len(text_cleaned) + 1
        counts[keyword] = text_cleaned.count(keyword_cleaned) if keyword_cleaned else 0

    return counts

# Keyword frequencies using the punctuation-insensitive matcher
keyword_counts = count_approximate_matches_flexible(text_content, keyword_list)

# Report one "keyword: count" line per keyword
print("Approximate Keyword Frequencies (Flexible):")
for term in keyword_counts:
    print(f"{term}: {keyword_counts[term]}")


In [None]:
from transformers import pipeline

# Initialize sentiment analysis with an explicitly named checkpoint, so the
# result does not depend on whichever default the installed transformers
# version picks when no model is given.
# NOTE(review): this is believed to be the library's default checkpoint for
# the "sentiment-analysis" task — confirm against the installed version.
sentiment_analyzer = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
)

# Get sentiment for the first 1000 characters of the document.
# truncation=True keeps the input within the model's maximum sequence length
# should the excerpt tokenize to more tokens than the model accepts.
sentiment = sentiment_analyzer(text_content[:1000], truncation=True)

print("Sentiment Analysis:")
print(sentiment[0])


The negative sentiment score likely stems from the legal and restrictive language commonly used in documents like lease agreements. Here’s why the model may interpret it as "negative":

Restrictive Terms: Phrases like “Lessee shall be liable for…,” “no refunds…,” and “will be forfeited” often imply penalties, limitations, and consequences, which can be perceived as negative.

In [None]:

!pip install textstat

import textstat

# Calculate readability score
readability_score = textstat.flesch_reading_ease(text_content)

print("Readability Score (Flesch Reading Ease):", readability_score)


This score helps us understand the level of reading skill required to comprehend the document, indicating how accessible or complex the content is for readers.
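Flesch Reading Ease scores run roughly from 0 to 100, with higher scores meaning easier text. A score of 46.03 falls in the 30–50 band usually described as "difficult" (college level), which is consistent with the complex vocabulary and long sentences common in formal, legal, or technical writing.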

In [None]:
from sentence_transformers import SentenceTransformer, util

# Initialize model
model = SentenceTransformer('all-MiniLM-L6-v2')
query = "lease agreement"

# Split the document into lines and drop blank / whitespace-only ones:
# PDF-extracted text contains many of them, and embedding them wastes work
# and lets empty strings compete in the ranking.
sentences = [line.strip() for line in text_content.split('\n') if line.strip()]

top_sentences = []
if sentences:  # nothing to embed or rank for an empty document
    # Embed document lines and the query
    sentence_embeddings = model.encode(sentences)
    query_embedding = model.encode(query)

    # Cosine similarity of the query against every line (one row of scores)
    similarities = util.cos_sim(query_embedding, sentence_embeddings)

    # Convert to plain floats before sorting, then keep the 5 best (index, score) pairs
    scores = similarities[0].tolist()
    top_sentences = sorted(enumerate(scores), key=lambda pair: pair[1], reverse=True)[:5]

print("Most Relevant Sentences:")
for idx, similarity in top_sentences:
    print(sentences[idx])
