In [None]:
pip install pdfplumber 


In [15]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Check if text was extracted from the page
                pdf_text += page_text + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    # Split the text on sentence-ending punctuation and newline characters
    sentences = re.split(r'(?<=[.!?]) +|\n+', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences, threshold_80=0.8):
    # Fit and transform the target and all sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Debugging output
    print(f"Vector shape: {vectors.shape}")
    print(f"Number of sentences: {len(sentences)}")
    print(f"Length of target_vector: {len(target_vector)}")

    # Ensure alignment between target_vector and sentences
    if len(target_vector) - 1 > len(sentences):
        raise ValueError("Vector size mismatch: target_vector contains more elements than sentences.")

    # Filter sentences based on similarity thresholds (80% or higher)
    similar_sentences_80 = [
        sentences[i - 1]  # Adjust indexing to align with sentences
        for i, score in enumerate(target_vector[1:], 1)
        if i - 1 < len(sentences) and score >= threshold_80
    ]

    return similar_sentences_80

# Example usage
pdf_path = '/Users/sanjju/Downloads/test.pdf'
target = "Sanjjushri"

# Step 4: Extract, split, and find similar sentences
text = extract_text_from_pdf(pdf_path)
print("Extracted text preview:", text[:500])  # Debugging step

if not text.strip():
    raise ValueError("No text extracted from the PDF. Please check the PDF content or format.")

sentences = split_into_sentences(text)
print("Extracted sentences:", sentences)  # Debugging step

if len(sentences) == 0:
    raise ValueError("No sentences were found in the extracted text.")

# Find similar sentences and filter by similarity thresholds
similar_sentences_80 = find_similar_statements(target, sentences)

# Print and store sentences with similarity >= 80%
if similar_sentences_80:
    print("\nSentences with similarity >= 80%:")
    for sentence in similar_sentences_80:
        print(f"Sentence: {sentence}")
else:
    print("\nNo sentences with 80% similarity or higher.")


Extracted text preview: I am Sanjjushri
I love apple
I am an Entrepreneur
I am a Millionaire
I am a Youtuber
I love waterfalls 
Extracted sentences: ['I am Sanjjushri', 'I love apple', 'I am an Entrepreneur', 'I am a Millionaire', 'I am a Youtuber', 'I love waterfalls']
Vector shape: (7, 9)
Number of sentences: 6
Length of target_vector: 7

Sentences with similarity >= 80%:
Sentence: I am Sanjjushri


In [16]:
similar_sentences_80

['I am Sanjjushri']

In [13]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Check if text was extracted from the page
                pdf_text += page_text + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    # Split the text on sentence-ending punctuation and newline characters
    sentences = re.split(r'(?<=[.!?]) +|\n+', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences, threshold_70=0.7):
    # Fit and transform the target and all sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Debugging output
    print(f"Vector shape: {vectors.shape}")
    print(f"Number of sentences: {len(sentences)}")
    print(f"Length of target_vector: {len(target_vector)}")

    # Ensure alignment between target_vector and sentences
    if len(target_vector) - 1 > len(sentences):
        raise ValueError("Vector size mismatch: target_vector contains more elements than sentences.")

    # Filter sentences based on similarity thresholds
    similar_sentences_70 = [
        (sentences[i - 1], score)  # Adjust indexing to align with sentences
        for i, score in enumerate(target_vector[1:], 1)
        if i - 1 < len(sentences) and score >= threshold_70
    ]

    # similar_sentences_80 = [
    #     (sentences[i - 1], score)  # Adjust indexing to align with sentences
    #     for i, score in enumerate(target_vector[1:], 1)
    #     if i - 1 < len(sentences) and score >= threshold_80
    # ]

    return similar_sentences_70

# Example usage
pdf_path = '/Users/sanjju/Downloads/test.pdf'
target = "Sanjjushri"

# Step 4: Extract, split, and find similar sentences
text = extract_text_from_pdf(pdf_path)
print("Extracted text preview:", text[:500])  # Debugging step

if not text.strip():
    raise ValueError("No text extracted from the PDF. Please check the PDF content or format.")

sentences = split_into_sentences(text)
print("Extracted sentences:", sentences)  # Debugging step

if len(sentences) == 0:
    raise ValueError("No sentences were found in the extracted text.")

# Find similar sentences and filter by similarity thresholds
similar_sentences_70 = find_similar_statements(target, sentences)

# Print and store sentences with similarity >= 70%
if similar_sentences_70:
    print("\nSentences with similarity >= 70%:")
    for sentence, similarity in similar_sentences_70:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("\nNo sentences with 70% similarity or higher.")

# Print and store sentences with similarity >= 80%
# if similar_sentences_80:
#     print("\nSentences with similarity >= 80%:")
#     for sentence, similarity in similar_sentences_80:
#         print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
# else:
#     print("\nNo sentences with 80% similarity or higher.")


Extracted text preview: I am Sanjjushri
I love apple
I am an Entrepreneur
I am a Millionaire
I am a Youtuber
I love waterfalls 
Extracted sentences: ['I am Sanjjushri', 'I love apple', 'I am an Entrepreneur', 'I am a Millionaire', 'I am a Youtuber', 'I love waterfalls']
Vector shape: (7, 9)
Number of sentences: 6
Length of target_vector: 7

Sentences with similarity >= 70%:
Similarity: 0.80 - Sentence: I am Sanjjushri


In [14]:
similar_sentences_70

[('I am Sanjjushri', 0.803028938037097)]

In [9]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Check if text was extracted from the page
                pdf_text += page_text + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    # Split the text on sentence-ending punctuation and newline characters
    sentences = re.split(r'(?<=[.!?]) +|\n+', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences, threshold=0.7):
    # Fit and transform the target and all sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Debugging output
    print(f"Vector shape: {vectors.shape}")
    print(f"Number of sentences: {len(sentences)}")
    print(f"Length of target_vector: {len(target_vector)}")

    # Ensure alignment between target_vector and sentences
    if len(target_vector) - 1 > len(sentences):
        raise ValueError("Vector size mismatch: target_vector contains more elements than sentences.")

    # Filter sentences with cosine similarity >= 0.7
    similar_sentences = [
        (sentences[i - 1], score)  # Adjust indexing to align with sentences
        for i, score in enumerate(target_vector[1:], 1)
        if i - 1 < len(sentences) and score >= threshold
    ]

    return similar_sentences

# Example usage
pdf_path = '/Users/sanjju/Downloads/test.pdf'
target = "Sanjjushri"

# Step 4: Extract, split, and find similar sentences
text = extract_text_from_pdf(pdf_path)
print("Extracted text preview:", text[:500])  # Debugging step

if not text.strip():
    raise ValueError("No text extracted from the PDF. Please check the PDF content or format.")

sentences = split_into_sentences(text)
print("Extracted sentences:", sentences)  # Debugging step

if len(sentences) == 0:
    raise ValueError("No sentences were found in the extracted text.")

# Find similar sentences and filter by similarity threshold
similar_sentences = find_similar_statements(target, sentences)

# Print and store sentences with similarity >= 70%
if similar_sentences:
    print("Sentences with similarity >= 70%:")
    for sentence, similarity in similar_sentences:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("No sentences with 70% similarity or higher.")


Extracted text preview: I am Sanjjushri
I love apple
I am an Entrepreneur
I am a Millionaire
I am a Youtuber
I love waterfalls 
Extracted sentences: ['I am Sanjjushri', 'I love apple', 'I am an Entrepreneur', 'I am a Millionaire', 'I am a Youtuber', 'I love waterfalls']
Vector shape: (7, 9)
Number of sentences: 6
Length of target_vector: 7
Sentences with similarity >= 70%:
Similarity: 0.80 - Sentence: I am Sanjjushri


In [10]:
similar_sentences

[('I am Sanjjushri', 0.803028938037097)]

In [8]:
import pdfplumber
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from the PDF using pdfplumber
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with pdfplumber.open(file_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            if page_text:  # Check if text was extracted from the page
                pdf_text += page_text + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    # Split the text on sentence-ending punctuation and newline characters
    sentences = re.split(r'(?<=[.!?]) +|\n+', text)
    return [sentence.strip() for sentence in sentences if sentence.strip()]

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences):
    # Fit and transform the target and all sentences into TF-IDF vectors
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Debugging output
    print(f"Vector shape: {vectors.shape}")
    print(f"Number of sentences: {len(sentences)}")
    print(f"Length of target_vector: {len(target_vector)}")

    # Ensure alignment between target_vector and sentences
    if len(target_vector) - 1 > len(sentences):
        raise ValueError("Vector size mismatch: target_vector contains more elements than sentences.")

    # Rank sentences by similarity
    similar_sentences = [
        (sentences[i - 1], score)  # Adjust indexing to align with sentences
        for i, score in enumerate(target_vector[1:], 1)
        if i - 1 < len(sentences)
    ]

    return sorted(similar_sentences, key=lambda x: x[1], reverse=True)

# Run the function and check for issues
results = find_similar_statements(target, sentences)

# Print top 5 most similar sentences
if results:
    for sentence, similarity in results[:5]:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("No similar sentences found.")


# Run the function and check for issues
results = find_similar_statements(target, sentences)

# Print top 5 most similar sentences
if results:
    for sentence, similarity in results[:5]:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("No similar sentences found.")


# Example usage
pdf_path = '/Users/sanjju/Downloads/test.pdf'
target = "Sanjjushri"

# Step 4: Extract, split, and find similar sentences
text = extract_text_from_pdf(pdf_path)
print("Extracted text preview:", text[:500])  # Debugging step

if not text.strip():
    raise ValueError("No text extracted from the PDF. Please check the PDF content or format.")

sentences = split_into_sentences(text)
print("Extracted sentences:", sentences)  # Debugging step

if len(sentences) == 0:
    raise ValueError("No sentences were found in the extracted text.")

results = find_similar_statements(target, sentences)

# Print top 5 most similar sentences
if results:
    for sentence, similarity in results[:5]:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("No similar sentences found.")



Vector shape: (7, 13)
Number of sentences: 6
Length of target_vector: 7
Similarity: 0.00 - Sentence: I am Sanjjushri
Similarity: 0.00 - Sentence: I love apple
Similarity: 0.00 - Sentence: I am an Entrepreneur
Similarity: 0.00 - Sentence: I am a Millionaire
Similarity: 0.00 - Sentence: I am a Youtuber
Vector shape: (7, 13)
Number of sentences: 6
Length of target_vector: 7
Similarity: 0.00 - Sentence: I am Sanjjushri
Similarity: 0.00 - Sentence: I love apple
Similarity: 0.00 - Sentence: I am an Entrepreneur
Similarity: 0.00 - Sentence: I am a Millionaire
Similarity: 0.00 - Sentence: I am a Youtuber
Extracted text preview: I am Sanjjushri
I love apple
I am an Entrepreneur
I am a Millionaire
I am a Youtuber
I love waterfalls 
Extracted sentences: ['I am Sanjjushri', 'I love apple', 'I am an Entrepreneur', 'I am a Millionaire', 'I am a Youtuber', 'I love waterfalls']
Vector shape: (7, 9)
Number of sentences: 6
Length of target_vector: 7
Similarity: 0.80 - Sentence: I am Sanjjushri
Similarit

In [None]:
text = extract_text_from_pdf(pdf_path)
if not text.strip():
    raise ValueError("No text extracted from the PDF. Please check the PDF content or format.")

sentences = split_into_sentences(text)
if len(sentences) == 0:
    raise ValueError("No sentences were found in the extracted text.")

results = find_similar_statements(target, sentences)

# Print top 5 most similar sentences
if results:
    for sentence, similarity in results[:5]:
        print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
else:
    print("No similar sentences found.")


In [None]:
import PyPDF2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from PDF
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # Check if text was extracted
                pdf_text += page_text + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    if not text.strip():  # Check if the text is empty or whitespace
        return []
    sentences = re.split(r'(?<=[.!?]) +', text)
    return sentences

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences):
    if not sentences:
        print("No sentences found in the PDF.")
        return []
    
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Rank sentences by similarity (skip the first as it is the target itself)
    similar_sentences = sorted(
        enumerate(target_vector[1:], 1), key=lambda x: x[1], reverse=True
    )
    return [(sentences[i], score) for i, score in similar_sentences]

# Example usage
pdf_path = 'path/to/your/pdf_file.pdf'
target = 'Your target statement to compare against.'

text = extract_text_from_pdf(pdf_path)
if not text:
    print("Failed to extract text from the PDF.")
else:
    sentences = split_into_sentences(text)
    results = find_similar_statements(target, sentences)

    # Print top 5 most similar sentences
    if results:
        for sentence, similarity in results[:5]:
            print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
    else:
        print("No similar statements found.")


In [None]:
import PyPDF2
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Extract text from PDF
def extract_text_from_pdf(file_path):
    pdf_text = ""
    with open(file_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            pdf_text += page.extract_text() + " "
    return pdf_text

# Step 2: Split the text into sentences
def split_into_sentences(text):
    sentences = re.split(r'(?<=[.!?]) +', text)
    return sentences

# Step 3: Find similar sentences using cosine similarity
def find_similar_statements(target_statement, sentences):
    vectorizer = TfidfVectorizer().fit_transform([target_statement] + sentences)
    vectors = vectorizer.toarray()
    cosine_matrix = cosine_similarity(vectors)
    target_vector = cosine_matrix[0]  # First row corresponds to the target statement

    # Rank sentences by similarity (skip the first as it is the target itself)
    similar_sentences = sorted(
        enumerate(target_vector[1:], 1), key=lambda x: x[1], reverse=True
    )
    return [(sentences[i], score) for i, score in similar_sentences]

# Example usage
pdf_path = 'path/to/your/pdf_file.pdf'
target = 'Your target statement to compare against.'

text = extract_text_from_pdf(pdf_path)
sentences = split_into_sentences(text)
results = find_similar_statements(target, sentences)

# Print top 5 most similar sentences
for sentence, similarity in results[:5]:
    print(f"Similarity: {similarity:.2f} - Sentence: {sentence}")
