In [50]:
import fitz  # PyMuPDF
from nltk.corpus import stopwords
import nltk
import numpy as np
from nltk.tokenize import word_tokenize, sent_tokenize
from sentence_transformers import SentenceTransformer
import torch
from sklearn.metrics.pairwise import cosine_similarity
nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/martin/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/martin/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/martin/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/martin/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_eng is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/martin/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_dat

True

In [51]:
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def extract_text_from_pdf(pdf_path):
    """
    Extracts text from each page of a PDF file as a list of strings.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        text_blob (str): String containing the text of each page concatenated together.
        """
    doc = fitz.open(pdf_path)
    text_blob = ""
    for page in doc:
        text_blob += page.get_text("text").replace('-\n', '').replace('\n', ' ')
    doc.close()
    return text_blob

def remove_stopwords(text):
    """
    Removes English stopwords from a given text string.

    Args:
        text (str): Input text.

    Returns:
        str: Text with stopwords removed.
    """
    stop_words = set(stopwords.words("english"))
    words = word_tokenize(text)
    filtered_words = [word for word in words if word.lower() not in stop_words]
    return " ".join(filtered_words)

def semantic_chunking(text, max_tokens=300, overlap=100):
    # Break into sentences first
    sentences = sent_tokenize(text)

    # Get sentence embeddings
    sentence_embeddings = model.encode(sentences, normalize_embeddings=True)

    # Track token count (approximate)
    token_counts = [len(sentence.split()) for sentence in sentences]

    chunks = []
    current_chunk = []
    current_token_count = 0

    for i, sentence in enumerate(sentences):
        # If adding this sentence would exceed our limit, start a new chunk
        if current_token_count + token_counts[i] > max_tokens and current_chunk:
            chunks.append(" ".join(current_chunk))

            # For overlap, find the most semantically similar sentences to include
            if overlap > 0 and len(current_chunk) > 0:
                start = i-len(current_chunk)
                # Get embeddings for current chunk sentences
                current_embs = sentence_embeddings[start:i]
                # Find sentences with highest similarity to include in overlap
                similarities = cosine_similarity([sentence_embeddings[i]], current_embs)[0]
                overlap_indices = np.argsort(similarities)[-int(overlap/10):]  # Heuristic for number of sentences
                # Add overlapping sentences to new chunk
                current_chunk = [sentences[start+idx] for idx in overlap_indices]
                current_token_count = sum(token_counts[start+idx] for idx in overlap_indices)
            else:
                current_chunk = []
                current_token_count = 0

        current_chunk.append(sentence)
        current_token_count += token_counts[i]

    # Add the last chunk if it's not empty
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [52]:
extracted_text = extract_text_from_pdf("Stock_Market_Prediction_via_Multi-Source_Multiple_Instance_Learning.pdf")
extracted_text_chunked = semantic_chunking(extracted_text, max_tokens=300, overlap=100)

In [53]:
extracted_text_chunked

['Received July 29, 2018, accepted August 27, 2018, date of publication September 13, 2018, date of current version October 8, 2018. Digital Object Identifier 10.1109/ACCESS.2018.2869735 Stock Market Prediction via Multi-Source Multiple Instance Learning XI ZHANG 1, (Member, IEEE), SIYU QU1, JIEYUN HUANG 1, BINXING FANG1, AND PHILIP YU2, (Fellow, IEEE) 1Key Laboratory of Trustworthy Distributed Computing and Service, Ministry of Education, Beijing University of Posts and Telecommunications, Beijing 100876, China 2Department of Computer Science, The University of Illinois at Chicago, Chicago, IL 60607, USA Corresponding author: Xi Zhang (zhangx@bupt.edu.cn) This work was supported in part by the State Key Development Program of Basic Research of China under Grant 2013CB329605, in part by the Natural Science Foundation of China under Grant 61300014, in part by the NSF under Grant IIS-1526499, Grant IIS-1763325, and Grant CNS-1626432, and in part by the DongGuan Innovative Research Team P

In [54]:
question = model.encode("How did you extract features?", normalize_embeddings=True)
similarities = []
for chunk in extracted_text_chunked:
    chunk_embedding = model.encode(chunk, normalize_embeddings=True)
    similarity = cosine_similarity([question], [chunk_embedding])[0][0]
    similarities.append({'similarity':similarity, 'chunk': chunk})

similarities = sorted(similarities, key=lambda x: (-x['similarity']))
similarities[:3]


[{'similarity': np.float32(0.33650464),
  'chunk': 'The process is shown in Figure 3 and described in detail as follows. FEATURE EXTRACTION The quantitative features are quite simple to extract, we just collect three indices and normalize each index to form di ∈R3×1. The output of an RBM would be a pre-trained vector used as the input to sentence2vec, and then the event representations are obtained. Here we introduce how to extract event representations from news articles and extract the sentiments from posts in social media, which are used as the inputs to M-MI framework. EVENT FEATURE EXTRACTION Conventional methods commonly represent events using simple features such as TF-IDF, noun phrases or named entities. Note that though we use the Chinese dataset in this study, this process can also be applied to other languages. Recent advances in NLP techniques enable more accurate event models with structures. Structured event extraction from texts. 1) Structured event extraction. In this s