In [1]:

from llama_index.core.schema import MetadataMode
from llama_index.llms.ollama import Ollama

In [2]:
from llama_index.llms.openai import OpenAI

In [3]:

import nest_asyncio

# Patch asyncio so that event loops can be nested. The notebook kernel already
# runs its own loop, and the async pipeline call further down runs inside it.
nest_asyncio.apply()

In [4]:

# LLM handle shared by every metadata extractor configured below; uses the
# "tinyllama" model through the Ollama integration.
llm = Ollama(model="tinyllama")

In [5]:
from llama_index.core.extractors import (
    SummaryExtractor,
    QuestionsAnsweredExtractor,
    TitleExtractor,
    KeywordExtractor,
    BaseExtractor,
)

In [6]:
from llama_index.extractors.entity import EntityExtractor
from llama_index.core.node_parser import TokenTextSplitter


In [7]:

# Chunk documents on single spaces; chunk size and overlap are not passed, so
# the splitter's own defaults apply.
text_splitter = TokenTextSplitter(separator=" ")


In [8]:
class CustomExtractor(BaseExtractor):
    """Metadata extractor that joins a node's title and keywords into one field.

    For every node it emits a dict with a single ``"custom"`` key whose value
    is the node's ``document_title`` and ``excerpt_keywords`` metadata joined
    by a newline. Both keys must already exist on the node, so this extractor
    has to run after whatever populates them (presumably ``TitleExtractor``
    and ``KeywordExtractor`` -- verify against the llama_index version in use).
    """

    def extract(self, nodes):
        """Build the ``custom`` metadata entry for each node, synchronously.

        Args:
            nodes: Iterable of nodes, each exposing a ``metadata`` mapping.

        Returns:
            A list with one ``{"custom": str}`` dict per node, in input order.

        Raises:
            KeyError: If a node lacks ``document_title`` or ``excerpt_keywords``.
        """
        return [
            {
                "custom": (
                    node.metadata["document_title"]
                    + "\n"
                    + node.metadata["excerpt_keywords"]
                )
            }
            for node in nodes
        ]

    async def aextract(self, nodes):
        """Async counterpart of :meth:`extract`.

        ``llama_index.core``'s ``BaseExtractor`` declares ``aextract`` as its
        abstract method, so without this override the class cannot be
        instantiated. The work is pure in-memory string concatenation, so it
        simply delegates to the synchronous implementation.

        Args:
            nodes: Iterable of nodes, each exposing a ``metadata`` mapping.

        Returns:
            The same list that ``extract(nodes)`` returns.
        """
        return self.extract(nodes)

In [9]:
# Metadata extractors, all driven by the shared `llm`. Two entries are
# currently disabled and kept only as commented-out placeholders.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
    # EntityExtractor(prediction_threshold=0.5),
    SummaryExtractor(summaries=["prev", "self"], llm=llm),
    KeywordExtractor(keywords=10, llm=llm),
    # CustomExtractor(),
]

# The splitter goes first so the extractors operate on the resulting chunks.
transformations = [text_splitter, *extractors]

In [10]:
from llama_index.core import SimpleDirectoryReader

In [11]:

# Replace with the path to your own PDF.
PDF_PATH = "C:/Users/ponna/Downloads/REPORT_24114065.pdf"

# Which loaded documents to ingest, as slices into the reader's result list
# (assumes the reader yields one document per PDF page -- confirm for your file).
FRONT_PAGES = slice(0, 3)
CONTENT_PAGES = slice(63, 69)

# Keep the full load under its own name so `docs` is only ever the selection.
all_docs = SimpleDirectoryReader(input_files=[PDF_PATH]).load_data()
front_pages = all_docs[FRONT_PAGES]
content = all_docs[CONTENT_PAGES]

if not content:
    # Slicing past the end of a list returns [] instead of raising, so a PDF
    # shorter than the configured range would otherwise be truncated silently.
    print(
        f"Warning: documents {CONTENT_PAGES.start}-{CONTENT_PAGES.stop} are out of "
        f"range (only {len(all_docs)} loaded); ingesting front pages only."
    )

docs = front_pages + content

In [12]:

# NOTE(review): this cell repeats the import and apply() call already made in
# cell In [3]; it looks redundant and is probably safe to delete -- confirm.
import nest_asyncio

nest_asyncio.apply()

In [13]:
from llama_index.core.ingestion import IngestionPipeline

# Assemble the ingestion pipeline from the splitter-plus-extractors list.
pipeline = IngestionPipeline(
    transformations=transformations,
)


In [15]:
# Run the pipeline asynchronously over the selected documents. The top-level
# `await` is written for the notebook kernel, not for a plain Python script.
nodes = await pipeline.arun(documents=docs)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:27<00:00,  9.21s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:11<00:00,  3.88s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:10<00:00,  3.55s/it]
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:14<00:00,  4.91s/it]


In [16]:
# Number of nodes the pipeline produced.
len(nodes)

3

In [17]:
# Inspect the "section_summary" metadata on the first node (presumably written
# by SummaryExtractor -- verify against the library version in use).
nodes[0].metadata["section_summary"]

'The project aims to improve credit risk management frameworks by developing a forward-looking classification model with behavioral data and risk-based techniques using data sources such as historical behavioral data, financial indicator data, and machine learning techniques. The project involves analyzing and comparing multiple models based on business risk trade-offs and selecting an appropriate performance metric like F1-score or precision to guide final prediction. It also conducts in-depth analysis of behavioral treaties, engineers meaningful features, and evaluates and optimizes classification thresholds based on business risk traditions, selects final predictions using real-world credit risk managemet goals, and generates risk-informed predictions on an unlabelled validation set.'

In [18]:
# Same node-count check as cell In [16]; nothing has changed `nodes` in between.
len(nodes)

3

In [19]:
# Dump each node's full metadata dict, followed by a dashed rule so that
# consecutive nodes are easy to tell apart in the output.
for node in nodes:
    print(
        node.metadata,
        "-------------------------------------------------------------------------------------------------------------",
        sep="\n",
    )

{'page_label': '1', 'file_name': 'REPORT_24114065.pdf', 'file_path': 'C:\\Users\\ponna\\Downloads\\REPORT_24114065.pdf', 'file_type': 'application/pdf', 'file_size': 6407402, 'creation_date': '2025-06-16', 'last_modified_date': '2025-06-16', 'document_title': 'Title: Project 2: Credit Card Behavior Score Prediction Using Classification and Risky Techniques\n\nOverview: This project aims to improve the credit risk management framework of a bank by developing a forward-looking classification model using behavioral data and risk-based techniques. The goal is to create an interpretable model that helps the bank manage credit exposure and predict customer default patterns, which will enable them to handle unprecended volumes of financial data and customer behavior data with confidence. In this project, several data sources such as historical behavioral data, financial indicator data, machine learning techniques like classification methods and ensemble methods will be analyzed and compared.'