In [None]:
!pip install fpdf

Collecting fpdf
  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py) ... [?25l[?25hdone
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40704 sha256=6416bed4b91e74f3eb1704a69fdebd8ef20355de14ba06b3b0acab4e9997a931
  Stored in directory: /root/.cache/pip/wheels/65/4f/66/bbda9866da446a72e206d6484cd97381cbc7859a7068541c36
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [None]:
from fpdf import FPDF

# Create a class inheriting from FPDF to format the PDF
class PDF(FPDF):
    """FPDF subclass that draws a fixed title and a page number on every page."""

    def header(self):
        """Draw the bold, centred report title, then move to the next line."""
        self.set_font('Arial', style='B', size=12)
        self.cell(w=0, h=10, txt='LiDAR-Based Tree Height Classification', border=0, ln=1, align='C')

    def footer(self):
        """Draw the italic, centred page number 15 units above the bottom edge."""
        self.set_y(-15)
        self.set_font('Arial', style='I', size=10)
        page_label = f'Page {self.page_no()}'
        self.cell(w=0, h=10, txt=page_label, border=0, ln=0, align='C')

# Build the document. Automatic page breaks are triggered once content comes
# within 15 units of the bottom edge, which leaves room for the footer.
pdf = PDF()
pdf.set_auto_page_break(auto=True, margin=15)

# Start the first page.
pdf.add_page()

# Title. At 16 pt bold the full title is much wider than one line, and a single
# cell() neither wraps nor clips its text, so the title used to run off both
# page edges. multi_cell() with width 0 spans the printable width and wraps.
pdf.set_font("Arial", size=16, style='B')
pdf.multi_cell(0, 10, txt="LiDAR-Based Tree Height Classification: A Study of Remote Sensing and Data Analytics", align='C')

# Body text, one entry per section of the report, in reading order.
section_texts = (
    # Introduction
    "LiDAR (Light Detection and Ranging) is a remote sensing technology that uses laser light to measure distances to the Earth's surface. It has gained significant attention in the field of geospatial data analysis due to its high accuracy and ability to capture fine details in topography, vegetation, and other features of the landscape. One of the key applications of LiDAR is in the classification of tree heights, which is vital for forest management, ecological studies, and environmental monitoring.\n\n"
    "In this project, we aim to classify tree heights from LiDAR data, focusing on the analysis of point clouds and raster data. By leveraging advanced data analysis techniques and machine learning algorithms, the project aims to automate the process of identifying and classifying trees based on their heights, helping to create more efficient mapping systems for forests and other green spaces.",
    # Project Overview
    "The project begins with data collection from publicly available LiDAR datasets. These datasets, such as those from NASA or national surveying bodies, contain information about the Earth's surface, including the vegetation height, which is essential for tree height classification.\n\n"
    "Using Python-based tools, including libraries like 'laspy', 'pandas', and 'numpy', we preprocess the LiDAR data to extract useful features such as the height of the trees and the surrounding topography. After preprocessing, the data is visualized using specialized libraries such as 'matplotlib' and 'plotly' to better understand the structure and distribution of the trees.",
    # Data Processing and Classification
    "The key challenge in this project lies in the effective classification of tree heights from the raw LiDAR data. This involves segmenting the point clouds to isolate the trees from the rest of the surface, followed by the application of clustering algorithms like K-Means or DBSCAN to identify individual tree structures. Once the trees are identified, the height of each tree is calculated using the difference between the highest points of the tree canopy and the ground surface.\n\n"
    "To improve accuracy, we employ machine learning models such as Random Forest and Support Vector Machines (SVM) to train classifiers that can predict the tree height based on the features extracted from the LiDAR data. The models are evaluated using metrics like accuracy, precision, recall, and F1-score to ensure the best performance.",
    # Results and Applications
    "Upon successful classification, the resulting data can be used for various applications, including:\n"
    "- Forest Management: Determining the growth rate and health of trees by monitoring changes in height over time.\n"
    "- Ecological Studies: Analyzing the relationship between tree height and biodiversity in different ecosystems.\n"
    "- Urban Planning: Identifying areas with dense vegetation for potential green spaces in urban environments.",
    # Conclusion
    "LiDAR-based tree height classification has the potential to revolutionize the way we monitor and manage forests and green spaces. By automating the analysis of LiDAR data, we can significantly reduce the time and resources required for manual classification. Moreover, the application of machine learning techniques enhances the precision and scalability of the process, making it a valuable tool for large-scale environmental studies.",
)

# Every section is laid out the same way: a 10-unit vertical gap, then the
# text wrapped across the full printable width in 12 pt Arial.
for section_text in section_texts:
    pdf.ln(10)
    pdf.set_font("Arial", size=12)
    pdf.multi_cell(0, 10, txt=section_text)

# Write the finished document to a local file ('F' destination).
pdf.output("LiDAR_Tree_Height_Classification_Project.pdf", 'F')

print("PDF generated successfully!")


PDF generated successfully!


In [None]:

!pip install transformers PyMuPDF torch scikit-learn


Collecting PyMuPDF
  Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.2-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyMuPDF
Successfully installed PyMuPDF-1.25.2


In [None]:
import fitz  # PyMuPDF

# Read the PDF produced by the generation cell. The path is relative, matching
# the name passed to pdf.output(), so it no longer depends on the working
# directory being Colab's /content.
pdf_document = "LiDAR_Tree_Height_Classification_Project.pdf"

# Open through a context manager so the file is closed even if extraction
# fails, and join the per-page texts once instead of growing a string with +=.
with fitz.open(pdf_document) as document:
    extracted_text = "".join(page.get_text() for page in document)

# Save the extracted text so later cells can reload it from disk.
with open("extracted_text.txt", "w", encoding="utf-8") as file:
    file.write(extracted_text)

print("Text extracted and saved successfully!")


Text extracted and saved successfully!


In [None]:
# Reload the text written by the extraction cell.
with open("extracted_text.txt", mode="r", encoding="utf-8") as file:
    text = file.read()

# No tokenisation or cleaning is applied at this stage; add it here if needed.


In [None]:
from transformers import BertTokenizer

# Tokenise the extracted document text with the pretrained BERT tokenizer.
# NOTE(review): truncation=True presumably cuts the text at the model's maximum
# sequence length, so most of a long document would be dropped - confirm.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer(
    text,
    padding=True,
    truncation=True,
    return_tensors='pt',
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer
from sklearn.metrics import accuracy_score

class SimpleNN(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Linear."""

    def __init__(self, input_dim, hidden_dim, output_dim):
        """Create the layers.

        Args:
            input_dim: Number of features per input sample.
            hidden_dim: Width of the hidden layer.
            output_dim: Number of output values (logits) per sample.
        """
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Apply fc1, the ReLU activation and fc2 to ``x`` and return the result."""
        return self.fc2(self.relu(self.fc1(x)))

# Prepare the data
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Toy corpus: four placeholder sentences with dummy binary labels.
texts = ["Your text data here", "Another sample text", "More sample data", "Yet another text"]
labels = [1, 0, 1, 0]  # Dummy labels for binary classification

tokens = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

# Check the shape of the tokens
print(tokens['input_ids'].shape)  # Should be torch.Size([num_samples, sequence_length])

# Seed the RNG before the model is built so the weight initialisation, and
# therefore the printed losses and accuracy, are the same on every run.
torch.manual_seed(42)

# The network takes one input feature per (padded) token position.
input_dim = tokens['input_ids'].shape[1]
hidden_dim = 128
output_dim = 2  # Binary classification, so two output classes

# Create model
model = SimpleNN(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Move model and data to the same device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
tokens = {key: val.to(device) for key, val in tokens.items()}
labels = torch.tensor(labels).to(device)

# The inputs never change, so convert them to float once rather than per epoch.
# NOTE(review): these are raw vocabulary ids used as numeric features; their
# magnitude is arbitrary, which likely explains the very large losses. Consider
# an embedding layer or input scaling.
features = tokens['input_ids'].float()

# Train the model (full-batch: all samples in a single step per epoch)
epochs = 10
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    outputs = model(features)
    loss = criterion(outputs, labels)
    loss.backward()
    optimizer.step()
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item()}')

# Evaluate the model. This scores the same samples it was trained on, so the
# number is a sanity check, not a measure of generalisation.
model.eval()
with torch.no_grad():
    predictions = model(features)
    predicted_labels = torch.argmax(predictions, dim=1)

# Calculate accuracy
predicted_labels_cpu = predicted_labels.cpu()  # Move back to CPU for evaluation
labels_cpu = labels.cpu()  # Move back to CPU for evaluation
accuracy = accuracy_score(labels_cpu, predicted_labels_cpu)
print(f'Accuracy: {accuracy}')


torch.Size([4, 6])
Epoch [1/10], Loss: 204.22879028320312
Epoch [2/10], Loss: 141.3643798828125
Epoch [3/10], Loss: 94.46873474121094
Epoch [4/10], Loss: 48.01502990722656
Epoch [5/10], Loss: 14.218053817749023
Epoch [6/10], Loss: 51.577972412109375
Epoch [7/10], Loss: 71.94116973876953
Epoch [8/10], Loss: 79.19939422607422
Epoch [9/10], Loss: 76.75910186767578
Epoch [10/10], Loss: 66.69715881347656
Accuracy: 0.75


In [None]:
pip install langchain pydantic rank-bm25 openai faiss-cpu


Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0.post1


In [None]:
!pip install langchain-community




In [None]:
!pip install pypdf

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/298.0 kB[0m [31m11.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


In [None]:
!pip install langchain-openai


Collecting langchain-openai
  Downloading langchain_openai-0.3.2-py3-none-any.whl.metadata (2.7 kB)
Collecting tiktoken<1,>=0.7 (from langchain-openai)
  Downloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Downloading langchain_openai-0.3.2-py3-none-any.whl (54 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.4/54.4 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.8.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken, langchain-openai
Successfully installed langchain-openai-0.3.2 tiktoken-0.8.0


In [None]:
import asyncio
import os
import re
from pydantic import BaseModel, Field
from typing import List
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
from langchain.llms import OpenAI

# Take the OpenAI API key from the environment and prompt for it (without
# echoing) only when it is missing. Never store a key in the notebook: the key
# that used to be hardcoded on this line is exposed and must be revoked.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Function to encode PDF
def encode_pdf(file_path: str):
    """Load the PDF at ``file_path`` with ``PyPDFLoader`` and return what it loads."""
    return PyPDFLoader(file_path).load()

# Function to encode text from string
def encode_from_string(texts: List[str], chunk_size: int = 500, chunk_overlap: int = 50):
    """Split raw strings into chunked documents.

    Args:
        texts: Strings to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter.create_documents``.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents(texts)

# Retry function with exponential backoff
async def retry_with_exponential_backoff(func, *args, initial_delay=1, max_delay=60, retries=5, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying failures with exponential backoff.

    ``func`` may be a coroutine function or a plain callable: an awaitable
    result is awaited, any other result is returned as it is.

    Args:
        func: The callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        initial_delay: Seconds to wait after the first failure.
        max_delay: Upper bound for the wait between attempts, in seconds.
        retries: Total number of attempts; must be at least 1.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The result of the first successful call.

    Raises:
        ValueError: If ``retries`` is smaller than 1 (previously this silently
            returned ``None`` without ever calling ``func``).
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    import inspect  # local import keeps this block self-contained

    if retries < 1:
        raise ValueError("retries must be at least 1")

    delay = initial_delay
    for attempt in range(retries):
        try:
            result = func(*args, **kwargs)
            if inspect.isawaitable(result):
                result = await result
            return result
        except Exception:
            # Out of attempts: bare `raise` keeps the original traceback.
            if attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay = min(delay * 2, max_delay)

# Class for BM25 and OpenAI Retrieval
class QuestionAnswerFromContext(BaseModel):
    """Holds a question and retrieves the document texts that best match it with BM25."""

    question: str = Field(description="The input question.")

    def retrieve_context_per_question(self, docs, k: int = 5) -> str:
        """Return the ``k`` best BM25 matches for the question, joined by spaces.

        Args:
            docs: Objects exposing a ``page_content`` string.
            k: Maximum number of document texts to include.

        Returns:
            The whitespace-normalised texts of the top matches in one string,
            or an empty string when ``docs`` is empty.
        """
        # BM25Okapi divides by the corpus size, so an empty corpus would raise
        # ZeroDivisionError; there is no context to return in that case.
        if not docs:
            return ""

        cleaned_texts = [re.sub(r'\s+', ' ', doc.page_content.strip()) for doc in docs]
        # Tokenisation is a plain whitespace split, so matching is
        # case-sensitive and punctuation stays attached to words.
        tokenized_texts = [text.split() for text in cleaned_texts]
        bm25 = BM25Okapi(tokenized_texts)
        tokenized_query = self.question.split()
        top_n = bm25.get_top_n(tokenized_query, cleaned_texts, n=k)
        return " ".join(top_n)

# Function to initialize and retrieve OpenAI embeddings
def get_embeddings(docs):
    """Embed ``docs`` with ``OpenAIEmbeddings`` and return a FAISS vector store built from them."""
    return FAISS.from_documents(docs, OpenAIEmbeddings())

# Function to perform question answering
async def ask_question(question: str, vectorstore, k: int = 5):
    """Answer ``question`` with an OpenAI completion model using retrieved context.

    Args:
        question: The question to answer.
        vectorstore: Store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` string.
        k: Number of nearest documents to use as context.

    Returns:
        The model's completion for the assembled prompt.
    """
    docs = vectorstore.similarity_search(question, k=k)
    context = " ".join(doc.page_content for doc in docs)
    prompt = f"Question: {question}\nContext: {context}\nAnswer:"

    # text-davinci-003 has been retired by OpenAI; gpt-3.5-turbo-instruct is
    # its replacement on the completions endpoint.
    llm = OpenAI(model_name="gpt-3.5-turbo-instruct")

    # The LLM object has no `call` attribute; `ainvoke` is its awaitable entry
    # point and takes the prompt as its first positional argument.
    return await retry_with_exponential_backoff(llm.ainvoke, prompt)

# Main function
def main():
    """Run the demo: load the PDF, index it, print the BM25 context and the LLM answer."""
    # Load the PDF and build the vector store from it.
    pdf_pages = encode_pdf("/content/LiDAR_Tree_Height_Classification_Project.pdf")
    store = get_embeddings(pdf_pages)

    # Keyword (BM25) retrieval over the same pages.
    question = "What is the tree height classification process?"
    retriever = QuestionAnswerFromContext(question=question)
    keyword_context = retriever.retrieve_context_per_question(pdf_pages)
    print("Context from BM25:")
    print(keyword_context)

    # Vector retrieval plus the OpenAI LLM.
    # NOTE(review): asyncio.run() cannot be called while an event loop is
    # already running in the same thread, which is presumably the case inside
    # a notebook kernel - confirm.
    llm_answer = asyncio.run(ask_question(question, store))
    print("Answer from OpenAI LLM:")
    print(llm_answer)


if __name__ == "__main__":
    main()


  embeddings = OpenAIEmbeddings()


RateLimitError: Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}

In [None]:
pip install sentence-transformers transformers




In [None]:
pip install -U langchain-community


Collecting langchain-community
  Downloading langchain_community-0.3.15-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain-0.3.15-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.31 (from langchain-community)
  Downloading langchain_core-0.3.31-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.7.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.0-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-

In [None]:
pip install rank-bm25




In [None]:
pip install pypdf



In [None]:
pip install nest_asyncio




In [None]:
import asyncio
import os
import re
from pydantic import BaseModel, Field
from typing import List, Dict
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from rank_bm25 import BM25Okapi
from sentence_transformers import SentenceTransformer
from transformers import pipeline
import faiss
import numpy as np
import nest_asyncio
from langchain.docstore.document import Document  # Correctly importing Document class

# Patch asyncio so that asyncio.run(), which main() below calls, can be used
# even when an event loop is already running (presumably the case in a
# notebook kernel).
nest_asyncio.apply()

# Custom SimpleDocstore class
class SimpleDocstore:
    """Minimal in-memory document store backed by a dict keyed by document id."""

    def __init__(self, docs: Dict[str, Document]):
        """Keep a reference to the id -> Document mapping (no copy is made)."""
        self.docs = docs

    def search(self, doc_id: str) -> Document:
        """Return the document stored under ``doc_id``; an unknown id raises ``KeyError``."""
        found = self.docs[doc_id]
        return found

# Function to encode PDF
def encode_pdf(file_path: str):
    """Load the PDF at ``file_path`` with ``PyPDFLoader`` and return what it loads."""
    return PyPDFLoader(file_path).load()

# Function to encode text from string
def encode_from_string(texts: List[str], chunk_size: int = 500, chunk_overlap: int = 50):
    """Split raw strings into chunked documents.

    Args:
        texts: Strings to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, in the same unit.

    Returns:
        The documents produced by ``RecursiveCharacterTextSplitter.create_documents``.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).create_documents(texts)

# Retry function with exponential backoff
async def retry_with_exponential_backoff(func, *args, initial_delay=1, max_delay=60, retries=5, **kwargs):
    """Call ``func(*args, **kwargs)``, retrying failures with exponential backoff.

    ``func`` may be a plain callable or a coroutine function: an awaitable
    result is awaited (previously it was handed back un-awaited), any other
    result is returned as it is.

    Args:
        func: The callable to invoke.
        *args: Positional arguments forwarded to ``func``.
        initial_delay: Seconds to wait after the first failure.
        max_delay: Upper bound for the wait between attempts, in seconds.
        retries: Total number of attempts; must be at least 1.
        **kwargs: Keyword arguments forwarded to ``func``.

    Returns:
        The result of the first successful call.

    Raises:
        ValueError: If ``retries`` is smaller than 1 (previously this silently
            returned ``None`` without ever calling ``func``).
        Exception: Whatever the final attempt raised, re-raised unchanged.
    """
    import inspect  # local import keeps this block self-contained

    if retries < 1:
        raise ValueError("retries must be at least 1")

    delay = initial_delay
    for attempt in range(retries):
        try:
            result = func(*args, **kwargs)
            if inspect.isawaitable(result):
                result = await result
            return result
        except Exception:
            # Out of attempts: bare `raise` keeps the original traceback.
            if attempt == retries - 1:
                raise
            await asyncio.sleep(delay)
            delay = min(delay * 2, max_delay)

# Class for BM25 Retrieval
class QuestionAnswerFromContext(BaseModel):
    """Holds a question and retrieves the document texts that best match it with BM25."""

    question: str = Field(description="The input question.")

    def retrieve_context_per_question(self, docs, k: int = 5) -> str:
        """Return the ``k`` best BM25 matches for the question, joined by spaces.

        Args:
            docs: Objects exposing a ``page_content`` string.
            k: Maximum number of document texts to include.

        Returns:
            The whitespace-normalised texts of the top matches in one string,
            or an empty string when ``docs`` is empty.
        """
        # BM25Okapi divides by the corpus size, so an empty corpus would raise
        # ZeroDivisionError; there is no context to return in that case.
        if not docs:
            return ""

        cleaned_texts = [re.sub(r'\s+', ' ', doc.page_content.strip()) for doc in docs]
        # Tokenisation is a plain whitespace split, so matching is
        # case-sensitive and punctuation stays attached to words.
        tokenized_texts = [text.split() for text in cleaned_texts]
        bm25 = BM25Okapi(tokenized_texts)
        tokenized_query = self.question.split()
        top_n = bm25.get_top_n(tokenized_query, cleaned_texts, n=k)
        return " ".join(top_n)

# Function to initialize and retrieve embeddings
def get_embeddings(docs):
    """Embed ``docs`` with a SentenceTransformer model and index them in FAISS.

    Args:
        docs: Non-empty sequence of objects exposing a ``page_content`` string.

    Returns:
        A LangChain ``FAISS`` vector store over an L2 flat index, where the
        vector at position ``i`` maps to ``docs[i]`` under the id ``str(i)``.

    Raises:
        ValueError: If ``docs`` is empty (previously this surfaced as an
            IndexError when reading the embedding dimension).
    """
    if not docs:
        raise ValueError("Cannot build a vector store from an empty document list.")

    model = SentenceTransformer('all-MiniLM-L6-v2')

    # Encode all texts in one batched call instead of one model call per
    # document; float32 is the dtype the FAISS index stores.
    embeddings = np.asarray(
        model.encode([doc.page_content for doc in docs]),
        dtype=np.float32,
    )

    # Create FAISS index sized to the embedding dimension.
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)

    # Create a simple docstore keyed by the stringified position of each doc.
    docstore = SimpleDocstore({str(i): doc for i, doc in enumerate(docs)})

    vectorstore = FAISS(
        index=index,
        docstore=docstore,
        index_to_docstore_id={i: str(i) for i in range(len(docs))},
        embedding_function=model.encode,
    )
    # Vectors are added in document order so that row i matches id str(i).
    vectorstore.index.add(embeddings)

    return vectorstore

# Function to perform question answering
async def ask_question(question: str, vectorstore, k: int = 5):
    """Answer ``question`` with an extractive QA model over retrieved context.

    Args:
        question: The question to answer.
        vectorstore: Store exposing ``similarity_search(query, k=...)`` whose
            results carry a ``page_content`` string.
        k: Number of nearest documents to use as context.

    Returns:
        The ``'answer'`` entry of the question-answering pipeline's response.
    """
    docs = vectorstore.similarity_search(question, k=k)
    context = " ".join(doc.page_content for doc in docs)

    # Name the model and revision explicitly. These are the ones the pipeline
    # was silently defaulting to, so results are unchanged, but they no longer
    # depend on the library's default and the "no model supplied" warning goes.
    qa_pipeline = pipeline(
        "question-answering",
        model="distilbert/distilbert-base-cased-distilled-squad",
        revision="564e9b5",
    )

    response = await retry_with_exponential_backoff(qa_pipeline, question=question, context=context)
    return response['answer']

# Main function
def main():
    """Run the demo: load the PDF, index it, print the BM25 context and the model's answer."""
    # Load the PDF and build the vector store from it.
    pdf_pages = encode_pdf("/content/LiDAR_Tree_Height_Classification_Project.pdf")
    store = get_embeddings(pdf_pages)

    # Keyword (BM25) retrieval over the same pages.
    question = "What is LiDAR and how is it used in geospatial data analysis?"
    retriever = QuestionAnswerFromContext(question=question)
    keyword_context = retriever.retrieve_context_per_question(pdf_pages)
    print("Context from BM25:")
    print(keyword_context)

    # Vector retrieval plus the question-answering model.
    model_answer = asyncio.run(ask_question(question, store))
    print("Answer from the model:")
    print(model_answer)


if __name__ == "__main__":
    main()


No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 564e9b5 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Context from BM25:
LiDAR-Based Tree Height Classification difference between the highest points of the tree canopy and the ground surface. To improve accuracy, we employ machine learning models such as Random Forest and Support Vector Machines (SVM) to train classifiers that can predict the tree height based on the features extracted from the LiDAR data. The models are evaluated using metrics like accuracy, precision, recall, and F1-score to ensure the best performance. Upon successful classification, the resulting data can be used for various applications, including: - Forest Management: Determining the growth rate and health of trees by monitoring changes in height over time. - Ecological Studies: Analyzing the relationship between tree height and biodiversity in different ecosystems. - Urban Planning: Identifying areas with dense vegetation for potential green spaces in urban environments. LiDAR-based tree height classification has the potential to revolutionize the way we monitor a

Device set to use cpu


Answer from the model:
high accuracy
