In [1]:
# Step 1: Install required libraries
!pip install -q bibtexparser python-docx transformers accelerate sentencepiece

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/55.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for bibtexparser (setup.py) ... [?25l[?25hdone


In [2]:
# Step 2: Mount Google Drive if not already mounted
import os
from google.colab import drive
# Mount Google Drive only when the mount directory is not present yet.
drive_mount_point = '/content/drive/'
drive_already_present = os.path.exists(drive_mount_point)
if not drive_already_present:
    drive.mount(drive_mount_point)

Mounted at /content/drive/


In [None]:
# Step 3: Provide a Hugging Face token for Gemma (only needed for the gated Gemma model)
# The token is read interactively so that no credential is ever stored in the notebook,
# and HF_TOKEN is only set when a token was actually supplied: a placeholder value
# would be sent as a real (invalid) credential.
from getpass import getpass

if not os.environ.get("HF_TOKEN"):
    _hf_token = getpass("Hugging Face token (press Enter to skip): ").strip()
    if _hf_token:
        os.environ["HF_TOKEN"] = _hf_token
# # Alternatively, run this to input token interactively:
# # !huggingface-cli login

In [None]:
# Step 4: Load all .bib files from a folder
import glob
import bibtexparser

# Specify your folder path (change to your actual path)
folder_path = "/content/drive/MyDrive/Bibtex_Folder"
# glob returns files in arbitrary order; sorting keeps the entry order (and therefore
# the order of equally scored duplicates) reproducible between runs.
bib_paths = sorted(glob.glob(os.path.join(folder_path, "*.bib")))
print(f"Found {len(bib_paths)} .bib files")

Found 7 .bib files


In [5]:
# Step 5: Extract texts (title + abstract) from .bib entries
def load_bib_texts(bib_paths):
    """Parse every BibTeX file in `bib_paths` and return all entries in one list.

    Each entry dict is annotated in place with two helper keys:
      * '_text'        -- "<title>. <abstract>" with newlines replaced by spaces and
                          surrounding whitespace stripped (missing fields count as '').
      * '_source_file' -- base name of the .bib file the entry was read from.

    Files are opened as UTF-8 and undecodable bytes are ignored.
    """
    def _one_line(value):
        # Collapse line breaks inside a field and trim its ends.
        return value.replace('\n', ' ').strip()

    collected = []
    for bib_path in bib_paths:
        with open(bib_path, encoding='utf-8', errors='ignore') as handle:
            database = bibtexparser.load(handle)
        for record in database.entries:
            heading = _one_line(record.get('title', ''))
            summary = _one_line(record.get('abstract', ''))
            record['_text'] = f"{heading}. {summary}".strip()
            record['_source_file'] = os.path.basename(bib_path)
            collected.append(record)
    return collected

# Parse all discovered .bib files and report how many records were collected.
entries = load_bib_texts(bib_paths)
entry_total = len(entries)
print(f"Loaded {entry_total} entries")

Loaded 2049 entries


In [6]:
# Step 6: Build the query string from the Word document outline
# The three fragments (title, covered tasks, keywords) are joined with single spaces.
_query_parts = [
    "Forest Inventory Using UAV-RGB Images and Segmentation Methods: Systematic Review of Convolutional Neural Networks",
    "for Tree Delineation, Species Classification, Individual Tree Detection, Instance Segmentation, Vitality Assessment.",
    "Keywords: forest tree crown canopy Convolutional Neural Network deep learning UAV drone RGB",
]
query_text = " ".join(_query_parts)
print("Using query:", query_text)

Using query: Forest Inventory Using UAV-RGB Images and Segmentation Methods: Systematic Review of Convolutional Neural Networks for Tree Delineation, Species Classification, Individual Tree Detection, Instance Segmentation, Vitality Assessment. Keywords: forest tree crown canopy Convolutional Neural Network deep learning UAV drone RGB


In [None]:
# Step 7: Load embedding model (choose one by uncommenting)
import torch
from transformers import AutoTokenizer, AutoModel
import numpy as np
from tqdm import tqdm
import torch.nn.functional as F

def _load_encoder(name, token=None):
    """Load the tokenizer and model called `name`, set eval mode, move to GPU if present."""
    tokenizer = AutoTokenizer.from_pretrained(name, token=token)
    model = AutoModel.from_pretrained(name, token=token)
    model.eval()
    if torch.cuda.is_available():
        model.to("cuda")
    return tokenizer, model


def load_model(model_name="google/embeddinggemma-300m",
               fallback_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Load the embedding model, falling back to a second model if the first fails.

    Args:
        model_name: Model to try first.
            Option 1 (default): "google/embeddinggemma-300m" (requires HF token).
            Option 2: "sentence-transformers/all-MiniLM-L6-v2" (no token needed).
        fallback_name: Model loaded when `model_name` cannot be loaded.

    Returns:
        (tokenizer, model, name) where `name` is the model that was actually loaded,
        i.e. `fallback_name` when the fallback was used.

    Raises:
        Whatever loading the fallback raises; only the failure of the first model
        is caught.
    """
    try:
        # The token comes from the HF_TOKEN environment variable (None if unset).
        tokenizer, model = _load_encoder(model_name, token=os.environ.get("HF_TOKEN"))
    except Exception as e:
        # Deliberate best effort: report the failure and continue with the fallback.
        print(f"Failed to load {model_name}: {e}")
        print(f"Falling back to {fallback_name}")
        tokenizer, model = _load_encoder(fallback_name)
        return tokenizer, model, fallback_name
    print(f"Loaded model: {model_name}")
    return tokenizer, model, model_name

# Bind the loaded tokenizer/model as notebook globals; `model_name` is the name that
# load_model() reports as actually loaded (the fallback name if the fallback was used).
tokenizer, model, model_name = load_model()

In [8]:
# Step 8: Function for generating embeddings with mean pooling
def embed_texts(texts, tokenizer, model, batch_size=32, max_length=512):
    """Embed `texts` with attention-mask-weighted mean pooling and L2 normalization.

    Args:
        texts: Sequence of strings to embed.
        tokenizer: Callable that tokenizes a batch and returns a mapping of tensors
            containing at least 'attention_mask'.
        model: Model whose output exposes `last_hidden_state`
            with shape (batch, seq_len, hidden_size).
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        np.ndarray with one L2-normalized row per text. For empty `texts` an array
        with zero rows is returned (width taken from `model.config.hidden_size`
        when available, otherwise 0) instead of failing inside `np.vstack`.
    """
    if len(texts) == 0:
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0) or 0
        return np.empty((0, hidden_size), dtype=np.float32)

    # Send inputs to wherever the model actually lives. Checking only
    # torch.cuda.is_available() breaks when a GPU exists but the model is on the CPU.
    parameters = getattr(model, "parameters", None)
    first_param = next(iter(parameters()), None) if callable(parameters) else None
    device = first_param.device if first_param is not None else torch.device("cpu")

    all_embs = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i + batch_size]
        inputs = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=max_length)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
            last_hidden = outputs.last_hidden_state  # (batch, seq_len, hidden_size)
        # Mean pooling: positions with attention_mask == 0 contribute nothing.
        mask = inputs['attention_mask'].unsqueeze(-1).expand(last_hidden.size()).float()
        summed = torch.sum(last_hidden * mask, 1)
        # Clamp so a row with an all-zero mask does not divide by zero.
        summed_mask = torch.clamp(mask.sum(1), min=1e-9)
        mean_pooled = summed / summed_mask
        # Normalize each row to unit length.
        normalized = F.normalize(mean_pooled, p=2, dim=1)
        all_embs.append(normalized.cpu().numpy())
    return np.vstack(all_embs)

In [9]:
# Step 9: Embed the documents (entries with non-empty text only) and the query
texts = list(filter(None, (entry['_text'] for entry in entries)))
doc_embeddings = embed_texts(texts, tokenizer, model)
# embed_texts returns one row per input; the query is the single row of its result.
query_matrix = embed_texts([query_text], tokenizer, model)
query_emb = query_matrix[0]

100%|██████████| 65/65 [10:41<00:00,  9.87s/it]
100%|██████████| 1/1 [00:00<00:00, 14.54it/s]


In [10]:
# Step 10: Compute cosine similarities
from sklearn.metrics.pairwise import cosine_similarity

# Step 9 embeds only the entries whose '_text' is non-empty, so scores have to be
# paired with that same filtered list. Zipping against the full `entries` list would
# shift scores onto the wrong papers (and leave trailing entries without '_score')
# as soon as a single entry is filtered out.
embedded_entries = [entry for entry in entries if entry['_text']]
if len(embedded_entries) != len(doc_embeddings):
    raise ValueError(
        f"Have {len(doc_embeddings)} embeddings for {len(embedded_entries)} entries with text; "
        "re-run the embedding step before scoring."
    )

similarities = cosine_similarity([query_emb], doc_embeddings)[0]

# Every entry gets a score so the sorting and export below never hit a missing key;
# entries that were not embedded keep a neutral 0.0.
for entry in entries:
    entry['_score'] = 0.0
for entry, sim in zip(embedded_entries, similarities):
    entry['_score'] = float(sim)

# Sort entries by score descending
sorted_entries = sorted(entries, key=lambda x: x['_score'], reverse=True)

# Print top 20 for preview
for i, entry in enumerate(sorted_entries[:20]):
    print(f"{i+1}. Title: {entry.get('title', '')[:100]}... Score: {entry['_score']:.4f} Source: {entry['_source_file']}")

1. Title: {Individual tree species identification using dense convolutional network (Densenet) on multitempora... Score: 0.8268 Source: Dr.Sohrabi-Review01-ScopusArticles.bib
2. Title: {Individual tree species identification using dense convolutional network (Densenet) on multitempora... Score: 0.8268 Source: Dr.Sohrabi-Review01-Related.bib
3. Title: {Explainable identification and mapping of trees using UAV RGB image and deep learning}... Score: 0.8076 Source: Dr.Sohrabi-Individual Tree Detection.bib
4. Title: {Mapping the Distribution of High-Value Broadleaf Tree Crowns through Unmanned Aerial Vehicle Image ... Score: 0.8015 Source: Dr.Sohrabi-Review01-ScopusArticles.bib
5. Title: {Mapping the Distribution of High-Value Broadleaf Tree Crowns through Unmanned Aerial Vehicle Image ... Score: 0.8015 Source: Dr.Sohrabi-Review01-MaybeRelated.bib
6. Title: {Assessment of CNN-based methods for individual tree detection on images captured by RGB cameras att... Score: 0.7987 Source: Dr.Sohrab

In [None]:
# Step 11: Classify and save outputs
import pandas as pd
from bibtexparser.bwriter import BibTexWriter
from bibtexparser.bibdatabase import BibDatabase
from copy import deepcopy

threshold = 0.50  # Adjust for higher recall (e.g., 0.40) or precision (e.g., 0.65)

# One variable for the destination so the confirmation message reports the file
# that was actually written.
# NOTE(review): Colab usually exposes Drive content under /content/drive/MyDrive --
# confirm that this location is writable in your runtime.
csv_path = '/content/drive/classified_articles.csv'

# Create DataFrame for CSV; explicit columns keep the header even with no entries.
df = pd.DataFrame(
    [
        {
            'id': entry.get('ID', ''),
            'title': entry.get('title', ''),
            'abstract': entry.get('abstract', ''),
            'score': entry['_score'],
            'source_file': entry['_source_file'],
            'category': 'related' if entry['_score'] >= threshold else 'unrelated'
        } for entry in entries
    ],
    columns=['id', 'title', 'abstract', 'score', 'source_file', 'category'],
)
df.to_csv(csv_path, index=False)
print(f"Saved CSV: {csv_path}")

# Save related and unrelated .bib files, stripping the internal helper fields (_score, _text, _source_file)
def save_bib(entries_list, output_path):
    """Write `entries_list` to `output_path` as a UTF-8 BibTeX file.

    The entries are deep-copied first, so the caller's dicts are left untouched, and
    the notebook's helper keys ('_score', '_text', '_source_file') are dropped from
    the copies before writing ('_score' is a float, not a string).
    """
    helper_keys = ('_score', '_text', '_source_file')

    def _export_copy(record):
        # Work on a private copy so the original entry keeps its helper keys.
        clone = deepcopy(record)
        for key in helper_keys:
            clone.pop(key, None)
        return clone

    database = BibDatabase()
    database.entries = [_export_copy(record) for record in entries_list]
    bib_writer = BibTexWriter()
    with open(output_path, 'w', encoding='utf-8') as handle:
        handle.write(bib_writer.write(database))

# Split in a single pass with the same rule the CSV 'category' column uses
# (score >= threshold -> related, everything else -> unrelated). With two separate
# filters, an entry whose score is NaN would land in neither file.
related, unrelated = [], []
for entry in entries:
    (related if entry['_score'] >= threshold else unrelated).append(entry)

# Keep the destinations in variables so the message reports the real paths.
related_path = '/content/drive/related.bib'
unrelated_path = '/content/drive/unrelated.bib'

save_bib(related, related_path)
save_bib(unrelated, unrelated_path)
print(f"Saved {related_path} and {unrelated_path}")
print(len(entries))

Saved CSV: /Review-EmbeddingGemma/classified_articles.csv
Saved /Review-EmbeddingGemma/related.bib and /Review-EmbeddingGemma/unrelated.bib
2049
