In [2]:
pip install torch transformers scikit-learn pdfplumber pillow librosa

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
Downloading pdfplumber-0.11.4-py3-none-any.whl (59 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.2/59.2 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pdfminer.six-20231228-py3-none-any.whl (5.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pypdfium2-4.30.0-py3-non

In [3]:
import os
import pdfplumber
import librosa
from PIL import Image
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
import torch

# Pre-trained text-embedding checkpoint (e.g., Sentence-BERT); the tokenizer
# and the encoder are both loaded from this one identifier.
_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)

# Category label -> free-text description that gets embedded as the
# reference vector for that category.
categories = dict(
    Finance="Files related to financial statements, budgets, invoices, and transactions.",
    Health="Medical records, health summaries, lab results, and similar documents.",
    Education="Academic papers, coursework, research documents, and study materials.",
)

# Category label -> embedding vector; starts empty and is filled by
# compute_category_embeddings().
category_embeddings = dict()

def compute_category_embeddings():
    """Rebuild the module-level ``category_embeddings`` cache from ``categories``.

    Every description in ``categories`` is embedded with ``get_text_embedding``
    so that category vectors and file vectors are always produced by the exact
    same pipeline.

    The cache dict is updated in place (its identity is preserved) and its
    previous contents are discarded, so a category that was removed from
    ``categories`` since the last call can no longer be returned by the
    classifier.

    Returns:
        None. The result is stored in ``category_embeddings``.
    """
    # Embed everything first so a failure part-way through leaves the existing
    # cache untouched instead of half-updated.
    fresh = {name: get_text_embedding(desc) for name, desc in categories.items()}
    category_embeddings.clear()
    category_embeddings.update(fresh)

# Extract text from PDF files
def extract_text_from_pdf(file_path):
    """Return the extractable text of a PDF, one page after another.

    Args:
        file_path: Path of the PDF, passed straight to ``pdfplumber.open``.

    Returns:
        str: The text of all pages that yielded any text, joined with a
        newline. Pages for which ``extract_text()`` returns nothing are
        skipped; the result is ``""`` when no page has text.
    """
    with pdfplumber.open(file_path) as pdf:
        # extract_text() may return None, so normalise that to "".
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    # Separate pages explicitly: plain concatenation glues the last word of one
    # page to the first word of the next, corrupting the tokens that are later
    # embedded.
    return "\n".join(page_text for page_text in page_texts if page_text)

# Placeholder: Extract text from images (can integrate OCR tools like pytesseract)
def extract_text_from_image(file_path):
    """Stand-in for OCR: return a fixed string regardless of the image.

    Args:
        file_path: Path of the image. Currently unused; the file is never read.

    Returns:
        str: Always the same placeholder sentence.
    """
    placeholder_text = "Extracted text from image"
    return placeholder_text

# Extract features from audio files
def extract_features_from_audio(file_path):
    """Summarise an audio file as the mean of its MFCC matrix along axis 1.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        The result of ``mfcc.mean(axis=1)`` on the matrix returned by
        ``librosa.feature.mfcc`` (presumably one averaged value per MFCC
        coefficient; confirm against the librosa documentation).
    """
    # sr=None is forwarded to librosa.load; the sampling rate it reports back
    # is the one handed to the MFCC computation.
    waveform, sample_rate = librosa.load(file_path, sr=None)
    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=sample_rate)
    return mfcc_matrix.mean(axis=1)

# Generate text embeddings
def get_text_embedding(text):
    """Embed a piece of text as a single vector via mean pooling.

    The text is tokenized with truncation enabled, run through the module-level
    ``model`` without gradient tracking, and the last hidden states are
    averaged over dimension 1.

    Args:
        text: The text to embed, passed to the module-level ``tokenizer``.

    Returns:
        numpy.ndarray: The pooled hidden state with size-1 dimensions squeezed
        out.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True)
    with torch.no_grad():
        # Inference only: no autograd graph is needed for the forward pass.
        model_output = model(**encoded)
        pooled = model_output.last_hidden_state.mean(dim=1)
    vector = pooled.squeeze()
    return vector.numpy()

# Classify file based on similarity to category embeddings
def classify_file(file_embedding):
    """Return the category whose embedding is most cosine-similar to the file's.

    Args:
        file_embedding: 1-D vector (anything supporting ``len``) describing
            the file. It must have the same length as the vectors stored in
            ``category_embeddings``.

    Returns:
        str: The name of the best-matching category. On ties, the category
        that comes first in ``category_embeddings`` wins.

    Raises:
        ValueError: If ``category_embeddings`` is empty, or if the length of
            ``file_embedding`` differs from that of a category embedding.
    """
    if not category_embeddings:
        # Without this guard the failure is max()'s opaque "empty sequence" error.
        raise ValueError(
            "No category embeddings available; call compute_category_embeddings() first."
        )

    names = list(category_embeddings)
    references = [category_embeddings[name] for name in names]

    # Vectors from different feature spaces (e.g. audio features vs. text
    # embeddings) cannot be compared; report that clearly instead of relying on
    # a low-level shape error from the similarity routine.
    for name, reference in zip(names, references):
        if len(reference) != len(file_embedding):
            raise ValueError(
                f"Embedding dimension mismatch: file embedding has "
                f"{len(file_embedding)} values but category '{name}' has "
                f"{len(reference)}."
            )

    # One call scores the file against every category: row 0 holds one
    # similarity per category, in the same order as `names`.
    scores = cosine_similarity([file_embedding], references)[0]
    best_index = max(range(len(names)), key=lambda index: scores[index])
    return names[best_index]

# Main function to classify any file
def classify_any_file(file_path):
    """Classify a file into one of the known categories based on its type.

    The file type is decided from the file name's suffix, compared
    case-insensitively:

    * ``.pdf``: text is extracted and embedded.
    * ``.jpg`` / ``.jpeg`` / ``.png``: text comes from
      ``extract_text_from_image`` and is embedded.
    * ``.mp3``: the vector from ``extract_features_from_audio`` is used as-is.

    Args:
        file_path: ``str`` or ``os.PathLike`` path of the file to classify.

    Returns:
        Whatever ``classify_file`` returns for the file's embedding (the name
        of the best-matching category).

    Raises:
        ValueError: If the suffix is not supported, or if the audio feature
            vector does not have the same length as the category embeddings.
    """
    # Normalise once so "REPORT.PDF" and Path objects are handled like "report.pdf".
    suffix_source = os.fspath(file_path).lower()

    if suffix_source.endswith(".pdf"):
        file_embedding = get_text_embedding(extract_text_from_pdf(file_path))
    elif suffix_source.endswith((".jpg", ".png", ".jpeg")):
        file_embedding = get_text_embedding(extract_text_from_image(file_path))
    elif suffix_source.endswith(".mp3"):
        # Raw audio features are used as the embedding for now.
        file_embedding = extract_features_from_audio(file_path)
        # These features are not produced by the text encoder, so they are only
        # comparable when the category embeddings have the same length. Fail
        # with an explicit message rather than a shape error deep inside the
        # similarity computation.
        incompatible = [
            name
            for name, reference in category_embeddings.items()
            if len(reference) != len(file_embedding)
        ]
        if incompatible:
            raise ValueError(
                f"Audio features have {len(file_embedding)} values and cannot be "
                f"compared with the embeddings of categories: "
                f"{', '.join(incompatible)}."
            )
    else:
        raise ValueError("Unsupported file type!")

    # Classify the file
    return classify_file(file_embedding)

# Example usage: classify a single document and report the result.
if __name__ == "__main__":
    # The category vectors only need to be built once, before any
    # classification call.
    compute_category_embeddings()

    # File to classify. Replace with the actual file path.
    file_path = "/content/ln_internal_med_final.pdf"
    try:
        category = classify_any_file(file_path)
        print(f"File '{file_path}' is classified as: {category}")
    except ValueError as error:
        # A ValueError (e.g. an unsupported file type) is reported as a plain
        # message instead of a traceback.
        print(error)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

File '/content/ln_internal_med_final.pdf' is classified as: Health
