autotagger via spacy de-core-news-lg, BM25Okapi, FastAPI

SimonEff · Mar 24, 2024 · d113e3a · d113e3a
1 parent 4de4412
commit d113e3a
Show file tree

Hide file tree

Showing 7 changed files with 192 additions and 0 deletions.
diff --git a/autotagger/app.py b/autotagger/app.py
@@ -0,0 +1,52 @@
+"""FastAPI front end for the tag-based document search."""
+import pathlib
+
+from fastapi import FastAPI, Form, Request
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse
+
+import searching
+
+app = FastAPI()
+BASE_DIR = pathlib.Path(__file__).parent
+templates = Jinja2Templates(directory=[BASE_DIR / "templates"])
+# Resolve the static directory relative to this file, like the templates,
+# so the assets are found regardless of the current working directory.
+app.mount("/static", StaticFiles(directory=BASE_DIR / "static"), name="static")
+
+
+@app.get('/', response_class=HTMLResponse)
+def index(request: Request, query: str | None = None):
+    """Render the search page, including results when a query is given.
+
+    This is a plain ``def`` rather than ``async def``: the search is blocking
+    work that never awaits, so FastAPI runs the handler in its thread pool
+    instead of stalling the event loop.
+
+    Args:
+        request: Incoming request; passed to the template context.
+        query: Whitespace-separated search words from the ``query`` URL
+            parameter, or ``None`` when the page is opened without one.
+
+    Returns:
+        The rendered ``index.html`` template response.
+    """
+    results = []
+    if query is not None:
+        query_words = searching.clean_tags(query.split())
+        results = list(searching.find_top_documents(query_words))
+    return templates.TemplateResponse("index.html", {
+        "request": request,
+        "results": results,
+        "query": query or '',
+    })
+
+
+if __name__ == "__main__":
+    import uvicorn
+    import webbrowser
+    # NOTE(review): the browser is opened before the server is listening, and
+    # host "0.0.0.0" accepts connections on every interface; confirm both.
+    webbrowser.open('http://127.0.0.1:8000')
+    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
diff --git a/autotagger/create_tags.py b/autotagger/create_tags.py
@@ -0,0 +1,63 @@
+"""Extract text from .docx/.pdf files and store spaCy-derived tags in data.json."""
+import json
+import pathlib
+
+import docx
+import PyPDF2
+import spacy
+import tqdm
+
+NLP = spacy.load("de_core_news_lg")
+DATA_DIR = pathlib.Path(r"../data/unclean")
+
+
+def iter_texts(data_dir=DATA_DIR):
+    """Yield ``(path, text)`` for each readable .docx or .pdf below *data_dir*.
+
+    Files with any other suffix are ignored. A file that fails to parse is
+    reported and skipped so one broken document does not abort the run.
+    """
+    for path in data_dir.glob("**/*.*"):
+        # Compare case-insensitively so e.g. "REPORT.PDF" is not ignored.
+        suffix = path.suffix.lower()
+        try:
+            if suffix == '.docx':
+                doc = docx.Document(str(path))
+                text = '\n\n'.join(para.text for para in doc.paragraphs)
+            elif suffix == '.pdf':
+                with path.open("rb") as fp:
+                    pdf = PyPDF2.PdfReader(fp)
+                    text = '\n\n'.join(page.extract_text() for page in pdf.pages)
+            else:
+                continue
+        except Exception as exc:
+            # Catch Exception, not everything: a bare except also swallows
+            # KeyboardInterrupt, making the run impossible to cancel.
+            tqdm.tqdm.write(f"Skipping {path}: {exc!r}")
+            continue
+        # Yield outside the try block so errors raised by the consumer are
+        # never mistaken for parse failures.
+        yield path, text
+
+
+def create_tags(text):
+    """Return a dict holding *text* and the tags extracted from it.
+
+    The tags are the union of the noun chunks and named entities found by
+    the spaCy pipeline, sorted so repeated runs produce identical output.
+    """
+    nlp_doc = NLP(text)
+    noun_phrases = {chunk.text for chunk in nlp_doc.noun_chunks}
+    named_entities = {ent.text for ent in nlp_doc.ents}
+    tags = sorted(noun_phrases | named_entities)
+    return {'text': text, 'tags': tags}
+
+
+print("Create tags...")
+# The scan is recursive, so bare file names can collide across folders and
+# silently overwrite each other; key by the path relative to DATA_DIR.
+data = {
+    path.relative_to(DATA_DIR).as_posix(): create_tags(text)
+    for path, text in tqdm.tqdm(iter_texts(), unit=" documents")
+}
+pathlib.Path("data.json").write_text(json.dumps(data), encoding='utf-8')
diff --git a/autotagger/requirements.txt b/autotagger/requirements.txt
@@ -0,0 +1,14 @@
+# spaCy model packages are not published on PyPI, so the model is installed
+# from its release wheel to keep `pip install -r requirements.txt` working.
+de-core-news-lg @ https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.7.0/de_core_news_lg-3.7.0-py3-none-any.whl
+fastapi==0.110.0
+# Needed by fastapi.templating.Jinja2Templates; fastapi does not install it.
+Jinja2==3.1.3
+numpy==1.26.4
+rank_bm25==0.2.2
+PyPDF2==3.0.1
+python-docx==1.1.0
+python-multipart==0.0.9
+spacy==3.7.4
+tqdm==4.66.2
+uvicorn==0.29.0
diff --git a/autotagger/searching.py b/autotagger/searching.py
@@ -0,0 +1,75 @@
+"""BM25 search over the document tags stored in data.json."""
+import functools
+import pathlib
+import json
+import sys
+import textwrap
+
+import numpy
+from rank_bm25 import BM25Okapi
+
+MAXN = 10
+MINSCORE = 0.1
+
+DATA = json.loads(pathlib.Path("data.json").read_text(encoding="utf-8"))
+
+
+def clean_tags(tags):
+    """Return *tags* with surrounding whitespace removed and lower-cased."""
+    return [tag.strip().lower() for tag in tags]
+
+
+@functools.lru_cache(maxsize=1)
+def _build_index():
+    """Return ``(documents, scorer)`` for DATA, computed once and reused.
+
+    DATA is loaded a single time at import, so rebuilding the BM25 index on
+    every query is wasted work. The scorer is ``None`` for an empty corpus,
+    which BM25Okapi cannot handle (it divides by the corpus size).
+    """
+    documents = list(DATA.items())
+    if not documents:
+        return documents, None
+    # NOTE(review): every tag is indexed as one token, so a multi-word tag
+    # only matches a query item equal to the whole phrase; confirm intended.
+    scorer = BM25Okapi([clean_tags(doc['tags']) for _, doc in documents])
+    return documents, scorer
+
+
+def find_top_documents(query, maxn=MAXN, minscore=MINSCORE):
+    """Yield the best matches for *query*, highest score first.
+
+    Args:
+        query: List of cleaned query tokens (see ``clean_tags``).
+        maxn: Maximum number of documents to yield.
+        minscore: Documents scoring below this value are not yielded.
+
+    Yields:
+        Dicts with the keys ``filename``, ``excerpt`` (text shortened to
+        100 characters), ``tags`` and ``score`` (BM25 score times 100,
+        truncated to an int).
+    """
+    documents, scorer = _build_index()
+    if scorer is None:
+        return
+    scores = scorer.get_scores(query)
+    # Honour the caller's limits; the module defaults are only fallbacks.
+    top_docs = numpy.argsort(scores)[::-1][:maxn]
+    for i in top_docs:
+        score = scores[i]
+        if score < minscore:
+            # Scores are sorted in descending order, so no later one passes.
+            break
+        filename, doc = documents[i]
+        yield {
+            'filename': filename,
+            'excerpt': textwrap.shorten(doc['text'], 100),
+            'tags': doc['tags'],
+            'score': int(100*score),
+        }
+
+
+if __name__ == '__main__':
+    query = clean_tags(sys.argv[1:])
+    for match in find_top_documents(query):
+        print(f"{match['filename']} (Score: {match['score']})")
diff --git a/autotagger/static/style.css b/autotagger/static/style.css
@@ -0,0 +1,53 @@
+/* Page-wide look: pale yellow background, centred heading and form. */
+body {
+    background-color: #ffffc8;
+}
+
+h1 {
+    text-align: center;
+}
+
+form {
+    margin-bottom: 30px;
+    text-align: center;
+}
+
+/* Horizontally centred column, at most 60% of the available width. */
+.container {
+    margin-left: auto;
+    margin-right: auto;
+    max-width: 60%;
+}
+
+.index {
+    min-height: 400px;
+}
+
+/* Small label floated to the right-hand side. */
+.score {
+    float: right;
+    font-size: 12px;
+    margin-right: 5px;
+}
+
+/* Padded box with a soft grey shadow. */
+.card {
+    display: block;
+    box-shadow: 0 0 15px #ccc;
+    margin-left: auto;
+    margin-right: auto;
+    padding: 15px;
+    margin-bottom: 25px;
+}
+
+.card a {
+    text-decoration: none;
+    padding: 2px;
+    background-color: teal;
+    color: white;
+    font-size: 36px;
+}
+
+.card small {
+    margin-top: auto;
+}
diff --git a/autotagger/templates/core.html b/autotagger/templates/core.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>OpenLegalLab Know How Search</title>
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <div class="container">{# the three blocks below are empty unless a child template overrides them #}
+        {% block core_header %}
+        {% endblock core_header %}
+        <br>
+        {% block core_body %}
+        {% endblock core_body %}
+        <br>
+        {% block core_footer %}
+        {% endblock core_footer %}
+    </div>
+</body>
+</html>
diff --git a/autotagger/templates/index.html b/autotagger/templates/index.html
@@ -0,0 +1,23 @@
+{% extends 'core.html' %}
+{% block core_body %}
+{# Search form followed by one card per result. #}
+<div class="index">
+    <form method="get" action="/">
+        <input type="text" id="query" name="query" value="{{ query }}" aria-label="Search query">
+        <input class="submit" type="submit" value="Search">
+    </form>
+
+    {% for result in results %}
+    <div class="card">
+        <div>
+            <b>{{ result.filename }}</b>
+            <i class="score">Score {{ result.score }}</i>
+        </div>
+        <small>{{ result.excerpt }}</small>
+    </div>
+    {% else %}
+    {# Only after an actual search: say that nothing matched instead of showing a blank page. #}
+    {% if query %}<p>No matching documents found.</p>{% endif %}
+    {% endfor %}
+</div>
+{% endblock core_body %}