Skip to content

Commit

Permalink
autotagger via spacy de-core-news-lg, BM25Okapi, FastAPI
Browse files Browse the repository at this point in the history
  • Loading branch information
petsuter committed Mar 24, 2024
1 parent 4de4412 commit d113e3a
Show file tree
Hide file tree
Showing 7 changed files with 192 additions and 0 deletions.
29 changes: 29 additions & 0 deletions autotagger/app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pathlib
from fastapi import FastAPI, Form, Request
from fastapi.templating import Jinja2Templates
from fastapi.staticfiles import StaticFiles
from fastapi.responses import HTMLResponse
import searching

# ASGI application object; the route decorators below and the "app:app"
# import string passed to uvicorn both refer to it.
app = FastAPI()
# Directory containing this file, so resource lookups do not depend on the
# directory the process was started from.
BASE_DIR = pathlib.Path(__file__).parent
templates = Jinja2Templates(directory=[BASE_DIR / "templates"])
# Resolve the static directory relative to this file, consistent with the
# templates above, instead of relative to the current working directory.
app.mount("/static", StaticFiles(directory=BASE_DIR / "static"), name="static")

@app.get('/', response_class=HTMLResponse)
async def index(request: Request, query: str | None = None):
    """Render the search page, including results when a query was submitted.

    ``query`` is the optional ``query`` URL parameter. When it is absent the
    page is rendered with an empty result list; otherwise the query is split
    on whitespace, normalised with ``searching.clean_tags`` and passed to
    ``searching.find_top_documents``.
    """
    if query is None:
        results = []
    else:
        terms = searching.clean_tags(query.split())
        results = [*searching.find_top_documents(terms)]

    context = {
        "request": request,
        "results": results,
        # Echo the query back into the form; fall back to an empty string.
        "query": query or '',
    }
    return templates.TemplateResponse("index.html", context)

if __name__ == "__main__":
    # Development entry point: open the page in a browser and start the server.
    import webbrowser

    import uvicorn

    # The browser is opened first because uvicorn.run() does not return while
    # the server is running.
    webbrowser.open('http://127.0.0.1:8000')
    server_options = {"host": "0.0.0.0", "port": 8000, "reload": True}
    uvicorn.run("app:app", **server_options)
35 changes: 35 additions & 0 deletions autotagger/create_tags.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import json
import pathlib
import docx
import PyPDF2
import spacy
import tqdm

# spaCy pipeline used by create_tags(); loaded once when the module is imported.
NLP = spacy.load("de_core_news_lg")
# Default directory scanned by iter_texts(); relative to the current working
# directory.
DATA_DIR = pathlib.Path(r"../data/unclean")

def iter_texts(data_dir=DATA_DIR):
    """Yield ``(path, text)`` for every .docx and .pdf file below *data_dir*.

    The directory is searched recursively. File suffixes are matched
    case-insensitively, and files with any other suffix are ignored.
    A file whose text cannot be extracted is reported and skipped, so one
    broken document does not abort the whole run.
    """
    for path in data_dir.glob("**/*.*"):
        suffix = path.suffix.lower()
        try:
            if suffix == '.docx':
                doc = docx.Document(str(path))
                text = '\n\n'.join(para.text for para in doc.paragraphs)
            elif suffix == '.pdf':
                with path.open("rb") as fp:
                    pdf = PyPDF2.PdfReader(fp)
                    text = '\n\n'.join(page.extract_text() for page in pdf.pages)
            else:
                continue
        except Exception as exc:
            # Best-effort extraction: skip unreadable files, but say so.
            # Catching Exception (not a bare except) lets KeyboardInterrupt
            # and GeneratorExit propagate.
            tqdm.tqdm.write(f"Skipping {path}: {exc!r}")
            continue
        # Yield outside the try block so that exceptions raised by the
        # consumer are never mistaken for extraction failures.
        yield path, text

def create_tags(text):
    """Return ``{'text': text, 'tags': [...]}`` for one document.

    The tags are the distinct texts of the noun chunks and named entities
    that the spaCy pipeline ``NLP`` finds in *text*, in sorted order.
    """
    nlp_doc = NLP(text)
    noun_phrases = {chunk.text for chunk in nlp_doc.noun_chunks}
    named_entities = {ent.text for ent in nlp_doc.ents}
    # Sorted so the output is reproducible: the iteration order of a set of
    # strings can differ from one interpreter run to the next.
    tags = sorted(noun_phrases | named_entities)
    return {'text': text, 'tags': tags}

# Script body: tag every document found by iter_texts() and store the result,
# keyed by file name, in data.json in the current working directory.
print("Create tags...")
data = {}
for path, text in tqdm.tqdm(iter_texts(), unit=" documents"):
    data[path.name] = create_tags(text)
serialized = json.dumps(data)
pathlib.Path("data.json").write_text(serialized, encoding='utf-8')
10 changes: 10 additions & 0 deletions autotagger/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# NOTE(review): spaCy model packages such as de-core-news-lg are normally
# installed with `python -m spacy download de_core_news_lg` or from a release
# URL rather than from PyPI - confirm that this pin resolves with plain pip.
de-core-news-lg==3.7.0
fastapi==0.110.0
numpy==1.26.4
rank_bm25==0.2.2
PyPDF2==3.0.1
python-docx==1.1.0
python-multipart==0.0.9
spacy==3.7.4
tqdm==4.66.2
uvicorn==0.29.0
37 changes: 37 additions & 0 deletions autotagger/searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import pathlib
import json
import sys
import textwrap

import numpy
from rank_bm25 import BM25Okapi

# Maximum number of search results returned per query.
MAXN = 10
# Results scoring below this BM25 score are dropped.
MINSCORE = 0.1

# Document index, read once at import time from data.json in the current
# working directory. find_top_documents() treats it as a mapping from file
# name to a dict with 'text' and 'tags' entries.
DATA = json.loads(pathlib.Path("data.json").read_text(encoding="utf-8"))

def clean_tags(tags):
    """Return a list of *tags* with surrounding whitespace removed and lower-cased."""
    cleaned = []
    for raw_tag in tags:
        cleaned.append(raw_tag.strip().lower())
    return cleaned

def find_top_documents(query, maxn=MAXN, minscore=MINSCORE):
    """Yield the best-matching documents for *query*, best match first.

    Documents from ``DATA`` are ranked with BM25 over their cleaned tags.

    Args:
        query: List of already cleaned query terms (see ``clean_tags``).
        maxn: Maximum number of results to yield.
        minscore: Results with a BM25 score below this value are dropped.

    Yields:
        Dicts with the keys ``'filename'``, ``'excerpt'`` (document text
        shortened to at most 100 characters), ``'tags'`` and ``'score'``
        (BM25 score times 100, truncated to an int).
    """
    documents = list(DATA.items())
    if not documents:
        # Nothing to rank; this also avoids handing BM25Okapi an empty corpus.
        return
    scorer = BM25Okapi([clean_tags(doc['tags']) for _, doc in documents])
    scores = scorer.get_scores(query)
    # argsort sorts ascending: reverse for best-first, then honour the
    # caller's limit (previously the module constant was used instead).
    top_docs = numpy.argsort(scores)[::-1][:maxn]
    for i in top_docs:
        score = scores[i]
        if score < minscore:
            # Scores are in descending order, so no later document qualifies.
            break
        filename, doc = documents[i]
        yield {
            'filename': filename,
            'excerpt': textwrap.shorten(doc['text'], 100),
            'tags': doc['tags'],
            'score': int(100*score),
        }

if __name__ == '__main__':
    # Command-line usage: every argument is treated as one query term.
    raw_terms = sys.argv[1:]
    terms = clean_tags(raw_terms)
    for match in find_top_documents(terms):
        print(f"{match['filename']} (Score: {match['score']})")
41 changes: 41 additions & 0 deletions autotagger/static/style.css
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/* Stylesheet for the search page; linked from templates/core.html. */
body{
background-color: rgb(255, 255, 200);
}
h1{
text-align: center;
}
/* Search form: centred, with space before the result cards. */
form{
margin-bottom: 30px;
text-align:center;
}
/* Horizontally centred page wrapper (the div in core.html). */
.container{
margin-left: auto;
margin-right: auto;
max-width: 60%;
}
.index{
min-height: 400px;
}
/* Score label, floated to the right of the card's title row. */
.score{
float: right;
font-size: 12px;
margin-right: 5px;
}
/* One search result. */
.card{
display: block;
box-shadow: 0px 0px 15px #ccc;
margin-left: auto;
margin-right: auto;
padding: 15px;
margin-bottom: 25px;
}
.card a{
text-decoration: none;
padding: 2px;
background-color: teal;
color: white;
font-size: 36px;
}
.card small{
margin-top: auto;
}
21 changes: 21 additions & 0 deletions autotagger/templates/core.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<!DOCTYPE html>
<!-- Base layout. Child templates extend this file and override the
     core_header, core_body and core_footer blocks, which are empty here. -->
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>OpenLegalLab Know How Search</title>
<link rel="stylesheet" href="/static/style.css">
</head>
<body>
<div class="container">
{% block core_header%}
{% endblock core_header%}
<br>
{% block core_body%}
{% endblock core_body%}
<br>
{% block core_footer%}
{% endblock core_footer%}
</div>
</body>
</html>
19 changes: 19 additions & 0 deletions autotagger/templates/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
{% extends 'core.html' %}
{% block core_body%}
<!-- Search page: a GET form that submits "query" to "/", followed by one
     card per entry in "results". -->
<div class="index">
<form method="get" action="/">
<input type="text" id="query" name="query" value="{{query}}">
<input class="submit" type="submit" value="Search">
</form>

<!-- Each result provides filename, score and excerpt. -->
{% for result in results %}
<div class="card">
<div>
<b>{{result.filename}}</b>
<i class="score">Score {{result.score}}</i>
</div>
<small>{{result.excerpt}}</small>
</div>
{% endfor %}
</div>
{% endblock core_body%}

0 comments on commit d113e3a

Please sign in to comment.