-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
autotagger via spacy de-core-news-lg, BM25Okapi, FastAPI
- Loading branch information
Showing
7 changed files
with
192 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import pathlib | ||
from fastapi import FastAPI, Form, Request | ||
from fastapi.templating import Jinja2Templates | ||
from fastapi.staticfiles import StaticFiles | ||
from fastapi.responses import HTMLResponse | ||
import searching | ||
|
||
# Application object and resource locations. The template and the static
# directories are both resolved relative to this file, so they are found
# regardless of the working directory the server is started from.
app = FastAPI()
BASE_DIR = pathlib.Path(__file__).parent
templates = Jinja2Templates(directory=[BASE_DIR / "templates"])
app.mount("/static", StaticFiles(directory=BASE_DIR / "static"), name="static")
|
||
@app.get('/', response_class=HTMLResponse)
async def index(request: Request, query: str | None = None):
    """Render the search page, with ranked matches when a query is given.

    Without a ``query`` parameter the page is rendered with an empty search
    box and an empty result list.
    """
    if query is None:
        matches = []
    else:
        # Split the query on whitespace and clean the words before scoring.
        words = searching.clean_tags(query.split())
        matches = list(searching.find_top_documents(words))
    context = {
        "request": request,
        "results": matches,
        "query": query or '',
    }
    return templates.TemplateResponse("index.html", context)
|
||
if __name__ == "__main__":
    # Development entry point: open the search page in the default browser,
    # then start a reloading uvicorn server for this module's ``app``.
    import webbrowser

    import uvicorn

    webbrowser.open('http://127.0.0.1:8000')
    # NOTE(review): host 0.0.0.0 listens on all network interfaces, while the
    # browser is pointed at the loopback address - confirm this is intended.
    uvicorn.run("app:app", host="0.0.0.0", port=8000, reload=True)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import json | ||
import pathlib | ||
import docx | ||
import PyPDF2 | ||
import spacy | ||
import tqdm | ||
|
||
# Root directory of the source documents (relative to the working directory).
DATA_DIR = pathlib.Path(r"../data/unclean")
# spaCy pipeline used to extract noun chunks and named entities.
NLP = spacy.load("de_core_news_lg")
|
||
def iter_texts(data_dir=DATA_DIR):
    """Yield ``(path, text)`` for every readable .docx and .pdf document.

    ``data_dir`` is searched recursively; the suffix is matched
    case-insensitively and files with any other suffix are ignored. For
    .docx files the paragraph texts, and for .pdf files the page texts, are
    joined with blank lines.

    A document that cannot be parsed is skipped with a logged warning, so one
    broken file does not abort the whole run.
    """
    import logging

    logger = logging.getLogger(__name__)
    for path in data_dir.glob("**/*.*"):
        suffix = path.suffix.lower()
        try:
            if suffix == '.docx':
                doc = docx.Document(str(path))
                text = '\n\n'.join(para.text for para in doc.paragraphs)
            elif suffix == '.pdf':
                with path.open("rb") as fp:
                    pdf = PyPDF2.PdfReader(fp)
                    text = '\n\n'.join(page.extract_text() for page in pdf.pages)
            else:
                continue
        except Exception:
            # Best effort: report the failure and carry on with the next file.
            logger.warning("Skipping unreadable document %s", path, exc_info=True)
            continue
        # Yield outside the try block so that exceptions thrown into the
        # generator (including GeneratorExit on close) are never swallowed.
        yield path, text
|
||
def create_tags(text):
    """Run the spaCy pipeline over ``text`` and collect its tags.

    The tags are the distinct texts of all noun chunks and named entities
    found in the document. Returns a dict holding the original ``text`` and
    the list of ``tags`` (in no particular order).
    """
    analysed = NLP(text)
    from_chunks = {chunk.text for chunk in analysed.noun_chunks}
    from_entities = {entity.text for entity in analysed.ents}
    return {'text': text, 'tags': list(from_chunks | from_entities)}
|
||
print("Create tags...")
# Key every document by its path relative to DATA_DIR rather than by its bare
# file name, so that equally named files in different sub-directories do not
# overwrite each other. Files directly inside DATA_DIR keep their plain name.
data = {
    path.relative_to(DATA_DIR).as_posix(): create_tags(text)
    for path, text in tqdm.tqdm(iter_texts(), unit=" documents")
}
pathlib.Path("data.json").write_text(json.dumps(data), encoding='utf-8')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
de-core-news-lg==3.7.0 | ||
fastapi==0.110.0 | ||
numpy==1.26.4 | ||
rank_bm25==0.2.2 | ||
PyPDF2==3.0.1 | ||
python-docx==1.1.0 | ||
python-multipart==0.0.9 | ||
spacy==3.7.4 | ||
tqdm==4.66.2 | ||
uvicorn==0.29.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import pathlib | ||
import json | ||
import sys | ||
import textwrap | ||
|
||
import numpy | ||
from rank_bm25 import BM25Okapi | ||
|
||
# Default upper bound on the number of documents reported per query.
MAXN = 10
# Default score threshold; lower-scoring documents are not reported.
MINSCORE = 0.1

# Tagged corpus: maps a document name to a dict with its ``text`` and ``tags``.
with open("data.json", encoding="utf-8") as _data_file:
    DATA = json.load(_data_file)
|
||
def clean_tags(tags):
    """Return the tags stripped of surrounding whitespace and lower-cased."""
    cleaned = []
    for raw in tags:
        cleaned.append(raw.strip().lower())
    return cleaned
|
||
def find_top_documents(query, maxn=MAXN, minscore=MINSCORE):
    """Yield the best BM25 matches for ``query`` among the tagged documents.

    Args:
        query: The cleaned query words (see ``clean_tags``).
        maxn: Maximum number of matches to yield.
        minscore: Matches scoring below this value are not yielded.

    Yields:
        Dicts with the keys ``filename``, ``excerpt`` (the document text
        shortened to 100 characters), ``tags`` and ``score`` (the BM25 score
        times 100, truncated to an int), ordered from best to worst.
    """
    documents = list(DATA.items())
    if not documents:
        # Nothing to rank; also avoids building BM25Okapi on an empty corpus.
        return
    scorer = BM25Okapi([clean_tags(doc['tags']) for _, doc in documents])
    scores = scorer.get_scores(query)
    # Honour the caller-supplied limits rather than the module-level defaults.
    top_docs = numpy.argsort(scores)[::-1][:maxn]
    for i in top_docs:
        score = scores[i]
        if score < minscore:
            # Scores are visited in descending order, so no later one qualifies.
            break
        filename, doc = documents[i]
        yield {
            'filename': filename,
            'excerpt': textwrap.shorten(doc['text'], 100),
            'tags': doc['tags'],
            'score': int(100 * score),
        }
|
||
if __name__ == '__main__':
    # Command-line usage: every argument is treated as one query word.
    cli_words = clean_tags(sys.argv[1:])
    for hit in find_top_documents(cli_words):
        print(f"{hit['filename']} (Score: {hit['score']})")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
body{ | ||
background-color: rgb(255, 255, 200); | ||
} | ||
h1{ | ||
text-align: center; | ||
} | ||
form{ | ||
margin-bottom: 30px; | ||
text-align:center; | ||
} | ||
.container{ | ||
margin-left: auto; | ||
margin-right: auto; | ||
max-width: 60%; | ||
} | ||
.index{ | ||
min-height: 400px; | ||
} | ||
.score{ | ||
float: right; | ||
font-size: 12px; | ||
margin-right: 5px; | ||
} | ||
.card{ | ||
display: block; | ||
box-shadow: 0px 0px 15px #ccc; | ||
margin-left: auto; | ||
margin-right: auto; | ||
padding: 15px; | ||
margin-bottom: 25px; | ||
} | ||
.card a{ | ||
text-decoration: none; | ||
padding: 2px; | ||
background-color: teal; | ||
color: white; | ||
font-size: 36px; | ||
} | ||
.card small{ | ||
margin-top: auto; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
<!DOCTYPE html> | ||
<html lang="en"> | ||
<head> | ||
<meta charset="UTF-8"> | ||
<meta name="viewport" content="width=device-width, initial-scale=1.0"> | ||
<title>OpenLegalLab Know How Search</title> | ||
<link rel="stylesheet" href="/static/style.css"> | ||
</head> | ||
<body> | ||
<div class="container"> | ||
{% block core_header%} | ||
{% endblock core_header%} | ||
<br> | ||
{% block core_body%} | ||
{% endblock core_body%} | ||
<br> | ||
{% block core_footer%} | ||
{% endblock core_footer%} | ||
</div> | ||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{% extends 'core.html' %} | ||
{% block core_body%} | ||
<div class="index"> | ||
<form method="get" action="/"> | ||
<input type="text" id="query" name="query" value="{{query}}"> | ||
<input class="submit" type="submit" value="Search"> | ||
</form> | ||
|
||
{% for result in results %} | ||
<div class="card"> | ||
<div> | ||
<b>{{result.filename}}</b> | ||
<i class="score">Score {{result.score}}</i> | ||
</div> | ||
<small>{{result.excerpt}}</small> | ||
</div> | ||
{% endfor %} | ||
</div> | ||
{% endblock core_body%} |