Skip to content

Commit

Permalink
Merge branch 'master' of github.com:SimonEff/Know-how-Management-OLL
Browse files Browse the repository at this point in the history
  • Loading branch information
mauruswollensak committed Mar 25, 2024
2 parents 7a77be8 + eddd20e commit d945cc4
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 20 deletions.
17 changes: 12 additions & 5 deletions autotagger/create_tags.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,18 +10,25 @@

def iter_texts(data_dir=DATA_DIR):
    """Yield ``(path, name, text)`` triples for the documents under *data_dir*.

    Every path matching ``**/*.*`` below *data_dir* is visited.  For a
    ``.docx`` file each paragraph is yielded as its own entry named
    ``"Abschnitt <n>, <file name>"``; for a ``.pdf`` file each page is yielded
    as ``"Seite <n>, <file name>"`` (numbering starts at 1).  After the parts
    of a file, one more entry named after the file itself is yielded whose
    text is all extracted parts joined by blank lines.  Files with any other
    suffix, and files from which nothing could be extracted, yield nothing.

    Reading is best effort: if a file fails to parse, a warning is logged,
    the parts extracted so far are kept, and iteration moves on to the next
    file.
    """
    import logging

    for path in data_dir.glob("**/*.*"):
        parts = []
        try:
            if path.suffix == '.docx':
                doc = docx.Document(str(path))
                for i, para in enumerate(doc.paragraphs, start=1):
                    part = para.text
                    parts.append(part)
                    yield path, f"Abschnitt {i}, {path.name}", part
            elif path.suffix == '.pdf':
                with path.open("rb") as fp:
                    pdf = PyPDF2.PdfReader(fp)
                    for i, page in enumerate(pdf.pages, start=1):
                        # Defensive: keeps the join below (and downstream
                        # consumers) safe should extract_text() ever hand
                        # back None instead of a string.
                        part = page.extract_text() or ""
                        parts.append(part)
                        yield path, f"Seite {i}, {path.name}", part
        except Exception as exc:
            # Deliberately not a bare ``except``: the ``yield``s above sit
            # inside this ``try``, so a bare clause would also swallow
            # GeneratorExit (raised when the consumer closes the generator)
            # and KeyboardInterrupt.
            logging.getLogger(__name__).warning(
                "Could not read %s completely: %s", path, exc
            )
        if parts:
            # Whole-document entry, keyed by the plain file name.
            yield path, path.name, '\n\n'.join(parts)

def create_tags(text):
nlp_doc = NLP(text)
Expand All @@ -31,5 +38,5 @@ def create_tags(text):
return {'text': text, 'tags': tags}

print("Create tags...")
# iter_texts() yields (path, name, text) triples; results are keyed by the
# display name (per-part label or plain file name), so the path is unused here.
# NOTE(review): two files with the same name in different sub-directories
# would overwrite each other's entries - confirm this is acceptable.
data = {name: create_tags(text) for _path, name, text in tqdm.tqdm(iter_texts(), unit=" documents")}
pathlib.Path("data.json").write_text(json.dumps(data), encoding='utf-8')
10 changes: 5 additions & 5 deletions autotagger/searching.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,22 +16,22 @@ def clean_tags(tags):

def find_top_documents(query, maxn=MAXN, minscore=MINSCORE):
    """Yield the best BM25 matches for *query*, highest score first.

    Args:
        query: Query terms handed to the BM25 scorer.  The command-line entry
            point of this module passes them through ``clean_tags`` first.
        maxn: Maximum number of matches to yield.
        minscore: Lower bound on the raw BM25 score.  Candidates are visited
            in descending score order, so iteration stops at the first one
            scoring below this value.

    Yields:
        One dict per match with the keys ``name`` (the entry's key in
        ``DATA``), ``filename`` (same value, kept for older consumers),
        ``excerpt`` (the entry's text shortened to at most 200 characters),
        ``tags`` and ``score`` (the raw score times 100, truncated to an int).
    """
    # Presumably DATA maps entry names to {'text': ..., 'tags': ...} dicts as
    # written to data.json by create_tags.py - its loading is not shown here.
    documents = list(DATA.items())
    if not documents:
        # Nothing to rank; avoids building a scorer over an empty corpus.
        return
    scorer = BM25Okapi([clean_tags(doc['tags']) for _, doc in documents])
    scores = scorer.get_scores(query)
    # Honour the caller's limits instead of the module-level defaults.
    top_docs = numpy.argsort(scores)[::-1][:maxn]
    for i in top_docs:
        name, doc = documents[i]
        score = scores[i]
        if score < minscore:
            break
        yield {
            'name': name,
            'filename': name,  # alias for consumers still reading 'filename'
            'excerpt': textwrap.shorten(doc['text'], 200),
            'tags': doc['tags'],
            'score': int(100*score),
        }

if __name__ == '__main__':
    # Command-line use: every argument is a query term; terms are normalised
    # with clean_tags before being passed to the search.
    query = clean_tags(sys.argv[1:])
    for match in find_top_documents(query):
        # One line per match (previously each match was printed twice).
        print(f"{match['name']} (Score: {match['score']})")
31 changes: 23 additions & 8 deletions autotagger/static/style.css
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
body{
background-color: rgb(255, 255, 200);
background-color: #C4DFE6;
}
h1{
text-align: center;
Expand All @@ -8,6 +8,9 @@ form{
margin-bottom: 30px;
text-align:center;
}
#query{
width: 40%;
}
.container{
margin-left: auto;
margin-right: auto;
Expand All @@ -23,19 +26,31 @@ form{
}
.card{
display: block;
box-shadow: 0px 0px 15px #ccc;
border-radius: 10px;
background-image: linear-gradient(to right bottom, #86b4cd, #96c5dd, #86b4cd);
box-shadow: 2px 2px 7px #ccc;
margin-left: auto;
margin-right: auto;
padding: 15px;
margin-bottom: 25px;
}
.card a{
text-decoration: none;
padding: 2px;
background-color: teal;
color: white;
font-size: 36px;
.card b{
font-family:Georgia, 'Times New Roman', Times, serif;
color: #162d3d;
text-overflow: ellipsis;
overflow: hidden;
white-space: nowrap;
max-width: 90%;
margin-bottom: 10px;
display: inline-block;
}
.card i{
font-family:Impact, Haettenschweiler, 'Arial Narrow Bold', sans-serif;
color: #466c8d;
}
.card small{
margin-top: auto;
margin-left: 15px;
font-family:'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
color: #36617d;
}
4 changes: 2 additions & 2 deletions autotagger/templates/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
{% for result in results %}
<div class="card">
<div>
<b>{{result.filename}}</b>
<i class="score">Score {{result.score}}</i>
<i class="score">Score {{result.score}}</i>
<b>{{result.name}}</b>
</div>
<small>{{result.excerpt}}</small>
</div>
Expand Down

0 comments on commit d945cc4

Please sign in to comment.