In [1]:
from top2vec import Top2Vec
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
data = pd.read_csv("data/new_data.csv")
data.head()

Unnamed: 0,description
0,40% of the work scope could only be completed ...
1,The compound license agreement expired and the...
2,Very poor detailing on AFCs and lack of any ow...
3,Lack of communication between CRE and Site Tea...
4,Adequate supervision was provided for the init...


In [4]:
docs = data.description.tolist()

In [5]:
model = Top2Vec(docs, min_count=20)

2025-01-05 09:47:46,697 - top2vec - INFO - Pre-processing documents for training
2025-01-05 09:47:47,251 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model
2025-01-05 09:47:48,791 - top2vec - INFO - Creating joint document/word embedding
2025-01-05 09:49:44,060 - top2vec - INFO - Creating lower dimension embedding of documents
2025-01-05 09:50:06,464 - top2vec - INFO - Finding dense areas of documents
2025-01-05 09:50:06,584 - top2vec - INFO - Finding topics


experiment with the model created

In [6]:
# get the topics identified by the model
model.get_num_topics()

topic_sizes, topic_nums = model.get_topic_sizes()

In [7]:
# the topic sizes gives the number of documents in each topic
print(topic_sizes) 

[610 269 247 236 216 203 186 154 146 144 132 122 107 104 102  82  78  76
  75  68  61  59  50  47  37  27]


In [8]:
# topic nums shows the number of topics identified in the document
print(topic_nums)

[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25]


In [9]:
#get the word groups in each topic
topic_words, word_scores, topic_nums = model.get_topics(10)


In [10]:
for words, scores, num in zip(topic_words, word_scores, topic_nums):
    print(num)
    print(f'Words: {words}')

0
Words: ['construction' 'scaffolding' 'designs' 'design' 'building' 'excavation'
 'buildability' 'surveying' 'contractors' 'workshop' 'contractor'
 'fabrication' 'build' 'detailing' 'inspections' 'designers' 'designed'
 'specification' 'constructed' 'inspection' 'easement' 'repairs'
 'steelwork' 'procurement' 'development' 'foundations' 'projects'
 'parapets' 'concrete' 'built' 'commissioning' 'parapet' 'engineering'
 'compliant' 'refurbishment' 'brickwork' 'maintenance' 'planning'
 'develop' 'stakeholders' 'requirements' 'footbridge' 'demolition'
 'designer' 'drawings' 'methodology' 'structural' 'drilling' 'engineer'
 'manufacturing']
1
Words: ['jms' 'management' 'managers' 'manage' 'tm' 'stakeholders'
 'commissioning' 'manager' 'cdm' 'jv' 'contractors' 'maintenance'
 'workshop' 'cem' 'resourcing' 'briefings' 'clients' 'practices'
 'inspections' 'staff' 'procurement' 'personnel' 'employees' 'development'
 'workforce' 'companies' 'facilities' 'projects' 'briefing' 'contractor'
 'produ

In [11]:
documents, document_scores, document_ids = model.search_documents_by_topic(topic_num=1, num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Document: 821, Score: 0.8476945161819458
-----------
Utilise JMS for resources to reduce reliance on agency labour. ■ Need proactive management of bricklayers and joiners, no continuity ■ Doc controller sacked half way through and no replacement. Use JMS operatives and safety critical operatives where ever possible.
-----------

Document: 2364, Score: 0.825724720954895
-----------
Good communication between all parties prior to works commencing each week. Pre-Start meetings held on a weekly basis. Ensure Pre-Start meetings are held each week to discuss the upcoming works. Good management during possessions from JMS. The full shift was utilised, and JMS ES/COSS was accommodating to works. Good communication between JMS, Subcontractor and Design with any issues that arose on site (i.e. drilling locations). Good flexibility. Surveys findings where submitted to JMS in good timing once completed. Having all parties involved and communicating with one another from early on in the project wor

In [12]:
documents, document_scores, document_ids = model.search_documents_by_keywords(keywords=["cost", "delays", "procurement"], num_docs=5)
for doc, score, doc_id in zip(documents, document_scores, document_ids):
    print(f"Document: {doc_id}, Score: {score}")
    print("-----------")
    print(doc)
    print("-----------")
    print()

Document: 1172, Score: 0.6029401771407675
-----------
Programme effectively started in delay due to need for additional procurement time for large packages(scaffolding and roofing) as a result of minimal contractors with the ability to complete the works and cost. Despite the initial delay an EoT of 8 weeks is agreed in principal with no penalties. When obtaining prices from contractors at tender stage consideration needs to be given on their ability and appetite to execute the works in the current competitive climate. Effective communication with the client and recording of delays and variations enabled agreement.
-----------

Document: 3280, Score: 0.5718628552811948
-----------
Deliveries - delays from suppliers due to payments being held.

-----------

Document: 1116, Score: 0.554788049494889
-----------
Procurement. Location of works. Late procurement delayed an already tight programme. Only avoided being penalised due to client inadequacies in other areas. Due to distance of work

In [13]:
import textwrap

# Example usage of query_documents with text wrapping
query = "delivery delays of procured materials during project"
num_docs = 5

# Assuming the model and query_documents function are defined
documents, doc_scores, doc_ids = model.query_documents(query, num_docs)

print("Most Relevant Documents:\n")
for i, (ids,doc, score) in enumerate(zip(doc_ids,documents, doc_scores)):
    print(f"Score: {score:.4f}\n")
    print(f"Document {ids}:")
    print("\n".join(textwrap.wrap(doc, width=80)))  # Wrap the text to 80 characters per line
    print("-" * 50)


Most Relevant Documents:

Score: 0.6632

Document 3378:
Storage of materials – Better coordination of when materials are required and
when to request them to be brought to the project – procurement schedule
required??
--------------------------------------------------
Score: 0.6561

Document 1172:
Programme effectively started in delay due to need for additional procurement
time for large packages(scaffolding and roofing) as a result of minimal
contractors with the ability to complete the works and cost. Despite the initial
delay an EoT of 8 weeks is agreed in principal with no penalties. When obtaining
prices from contractors at tender stage consideration needs to be given on their
ability and appetite to execute the works in the current competitive climate.
Effective communication with the client and recording of delays and variations
enabled agreement.
--------------------------------------------------
Score: 0.6330

Document 1819:
Identification & Delivery of Materials Late deliver

In [14]:
model.save("data/topic_model")

In [15]:
new_model = Top2Vec.load("data/topic_model")

In [16]:
import textwrap

# Example usage of query_documents with text wrapping
query = "delivery delays of procured materials during project"
num_docs = 5

# Assuming the model and query_documents function are defined
documents, doc_scores, doc_ids = new_model.query_documents(query, num_docs)

print("Most Relevant Documents:\n")
for i, (ids,doc, score) in enumerate(zip(doc_ids,documents, doc_scores)):
    print(f"Score: {score:.4f}\n")
    print(f"Document {ids}:")
    print("\n".join(textwrap.wrap(doc, width=80)))  # Wrap the text to 80 characters per line
    print("-" * 50)


2025-01-05 10:10:13,051 - top2vec - INFO - Downloading all-MiniLM-L6-v2 model


Most Relevant Documents:

Score: 0.6632

Document 3378:
Storage of materials – Better coordination of when materials are required and
when to request them to be brought to the project – procurement schedule
required??
--------------------------------------------------
Score: 0.6561

Document 1172:
Programme effectively started in delay due to need for additional procurement
time for large packages(scaffolding and roofing) as a result of minimal
contractors with the ability to complete the works and cost. Despite the initial
delay an EoT of 8 weeks is agreed in principal with no penalties. When obtaining
prices from contractors at tender stage consideration needs to be given on their
ability and appetite to execute the works in the current competitive climate.
Effective communication with the client and recording of delays and variations
enabled agreement.
--------------------------------------------------
Score: 0.6330

Document 1819:
Identification & Delivery of Materials Late deliver