In [None]:
!pip install pyLDAvis sentence_transformers keybert

Collecting pyLDAvis
  Using cached pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting keybert
  Downloading keybert-0.9.0-py3-none-any.whl.metadata (15 kB)
Collecting funcy (from pyLDAvis)
  Using cached funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence_transformers)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from firebase_admin import firestore, credentials
import firebase_admin

# Service-account key read from the mounted Drive.
# NOTE(review): the key path is hard-coded; consider moving it to configuration.
cred = credentials.Certificate('/content/drive/MyDrive/itpe_exams/exam-matching-firebase-adminsdk-hdm65-231e235b83.json')

# initialize_app() raises ValueError when the default app already exists, which
# happens every time this cell is re-run in the same kernel. Reuse the existing
# app in that case instead of failing.
try:
    firebase_admin.get_app()
except ValueError:
    firebase_admin.initialize_app(cred)

db = firestore.client()

# **Job description Preparation**

In [None]:
job_description = """Partnering with business users to gather requirements to formulate business specifications and translating them into business functionalities.
To participate in cross-functional projects related to procurement transformation, digitalization, and strategic sourcing to support organizational goals.
Documenting workflows and results of business processes, analysis and obtaining sign-off from users on specifications.
Designing and executing test plans to ensure that business requirements and functional specifications are implemented properly and meet objectives.
Need to translate application functionality into application architecture and the production of a business functionality requirement document.
Configure and customize procurement & expense systems such as Rydoo, Concur, Coupa, Oracle Fusion, SAP Ariba, Tipalti with ERP systems and supplier management tools, to meet business needs and optimize procurement processes.
Conducts meetings on all levels of management for purposes of presentation reviews, approvals, recommendations.
Experienced in SDLC process including requirements gathering, analysis and system design.
Works with IT Operations, Other IT BizApps groups and business users as necessary.
Develops, edits, and maintains SOPs and standards manuals.
Control and Monitor system performance, security, standards, compliances and data integrity with procurement policies, guidelines, procedures, and regulatory requirements, and implement corrective actions as needed.
Conducts training sessions to implement new or improved systems and procedures.
Act as a functional specialist in translating business requirements into technical requirements and contribute in solution design.
Constantly looking for ways to transform the BizApps, simplify the workflows / processes.
Interpersonal skills, to help negotiate and to resolve conflicts among project stakeholders, and internal team members"""

# **Save file**

In [None]:
import pandas as pd

def save_file(result, file_name, output_dir='/content/drive/MyDrive/itpe_exams/results/IT Business/Senior IT Business Applications Engineer (Amagi)'):
  """Write the selected questions to a CSV file.

  The CSV has a 'questions' column followed by three empty columns named
  '1', '0' and '-1'.

  Args:
    result: iterable of question strings, one per row.
    file_name: suffix of the file name; the file is written as
      ``business-<file_name>.csv``.
    output_dir: folder that receives the file. Defaults to the Drive folder
      this notebook has always written to; it is created when missing.

  Returns:
    The path of the CSV file that was written.
  """
  import os

  # to_csv() does not create missing parent folders, so make sure it exists.
  os.makedirs(output_dir, exist_ok=True)

  df = pd.DataFrame({'questions': list(result)})
  for label in ('1', '0', '-1'):
    df[label] = ''

  path = os.path.join(output_dir, f'business-{file_name}.csv')
  df.to_csv(path, index=False)
  return path

# **KeyAlign: KeyBert + Clustering**

In [None]:
import re

# Flatten the job description into one line of space-separated words.
# Note: this overwrites `job_description` in place, so the newline-separated
# original is no longer available after this cell has run.
job_description = job_description.replace('\n', ' ')
for pattern in (r'\d+', r'[()\/\+,.]', r'\s+'):
    # digits, then selected punctuation, then whitespace runs -> single space
    job_description = re.sub(pattern, ' ', job_description)
job_description = job_description.strip()
job_description

'Partnering with business users to gather requirements to formulate business specifications and translating them into business functionalities To participate in cross-functional projects related to procurement transformation digitalization and strategic sourcing to support organizational goals Documenting workflows and results of business processes analysis and obtaining sign-off from users on specifications Designing and executing test plans to ensure that business requirements and functional specifications are implemented properly and meet objectives Need to translate application functionality into application architecture and the production of a business functionality requirement document Configure and customize procurement & expense systems such as Rydoo Concur Coupa Oracle Fusion SAP Ariba Tipalti with ERP systems and supplier management tools to meet business needs and optimize procurement processes Conducts meetings on all levels of management for purposes of presentation review

In [None]:
from keybert import KeyBERT

# Scale the number of extracted phrases with the description length
# (30% of the word count), clamped to the range [5, 15].
jd = job_description.split()
top_n = min(15, max(5, int(0.3 * len(jd))))

kw_model = KeyBERT()
# KeyBERT only applies `diversity` when Maximal Marginal Relevance is enabled,
# so use_mmr=True is needed for diversity=0.7 to have any effect. Without it
# the result is the plain top-n list, which is dominated by near-duplicates.
keywords = kw_model.extract_keywords(job_description,
                                     keyphrase_ngram_range=(2,2),
                                     highlight=True,
                                     stop_words='english',
                                     use_mmr=True,
                                     diversity=0.7,
                                     top_n=top_n)
keywords

[('requirements technical', 0.5873),
 ('requirements implement', 0.5805),
 ('requirements gathering', 0.5519),
 ('business requirements', 0.5398),
 ('requirements functional', 0.5384),
 ('procurement processes', 0.5295),
 ('business specifications', 0.529),
 ('gather requirements', 0.5267),
 ('requirements formulate', 0.5187),
 ('related procurement', 0.5179),
 ('procurement transformation', 0.4999),
 ('business functionality', 0.4928),
 ('business processes', 0.4848),
 ('customize procurement', 0.4812),
 ('systems procedures', 0.4771)]

In [None]:
# Keep only the phrase of each (phrase, score) pair for the first top_n entries.
filtered_keywords = [phrase for phrase, _score in keywords[:top_n]]
filtered_keywords

['requirements technical',
 'requirements implement',
 'requirements gathering',
 'business requirements',
 'requirements functional',
 'procurement processes',
 'business specifications',
 'gather requirements',
 'requirements formulate',
 'related procurement',
 'procurement transformation',
 'business functionality',
 'business processes',
 'customize procurement',
 'systems procedures']

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model and encode every selected key phrase.
# convert_to_tensor=True asks encode() for a tensor instead of a NumPy array.
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(filtered_keywords, convert_to_tensor=True)

In [None]:
len(embeddings)

15

In [None]:
# Collect the document id of every question cluster stored in Firestore.
clusters = db.collection('Cluster_Test2').stream()
clusters_id = []
for snapshot in clusters:
  clusters_id.append(snapshot.id)
clusters_id

['3KENaUxsMffy66kIeyRo',
 '4bvU9CcZQZW1AvdQsW0q',
 'DaigQLrCFTQIaJkHugk8',
 'JyAOgqXMsixrbCx8Ef8P',
 'MZkhebzSpKiJ6363slZj',
 'Yf3bxcNBrgnhT7iwmOll',
 'bgpS1q8VFaBEhjURqrbq',
 'hQ9SdnyeWRWty5p8KbUB',
 'jDuobXqOoilPjoV4m6hK',
 'nZpOEwlCeuLMMzo36Wao',
 'tsDEsmDFauRhUrSXZJur']

In [None]:
# Read the stored 'Embedding vector' field of each cluster, in clusters_id order.
embeddings_list = []
for cluster_doc_id in clusters_id:
  cluster_snapshot = db.collection('Cluster_Test2').document(cluster_doc_id).get()
  embeddings_list.append(cluster_snapshot.to_dict().get('Embedding vector'))

In [None]:
len(embeddings_list)

11

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

most_clusters_dict = {}
n = 20  # total number of questions to select across all matched clusters

# encode(..., convert_to_tensor=True) may return tensors that live on the GPU;
# NumPy cannot read device memory, so copy each row to the host first.
keyword_matrix = np.vstack([
    np.asarray(vector.cpu() if hasattr(vector, 'cpu') else vector).reshape(1, -1)
    for vector in embeddings
])
cluster_matrix = np.vstack([np.asarray(vector).reshape(1, -1) for vector in embeddings_list])

# similarity_matrix[i][j] is the cosine similarity of keyword i and cluster j.
similarity_matrix = cosine_similarity(keyword_matrix, cluster_matrix)

# Assign every keyword to its most similar question cluster.
for keyword_similarities in similarity_matrix:
  most_similar_index = int(np.argmax(keyword_similarities))
  cluster_id = clusters_id[most_similar_index]

  if cluster_id not in most_clusters_dict:
    most_clusters_dict[cluster_id] = {"count": 0, "vector": [], "total_similarity": 0.0}

  most_clusters_dict[cluster_id]['count'] += 1
  most_clusters_dict[cluster_id]['vector'] = embeddings_list[most_similar_index]
  most_clusters_dict[cluster_id]["total_similarity"] += float(keyword_similarities[most_similar_index])

# Weights are relative to the keywords that were actually assigned above.
total_keywords = len(similarity_matrix)
total_numbers = 0

for cluster_id, data in most_clusters_dict.items():
  weight = data["count"] / total_keywords
  data["weight"] = weight

  # Every matched cluster contributes at least one question.
  number = max(1, round(weight * n))
  data["number"] = number
  total_numbers += number

# Rounding (and the minimum of one) can make the total differ from n.
adjustment = n - total_numbers
sorted_clusters = sorted(most_clusters_dict.items(), key=lambda x: x[1]["weight"], reverse=True)

remaining = adjustment
position = 0
while remaining > 0:
  # Hand out the missing questions round-robin, heaviest cluster first.
  cluster_id = sorted_clusters[position % len(sorted_clusters)][0]
  most_clusters_dict[cluster_id]["number"] += 1
  remaining -= 1
  position += 1

while remaining < 0:
  # Take the surplus back round-robin, never going below one per cluster.
  # Repeat passes until the surplus is gone; a single pass that skips
  # clusters already at one would leave the total above n.
  reducible = [cid for cid, data in sorted_clusters if data["number"] > 1]
  if not reducible:
    break
  for cid in reducible:
    if remaining == 0:
      break
    most_clusters_dict[cid]["number"] -= 1
    remaining += 1


In [None]:
import re

def clean_question(question):
  """Normalise a raw exam question string.

  Removes a leading ``Q<digits>.`` label (with the whitespace character that
  follows it), drops newline characters without inserting a space, collapses
  every run of whitespace to one space and trims both ends.
  """
  without_label = re.sub(r'^Q\d+\.\s', '', question)
  single_line = without_label.replace('\n', '')
  return re.sub(r'\s+', ' ', single_line).strip()

In [None]:
import random

question_ids = []
missing_questions_log = []  # ids of sampled questions that could not be loaded

# Draw the requested number of question ids from every matched cluster.
for cluster_id, data in most_clusters_dict.items():
  cluster_doc = db.collection('Cluster_Test2').document(cluster_id).get().to_dict() or {}
  all_question_ids = cluster_doc.get('Questions') or []

  # random.sample raises ValueError when asked for more items than exist,
  # so cap the request at the size of the cluster.
  sample_size = min(data['number'], len(all_question_ids))
  if sample_size < data['number']:
    print(f"Cluster {cluster_id} has only {len(all_question_ids)} questions; requested {data['number']}")

  random_ids = random.sample(all_question_ids, sample_size)
  question_ids.extend(random_ids)

# Resolve each sampled id to its question text.
all_questions = []
for question_id in question_ids:
  doc_snapshot = db.collection('exams').document(question_id).get()
  if not doc_snapshot.exists:
    print(f"Document {question_id} ไม่มีอยู่ใน Firestore")
    missing_questions_log.append(question_id)
    continue

  exam_doc = doc_snapshot.to_dict()
  if exam_doc and 'question' in exam_doc:
    all_questions.append(exam_doc.get('question'))
  else:
    print(f"Document {question_id} ไม่มีฟิลด์ 'question'")
    missing_questions_log.append(question_id)

all_questions = [clean_question(question) for question in all_questions]

In [None]:
len(all_questions)

20

In [None]:
save_file(all_questions, 'auto-KeyBERT')

# **PhraseMap: EmbedRank + Cluster**

In [None]:
import re

# Flatten the job description into one line: newlines, digits and selected
# punctuation become spaces, then whitespace runs are collapsed and trimmed.
# `job_description` is overwritten with the result.
flattened = job_description.replace('\n', ' ')
without_digits = re.sub(r'\d+', ' ', flattened)
without_punctuation = re.sub(r'[()\/\+,.]', ' ', without_digits)
job_description = re.sub(r'\s+', ' ', without_punctuation).strip()
job_description

'Will work closely with the assigned Project Manager to ensure that the scope of the project is well defined and tracked Responsible for gathering and documenting requirements for implementation of IT Systems Enhancements and Features Evaluate existing business solutions processes and systems identify requirements and develop strategies to maximize opportunities Facilitate and or support meetings to illicit requirements from the business users and stakeholders Review documents from business users and stakeholder as inputs to requirements gathering activities Write Business Requirements Documents that will be inputs to the Project Development and Testing phases among others Write Business Process Documents that will be references of business users and stakeholders on the post implementation changes in processes Write Requirements Traceability Matrix to properly track and validate delivery of requirements and features Evaluate existing business solutions processes and systems identify re

In [None]:
from itertools import combinations
from sentence_transformers import SentenceTransformer, util
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def generate_candidates(text, ngram_range=(1, 2)):
    """Return all contiguous word n-grams of ``text``.

    ``text`` is split on whitespace. For every size from ``ngram_range[0]`` to
    ``ngram_range[1]`` (inclusive) each window of that many consecutive words
    is joined with single spaces. Sizes are emitted in increasing order and,
    within a size, windows appear left to right. Duplicates are kept.
    """
    tokens = text.split()
    smallest, largest = ngram_range[0], ngram_range[1]

    phrases = []
    size = smallest
    while size <= largest:
        last_start = len(tokens) - size
        for start in range(last_start + 1):
            phrases.append(' '.join(tokens[start:start + size]))
        size += 1

    return phrases

candidate_phrases = generate_candidates(job_description, ngram_range=(2, 2))

# The NLTK stop-word list is lower-case, so compare case-insensitively;
# otherwise capitalised stop words such as "To" slip through the filter.
candidate_phrases = [
    phrase for phrase in candidate_phrases
    if not any(word.lower() in stop_words for word in phrase.split())
]

# De-duplicate while keeping first-occurrence order. An ordered list (rather
# than a set) keeps phrases aligned with their scores and makes the order of
# equally scored phrases reproducible between runs.
candidate_phrases = list(dict.fromkeys(candidate_phrases))
model = SentenceTransformer('all-MiniLM-L6-v2')

# Rank every candidate phrase by cosine similarity to the whole description.
job_embedding = model.encode(job_description, convert_to_tensor=True)
candidate_embeddings = model.encode(candidate_phrases, convert_to_tensor=True)

cosine_scores = util.cos_sim(job_embedding, candidate_embeddings)

keywords = sorted(zip(candidate_phrases, cosine_scores[0].tolist()), key=lambda x: x[1], reverse=True)

# 30% of the candidate count, clamped to the range [5, 15].
top_n = min(15, max(5, int(0.3 * len(candidate_phrases))))
for kw, score in keywords[:top_n]:
    print(f"Keyword: {kw}, Score: {score:.4f}")

RuntimeError: Trying to override a python impl for DispatchKey.AutogradCUDA on operator aten::rrelu_with_noise

In [None]:
# Take the top_n best-ranked (phrase, score) pairs and keep the phrase only.
top_ranked = keywords[:top_n]
filtered_keywords = list(map(lambda pair: pair[0], top_ranked))
filtered_keywords

In [None]:
from sentence_transformers import SentenceTransformer

# Encode the selected key phrases with the `model` created in an earlier cell;
# convert_to_tensor=True asks encode() for a tensor instead of a NumPy array.
embeddings = model.encode(filtered_keywords, convert_to_tensor=True)

In [None]:
len(embeddings)

In [None]:
# Collect the document id of every question cluster stored in Firestore.
clusters = db.collection('Cluster_Test2').stream()
clusters_id = list(map(lambda snapshot: snapshot.id, clusters))
clusters_id

In [None]:
# Read the stored 'Embedding vector' field of each cluster, in clusters_id order.
cluster_collection = db.collection('Cluster_Test2')
embeddings_list = [
    cluster_collection.document(cluster_doc_id).get().to_dict().get('Embedding vector')
    for cluster_doc_id in clusters_id
]

In [None]:
len(embeddings_list)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

most_clusters_dict = {}
n = 20  # total number of questions to select across all matched clusters

# encode(..., convert_to_tensor=True) may return tensors that live on the GPU;
# NumPy cannot read device memory, so copy each row to the host first.
keyword_matrix = np.vstack([
    np.asarray(vector.cpu() if hasattr(vector, 'cpu') else vector).reshape(1, -1)
    for vector in embeddings
])
cluster_matrix = np.vstack([np.asarray(vector).reshape(1, -1) for vector in embeddings_list])

# similarity_matrix[i][j] is the cosine similarity of keyword i and cluster j.
similarity_matrix = cosine_similarity(keyword_matrix, cluster_matrix)

# Assign every keyword to its most similar question cluster.
for keyword_similarities in similarity_matrix:
  most_similar_index = int(np.argmax(keyword_similarities))
  cluster_id = clusters_id[most_similar_index]

  if cluster_id not in most_clusters_dict:
    most_clusters_dict[cluster_id] = {"count": 0, "vector": [], "total_similarity": 0.0}

  most_clusters_dict[cluster_id]['count'] += 1
  most_clusters_dict[cluster_id]['vector'] = embeddings_list[most_similar_index]
  most_clusters_dict[cluster_id]["total_similarity"] += float(keyword_similarities[most_similar_index])

# Weights must be relative to the keywords that were actually embedded and
# assigned. In this section `keywords` holds every ranked candidate, so
# len(keywords) can be much larger and would shrink every weight.
total_keywords = len(similarity_matrix)
total_numbers = 0

for cluster_id, data in most_clusters_dict.items():
  weight = data["count"] / total_keywords
  data["weight"] = weight

  # Every matched cluster contributes at least one question.
  number = max(1, round(weight * n))
  data["number"] = number
  total_numbers += number

# Rounding (and the minimum of one) can make the total differ from n.
adjustment = n - total_numbers
sorted_clusters = sorted(most_clusters_dict.items(), key=lambda x: x[1]["weight"], reverse=True)

remaining = adjustment
position = 0
while remaining > 0:
  # Hand out the missing questions round-robin, heaviest cluster first.
  cluster_id = sorted_clusters[position % len(sorted_clusters)][0]
  most_clusters_dict[cluster_id]["number"] += 1
  remaining -= 1
  position += 1

while remaining < 0:
  # Take the surplus back round-robin, never going below one per cluster.
  # Repeat passes until the surplus is gone; a single pass that skips
  # clusters already at one would leave the total above n.
  reducible = [cid for cid, data in sorted_clusters if data["number"] > 1]
  if not reducible:
    break
  for cid in reducible:
    if remaining == 0:
      break
    most_clusters_dict[cid]["number"] -= 1
    remaining += 1


In [None]:
import re

def clean_question(question):
  """Return ``question`` without its leading ``Q<digits>.`` label and with tidy whitespace.

  Newline characters are removed outright (no space is inserted), remaining
  whitespace runs are collapsed to a single space, and the ends are trimmed.
  """
  text = re.sub(r'^Q\d+\.\s', '', question).replace('\n', '')
  text = re.sub(r'\s+', ' ', text)
  return text.strip()

In [None]:
import random

question_ids = []
missing_questions_log = []  # ids of sampled questions that could not be loaded

# Draw the requested number of question ids from every matched cluster.
for cluster_id, data in most_clusters_dict.items():
  cluster_doc = db.collection('Cluster_Test2').document(cluster_id).get().to_dict() or {}
  all_question_ids = cluster_doc.get('Questions') or []

  # random.sample raises ValueError when asked for more items than exist,
  # so cap the request at the size of the cluster.
  sample_size = min(data['number'], len(all_question_ids))
  if sample_size < data['number']:
    print(f"Cluster {cluster_id} has only {len(all_question_ids)} questions; requested {data['number']}")

  random_ids = random.sample(all_question_ids, sample_size)
  question_ids.extend(random_ids)

# Resolve each sampled id to its question text.
all_questions = []
for question_id in question_ids:
  doc_snapshot = db.collection('exams').document(question_id).get()
  if not doc_snapshot.exists:
    print(f"Document {question_id} ไม่มีอยู่ใน Firestore")
    missing_questions_log.append(question_id)
    continue

  exam_doc = doc_snapshot.to_dict()
  if exam_doc and 'question' in exam_doc:
    all_questions.append(exam_doc.get('question'))
  else:
    print(f"Document {question_id} ไม่มีฟิลด์ 'question'")
    missing_questions_log.append(question_id)

all_questions = [clean_question(question) for question in all_questions]

In [None]:
len(all_questions)

In [None]:
save_file(all_questions, 'auto-EmbedRank')

# **Direct SBERT: SBERT + Cluster**

In [None]:
import re

def clean_jd(text):
  """Clean one sentence of the job description.

  Digit runs and the characters ``( ) / + ,`` are replaced by spaces, then
  whitespace runs are collapsed to one space and the ends are trimmed.
  Periods are left in place.
  """
  for pattern in (r'\d+', r'[()\/\+,]', r'\s+'):
    text = re.sub(pattern, ' ', text)
  return text.strip()

# NOTE(review): this split needs the raw, newline-separated description.
# Earlier cells overwrite `job_description` with a flattened copy (newlines
# and periods removed), so re-run the job-description cell before this one.
#
# Lines that are empty after cleaning are dropped so that blank "sentences"
# are not embedded and clustered.
cleaned_job_description = [
    sentence
    for sentence in (clean_jd(line) for line in job_description.split('\n'))
    if sentence
]
cleaned_job_description

['Partnering with business users to gather requirements to formulate business specifications and translating them into business functionalities.',
 'To participate in cross-functional projects related to procurement transformation digitalization and strategic sourcing to support organizational goals.',
 'Documenting workflows and results of business processes analysis and obtaining sign-off from users on specifications.',
 'Designing and executing test plans to ensure that business requirements and functional specifications are implemented properly and meet objectives.',
 'Need to translate application functionality into application architecture and the production of a business functionality requirement document.',
 'Configure and customize procurement & expense systems such as Rydoo Concur Coupa Oracle Fusion SAP Ariba Tipalti with ERP systems and supplier management tools to meet business needs and optimize procurement processes.',
 'Conducts meetings on all levels of management for 

In [None]:
from sentence_transformers import SentenceTransformer

# Load the sentence-embedding model used for the rest of this section.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the list of cleaned job-description sentences.
embedded_jd = model.encode(cleaned_job_description)

In [None]:
len(embedded_jd)

15

In [None]:
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.cluster import KMeans

def Silhouette(embedding, max_k=None, random_state=42):
  """Pick the number of KMeans clusters with the best silhouette score.

  Fits KMeans for every k in ``range(2, max_k)``, prints the silhouette
  coefficient of each fit and returns the k with the highest one.

  Args:
    embedding: samples to cluster, one row per sample.
    max_k: exclusive upper bound for k. Defaults to the number of samples in
      ``embedding`` (evaluated at call time rather than frozen when the
      function is defined).
    random_state: seed passed to KMeans so repeated runs give the same scores.

  Returns:
    The k with the highest silhouette coefficient.

  Raises:
    ValueError: if ``range(2, max_k)`` is empty, so there is nothing to compare.
  """
  if max_k is None:
    max_k = len(embedding)

  s = {}

  for k in range(2, max_k):
    kmeans = KMeans(n_clusters=k, init="k-means++", random_state=random_state)
    labels = kmeans.fit_predict(embedding)
    # Compute the score once and reuse it for both the table and the log line.
    s[k] = silhouette_score(embedding, labels)

    print("Silhouette Coefficient for k == %s: %s" % (k, round(s[k], 4)))

  if not s:
    raise ValueError("Silhouette needs max_k > 2 to compare cluster counts (got max_k=%s)" % max_k)

  best_k = max(s, key=s.get)
  print("Best k: %s" % best_k)
  return best_k

k = Silhouette(embedded_jd)

Silhouette Coefficient for k == 2: 0.0344
Silhouette Coefficient for k == 3: 0.0351
Silhouette Coefficient for k == 4: 0.0175
Silhouette Coefficient for k == 5: 0.018
Silhouette Coefficient for k == 6: 0.0666
Silhouette Coefficient for k == 7: 0.0155
Silhouette Coefficient for k == 8: 0.0161
Silhouette Coefficient for k == 9: 0.0452
Silhouette Coefficient for k == 10: 0.0425
Silhouette Coefficient for k == 11: 0.023
Silhouette Coefficient for k == 12: 0.0023
Silhouette Coefficient for k == 13: -0.0003
Silhouette Coefficient for k == 14: 0.0138
<built-in method get of dict object at 0x78b996534080>


In [None]:
k

6

In [None]:
import matplotlib.pyplot as plt

# Fit the final model with the selected k. A fixed seed keeps the
# sentence-to-cluster assignment reproducible across runs.
kmeans = KMeans(n_clusters=k, random_state=42)
clusters = kmeans.fit_predict(embedded_jd)

In [None]:
# Turn every sentence embedding into a single-row 2-D array (shape (1, -1))
# and rebind `embedded_jd` to the resulting list.
embedded_jd = list(map(lambda vector: vector.reshape(1, -1), embedded_jd))
embedded_jd

[array([[ 6.43321825e-03,  4.66893129e-02, -8.50912556e-03,
         -4.41771522e-02, -9.99239236e-02,  4.45228964e-02,
          7.13001378e-03,  6.93144053e-02, -2.30506472e-02,
         -9.58681628e-02, -1.13916725e-01, -5.88905215e-02,
          1.44570395e-02,  8.08895528e-02,  9.48432535e-02,
         -4.00157133e-03,  6.49544969e-02, -4.04059328e-02,
         -5.69400415e-02, -3.78791690e-02, -1.43171176e-02,
         -1.55724930e-02, -2.30567735e-02, -1.88817307e-02,
         -5.84181100e-02, -3.34129147e-02,  4.19821292e-02,
         -5.47198616e-02,  9.49105714e-03, -1.15240850e-02,
          3.93770486e-02,  5.04786856e-02,  8.04673806e-02,
          1.57998205e-04,  2.34812591e-02,  1.20249698e-02,
          3.53788845e-02,  7.43534276e-03,  2.81306542e-02,
         -2.82338802e-02, -1.04863688e-01, -7.56338984e-02,
         -6.32895827e-02,  8.35033879e-03,  2.62222104e-02,
         -2.25191694e-02,  1.32481111e-02,  1.51477726e-02,
         -5.42822666e-02,  9.67920423e-0

In [None]:
import pandas as pd

# One row per job-description sentence: its cleaned text, its embedding and
# the cluster label produced by kmeans.fit_predict in an earlier cell.
df_jd = pd.DataFrame({
    'text': cleaned_job_description,
    'embedded': embedded_jd,
    'clusters': clusters
})

In [None]:
# Collect the document id of every question cluster stored in Firestore.
# The stream is consumed directly instead of being bound to `clusters`, so the
# KMeans labels held in `clusters` stay intact and the df_jd cell can be re-run.
cluster_ids = [snapshot.id for snapshot in db.collection('Cluster_Test2').stream()]
cluster_ids

['3KENaUxsMffy66kIeyRo',
 '4bvU9CcZQZW1AvdQsW0q',
 'DaigQLrCFTQIaJkHugk8',
 'JyAOgqXMsixrbCx8Ef8P',
 'MZkhebzSpKiJ6363slZj',
 'Yf3bxcNBrgnhT7iwmOll',
 'bgpS1q8VFaBEhjURqrbq',
 'hQ9SdnyeWRWty5p8KbUB',
 'jDuobXqOoilPjoV4m6hK',
 'nZpOEwlCeuLMMzo36Wao',
 'tsDEsmDFauRhUrSXZJur']

In [None]:
# Read the stored 'Embedding vector' field of each cluster, in cluster_ids order.
q_cluster_vectors = []
for cluster_doc_id in cluster_ids:
  cluster_snapshot = db.collection('Cluster_Test2').document(cluster_doc_id).get()
  q_cluster_vectors.append(cluster_snapshot.to_dict().get('Embedding vector'))


In [None]:
# These are the same vectors as q_cluster_vectors (same collection, same ids,
# same field), so copy that list instead of reading every cluster document
# from Firestore a second time.
vector_clusters = list(q_cluster_vectors)
len(vector_clusters)

11

In [None]:
df_jd.head()

Unnamed: 0,text,embedded,clusters
0,Partnering with business users to gather requi...,"[[0.0064332183, 0.046689313, -0.008509126, -0....",1
1,To participate in cross-functional projects re...,"[[-0.0587302, -0.022074819, -0.048767503, -0.0...",5
2,Documenting workflows and results of business ...,"[[-0.028350929, 0.09996848, -0.068952054, -0.0...",3
3,Designing and executing test plans to ensure t...,"[[-0.0018280972, 0.10204602, -0.009627427, -0....",1
4,Need to translate application functionality in...,"[[-0.044205897, 0.035138786, -0.038380213, -0....",1


In [None]:
# Mean embedding of the sentences assigned to each KMeans cluster 0..k-1.
cluster_vectors = {}
for cluster in range(k):
    member_vectors = df_jd.loc[df_jd['clusters'] == cluster, 'embedded'].tolist()
    cluster_vectors[cluster] = np.mean(np.array(member_vectors), axis=0)
cluster_vectors

{0: array([[-2.98744626e-02,  1.51896868e-02, -5.19271940e-02,
         -1.63102988e-02, -5.91324866e-02, -3.18754278e-03,
         -1.61691923e-02,  5.91030484e-03, -4.22356874e-02,
          2.37204693e-02, -7.74224196e-03,  2.37929970e-02,
          2.55689956e-04,  2.35509947e-02,  6.50934409e-03,
         -3.40842456e-02,  4.37420867e-02, -5.51878661e-03,
          5.03177308e-02, -4.15088758e-02,  3.73106450e-02,
         -2.56480277e-03,  2.66371481e-02,  1.96886808e-02,
         -1.01951569e-01,  2.11965572e-02, -4.90320809e-02,
         -3.13083082e-02,  3.79265249e-02, -5.24995327e-02,
         -2.17292644e-02, -1.18594132e-02,  1.04583278e-01,
          1.78602729e-02, -1.84145290e-02,  5.47103919e-02,
          9.42372456e-02,  2.11530831e-03,  1.17710913e-02,
          5.75130945e-03, -7.78508186e-02, -4.84023467e-02,
         -4.88164350e-02, -2.98581040e-03, -5.78498282e-03,
          1.72257191e-03, -9.27205849e-03, -4.76787426e-02,
         -5.77596873e-02,  9.81774880

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# For every job-description cluster, find the question cluster whose stored
# vector has the highest cosine similarity to the cluster's mean embedding.
question_rows = [np.array(q_vector).reshape(1, -1) for q_vector in q_cluster_vectors]

most_clusters = {}
for cluster, vector in cluster_vectors.items():
  jd_row = np.array(vector).reshape(1, -1)
  all_similarity = [cosine_similarity(jd_row, question_row)[0][0] for question_row in question_rows]
  most_clusters[cluster] = cluster_ids[np.argmax(all_similarity)]

In [None]:
most_clusters

{0: '4bvU9CcZQZW1AvdQsW0q',
 1: 'DaigQLrCFTQIaJkHugk8',
 2: 'MZkhebzSpKiJ6363slZj',
 3: 'DaigQLrCFTQIaJkHugk8',
 4: 'nZpOEwlCeuLMMzo36Wao',
 5: 'MZkhebzSpKiJ6363slZj'}

In [None]:
n = 20  # total number of questions to select

questions_number = {}
total_numbers = 0

# Give each job-description cluster a share of n proportional to the number
# of sentences it contains.
for i in range(k):
  percent = (len(df_jd[df_jd['clusters'] == i])) / len(cleaned_job_description)
  number = round(percent * n)
  total_numbers += number
  questions_number[i] = {'number': number, 'weight': percent}

# Rounding can make the total differ from n.
adjustment = n - total_numbers

sorted_clusters = sorted(questions_number.items(), key=lambda x: x[1]["weight"], reverse=True)

remaining = adjustment
position = 0
while remaining > 0:
  # Hand out the missing questions round-robin, heaviest cluster first.
  cluster_id = sorted_clusters[position % len(sorted_clusters)][0]
  questions_number[cluster_id]["number"] += 1
  remaining -= 1
  position += 1

while remaining < 0:
  # Take the surplus back round-robin without going below zero. Repeat passes
  # until the surplus is gone; a single pass that skips clusters already at
  # zero would leave the total above n.
  reducible = [cluster_id for cluster_id, data in sorted_clusters if data["number"] > 0]
  if not reducible:
    break
  for cluster_id in reducible:
    if remaining == 0:
      break
    questions_number[cluster_id]["number"] -= 1
    remaining += 1

questions_number

{0: {'number': 5, 'weight': 0.26666666666666666},
 1: {'number': 7, 'weight': 0.3333333333333333},
 2: {'number': 1, 'weight': 0.06666666666666667},
 3: {'number': 3, 'weight': 0.13333333333333333},
 4: {'number': 1, 'weight': 0.06666666666666667},
 5: {'number': 3, 'weight': 0.13333333333333333}}

In [None]:
import re

def clean_question(question):
  """Strip a leading ``Q<digits>.`` label and normalise whitespace.

  Newlines are deleted (not replaced by a space); any remaining whitespace
  run becomes one space and the result is trimmed.
  """
  return re.sub(r'\s+', ' ', re.sub(r'^Q\d+\.\s', '', question).replace('\n', '')).strip()

In [None]:
from concurrent.futures import ThreadPoolExecutor
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

final_questions = []


def _load_cluster_questions(cluster):
    """Fetch, clean and embed every question of one question cluster.

    Returns ``(cleaned_questions, embedding)``. When the cluster has no
    loadable question the list is empty and the embedding is an empty array.
    """
    cluster_doc = db.collection('Cluster_Test2').document(cluster).get().to_dict() or {}
    qids = cluster_doc.get('Questions') or []

    questions = []
    for qid in qids:
        doc_snapshot = db.collection('exams').document(qid).get()
        if not doc_snapshot.exists:
            print(f"Document {qid} ไม่มีอยู่ใน Firestore")
            continue

        exam_doc = doc_snapshot.to_dict()
        if exam_doc and 'question' in exam_doc:
            questions.append(exam_doc.get('question'))
        else:
            print(f"Document {qid} ไม่มีฟิลด์ 'question'")

    cleaned_questions = [clean_question(question) for question in questions]
    if not cleaned_questions:
        return cleaned_questions, np.empty((0, 0))

    embedding = model.encode(cleaned_questions, batch_size=32, show_progress_bar=True)
    return cleaned_questions, embedding


def fetch_questions_and_embedding(cluster, questions_number, index):
    """Load one question cluster together with the quota of a job-description cluster.

    Args:
        cluster: Firestore document id of the question cluster.
        questions_number: mapping of job-description cluster index to a dict
            that has a 'number' entry.
        index: job-description cluster index to look up in ``questions_number``.

    Returns:
        ``(cleaned_questions, embedding, number)``.
    """
    number = questions_number[index]['number']
    cleaned_questions, embedding = _load_cluster_questions(cluster)
    return cleaned_questions, embedding, number


# Several job-description clusters can map to the same question cluster, so
# load and embed each distinct question cluster only once (in parallel).
unique_cluster_ids = sorted(set(most_clusters.values()))
with ThreadPoolExecutor() as executor:
    loaded_clusters = dict(zip(unique_cluster_ids,
                               executor.map(_load_cluster_questions, unique_cluster_ids)))

already_selected = set()
for i in range(k):
    cleaned_questions, embedding = loaded_clusters[most_clusters[i]]
    number = questions_number[i]['number']

    # A quota of 0 must select nothing: slicing with [-0:] would return every
    # question. An empty cluster has nothing to rank either.
    if number <= 0 or not cleaned_questions:
        continue

    similarity_each_question = np.dot(embedding, cluster_vectors[i].reshape(-1, 1)).flatten()

    # Walk the questions from most to least similar and skip any question
    # that was already picked for another job-description cluster.
    picked = 0
    for j in np.argsort(similarity_each_question)[::-1]:
        question = cleaned_questions[j]
        if question in already_selected:
            continue
        already_selected.add(question)
        final_questions.append(question)
        picked += 1
        if picked == number:
            break


Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/3 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/6 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

In [None]:
final_questions

['A company monitors and manages business operations by establishing standards and operational procedures to prevent unlawful or unjus tifiable acts, mistakes, errors, and other problems, and to ensure that the organization is run healthy, effectively, and efficiently. Which of the following is the most ap propriate term that represents this policy or mechanism?',
 'When a department that outsources system development must undergo a system audit concerning “progress management ” of the outsourcing partner , which of the following is an appropriate set of materials that should be submit ted?',
 'When the four roles, change control admi nistrator, configura tion administrator, maintenance administrator, and maintenance staff are defined in software maintenance organizations, which of the following pers ons should be primarily responsible for providing mechanisms for identifying, contro lling, and tracking ve rsions of software components?',
 'Which of the following is the appropriat e us

In [None]:
save_file(final_questions, 'auto-SBert')