In [1]:
import threading
import tkinter as tk
from tkinter import messagebox
from tkinter import ttk
from tkinter.scrolledtext import ScrolledText

import joblib
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from ttkbootstrap import Style

# Ensure NLTK resources are downloaded.
# Fetches the 'stopwords', 'punkt' and 'wordnet' packages into the local
# nltk_data directory; the captured output below shows they were already
# up to date on the authoring machine.
# NOTE(review): nothing visible in this notebook calls into NLTK
# (preprocess_query returns the query unchanged) — presumably these resources
# were needed when the TF-IDF artifacts were built; confirm they are still required.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Thales
[nltk_data]     Mustafa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Define the functions for loading data and models
def load_data_and_models(dataset):
    """Load the TF-IDF vectorizer, TF-IDF matrix and document table for a dataset.

    Args:
        dataset: Dataset name, either 'Clinical Trials' or 'Argsme'.

    Returns:
        A tuple ``(vectorizer, tfidf_matrix, df, original_df)`` where ``df`` is
        the DataFrame read from the dataset's CSV file and ``original_df`` is
        an independent copy of it.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    # (vectorizer pickle, TF-IDF matrix pickle, documents CSV) per dataset.
    artifact_paths = {
        'Clinical Trials': (
            'clinicaltrials/tfidf_vectorizer.pkl',
            'clinicaltrials/tfidf_matrix.pkl',
            'clinicaltrials/clinicaltrials_dataset.csv',
        ),
        'Argsme': (
            'argsme/tfidf_vectorizer.pkl',
            'argsme/tfidf_tfidf_matrix.pkl',
            'argsme/argsme_dataset.csv',
        ),
    }
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if dataset not in artifact_paths:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(artifact_paths)}"
        )
    vectorizer_path, matrix_path, csv_path = artifact_paths[dataset]

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load artifacts that come from a trusted source.
    vectorizer = joblib.load(vectorizer_path)
    tfidf_matrix = joblib.load(matrix_path)
    df = pd.read_csv(csv_path)
    original_df = df.copy()
    return vectorizer, tfidf_matrix, df, original_df

def load_clustering_models(dataset):
    """Load the stored clustering results for a dataset.

    Args:
        dataset: Dataset name, either 'Clinical Trials' or 'Argsme'.

    Returns:
        A tuple ``(clusters, cluster_centers)`` taken from the 'clusters' and
        'cluster_centers' entries of the pickled results object.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    results_paths = {
        'Clinical Trials': 'clinicaltrials/combined_cluster_results.pkl',
        'Argsme': 'argsme/combined_cluster_results.pkl',
    }
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if dataset not in results_paths:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(results_paths)}"
        )

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load artifacts that come from a trusted source.
    cluster_results = joblib.load(results_paths[dataset])
    return cluster_results['clusters'], cluster_results['cluster_centers']

def load_ground_truth(dataset):
    """Load the pickled ground-truth relevance data for a dataset.

    Args:
        dataset: Dataset name, either 'Clinical Trials' or 'Argsme'.

    Returns:
        The object stored in the dataset's ``ground_truth.pkl`` file.
        search() uses it as a mapping from query text to relevant document ids.

    Raises:
        ValueError: If ``dataset`` is not one of the supported names.
    """
    ground_truth_paths = {
        'Clinical Trials': 'clinicaltrials/ground_truth.pkl',
        'Argsme': 'argsme/ground_truth.pkl',
    }
    # Fail early with a clear message instead of an UnboundLocalError further down.
    if dataset not in ground_truth_paths:
        raise ValueError(
            f"Unknown dataset {dataset!r}; expected one of {sorted(ground_truth_paths)}"
        )

    # NOTE: joblib.load unpickles the file and can execute arbitrary code;
    # only load artifacts that come from a trusted source.
    return joblib.load(ground_truth_paths[dataset])

In [None]:
def preprocess_query(query):
    """Return the query prepared for vectorization.

    No preprocessing is applied at the moment: the input is handed back
    unchanged. This is the hook where query preprocessing steps belong.
    """
    return query

def vectorize_query(query, vectorizer):
    """Transform a single query with the given fitted vectorizer.

    The query is wrapped in a one-element list before being passed to
    ``vectorizer.transform``; whatever that call returns is returned as is.
    """
    single_document_batch = [query]
    return vectorizer.transform(single_document_batch)

def cluster_based_ranking(query_vector, tfidf_matrix, cluster_labels, cluster_centers):
    """Rank the documents of the cluster closest to the query.

    The cluster whose centre has the highest cosine similarity to the query is
    selected, and only the documents labelled with that cluster are ranked, by
    descending cosine similarity to the query.

    Args:
        query_vector: Vectorized query (a single row).
        tfidf_matrix: Document-term matrix; row ``i`` belongs to document ``i``.
        cluster_labels: Cluster label of each document, in row order.
            Assumes label values equal row numbers of ``cluster_centers`` —
            TODO confirm against how the clustering results were produced.
        cluster_centers: One row per cluster centre.

    Returns:
        A list of row positions into ``tfidf_matrix``, best match first.
        Empty when no document carries the closest cluster's label.
    """
    cluster_similarities = cosine_similarity(query_vector, cluster_centers)
    closest_cluster = cluster_similarities.argmax()

    # Row positions of the documents assigned to the closest cluster.
    cluster_indices = np.flatnonzero(np.asarray(cluster_labels) == closest_cluster)
    if cluster_indices.size == 0:
        # cosine_similarity raises on a zero-row matrix; an empty cluster
        # simply has nothing to rank.
        return []

    similarities = cosine_similarity(query_vector, tfidf_matrix[cluster_indices])
    rankings = similarities.argsort().flatten()[::-1]

    # Map positions within the cluster back to positions in the full matrix.
    return cluster_indices[rankings].tolist()

def calculate_precision_at_k(relevant_docs, retrieved_docs, k):
    """Compute precision at cut-off ``k``.

    Args:
        relevant_docs: Iterable of relevant document ids.
        retrieved_docs: Sequence of retrieved document ids, best first.
        k: Cut-off rank.

    Returns:
        The number of distinct relevant ids among the first ``k`` retrieved
        ids, divided by ``k``. Returns 0.0 when ``k`` is not positive, where
        the measure is undefined (previously ``k == 0`` raised
        ZeroDivisionError and a negative ``k`` produced a negative value).
    """
    if k <= 0:
        return 0.0
    hits = set(relevant_docs) & set(retrieved_docs[:k])
    return len(hits) / k

def calculate_recall(relevant_docs, retrieved_docs):
    """Compute recall of the retrieved documents.

    Args:
        relevant_docs: Sized collection of relevant document ids.
        retrieved_docs: Iterable of retrieved document ids.

    Returns:
        The number of distinct relevant ids that were retrieved, divided by
        ``len(relevant_docs)``. Returns 0.0 when there are no relevant
        documents, matching how calculate_average_precision treats that case
        (previously this raised ZeroDivisionError).
    """
    total_relevant = len(relevant_docs)
    if total_relevant == 0:
        return 0.0
    found = set(relevant_docs) & set(retrieved_docs)
    return len(found) / total_relevant

def calculate_average_precision(relevant_docs, retrieved_docs):
    """Compute average precision of a ranked result list.

    Walks the retrieved ids in rank order; at every rank holding a relevant
    id, the precision at that rank is added to a running sum. The sum is
    divided by ``len(relevant_docs)``. Returns 0 when ``relevant_docs`` is empty.
    """
    wanted = set(relevant_docs)
    hits_so_far = 0
    precision_sum = 0.0
    for rank, candidate in enumerate(retrieved_docs, start=1):
        if candidate not in wanted:
            continue
        hits_so_far += 1
        precision_sum += hits_so_far / rank

    if relevant_docs:
        return precision_sum / len(relevant_docs)
    return 0

def calculate_reciprocal_rank(relevant_docs, retrieved_docs):
    """Return the reciprocal of the rank of the first relevant result.

    Ranks start at 1. Returns 0 when none of the retrieved ids is contained
    in ``relevant_docs``.
    """
    reciprocal_ranks = (
        1 / rank
        for rank, candidate in enumerate(retrieved_docs, start=1)
        if candidate in relevant_docs
    )
    return next(reciprocal_ranks, 0)

def search():
    """Run a search for the query in the entry box and show results and metrics.

    Reads the query from ``query_entry``, ranks documents with
    cluster_based_ranking against the currently loaded dataset globals, and
    lists the top 20 in ``results_listbox`` as ``"<doc_id>: <title>"``. When
    the query is a key of ``ground_truth``, Precision@10, recall, average
    precision and reciprocal rank over those 20 results are written to
    ``details_text``; otherwise the details box is left empty.

    An empty or whitespace-only query shows a warning dialog and does nothing
    else.
    """
    # A whitespace-only query carries no terms, so treat it like an empty one.
    query = query_entry.get().strip()
    if not query:
        messagebox.showwarning("Input Error", "Please enter a search query.")
        return

    processed_query = preprocess_query(query)
    query_vector = vectorize_query(processed_query, vectorizer)
    rankings = cluster_based_ranking(query_vector, tfidf_matrix, cluster_labels, cluster_centers)
    top_documents = df.iloc[rankings[:20]]

    # The column holding the display title differs between the two datasets.
    title_column = 'title' if dataset_var.get() == 'Clinical Trials' else 'source_title'
    retrieved_docs = top_documents['doc_id'].tolist()

    results_listbox.delete(0, tk.END)
    for doc_id, title in zip(retrieved_docs, top_documents[title_column]):
        results_listbox.insert(tk.END, f"{doc_id}: {title}")

    # Always clear the details box so metrics from an earlier query are not
    # left on screen next to the results of a query without ground truth.
    details_text.delete("1.0", tk.END)

    if query in ground_truth:
        relevant_docs = ground_truth[query]

        p_at_10 = calculate_precision_at_k(relevant_docs, retrieved_docs, 10)
        recall = calculate_recall(relevant_docs, retrieved_docs)
        ap = calculate_average_precision(relevant_docs, retrieved_docs)
        rr = calculate_reciprocal_rank(relevant_docs, retrieved_docs)

        metrics_text = f"Metrics for query '{query}':\n"
        metrics_text += f"  Precision@10: {p_at_10:.3f}\n"
        metrics_text += f"  Recall: {recall:.3f}\n"
        metrics_text += f"  Average Precision: {ap:.3f}\n"
        metrics_text += f"  Reciprocal Rank: {rr:.3f}\n"
        details_text.insert(tk.END, metrics_text)

# Load data and models for the initial dataset.
# These module-level names are the shared state that search() reads and that
# update_data_and_models() rebinds (via `global`) when the dataset changes.
# NOTE(review): update_data_and_models(None) is also called during GUI setup,
# which presumably loads this same dataset a second time — confirm whether
# both loads are needed.
vectorizer, tfidf_matrix, df, original_df = load_data_and_models('Clinical Trials')
cluster_labels, cluster_centers = load_clustering_models('Clinical Trials')
ground_truth = load_ground_truth('Clinical Trials')

def update_data_and_models(event):
    """Reload every dataset-dependent global for the dataset chosen in the GUI.

    Bound to the combobox's ``<<ComboboxSelected>>`` event and also called
    directly with ``None``; the ``event`` argument is not used.

    All artifacts are loaded into local variables first and the globals are
    rebound only after every load has succeeded. If any load raises, the
    previously loaded dataset stays fully in place instead of being left
    half-replaced (e.g. a new TF-IDF matrix paired with old cluster labels).
    """
    global vectorizer, tfidf_matrix, df, original_df, ground_truth, cluster_labels, cluster_centers
    dataset = dataset_var.get()

    new_vectorizer, new_tfidf_matrix, new_df, new_original_df = load_data_and_models(dataset)
    new_cluster_labels, new_cluster_centers = load_clustering_models(dataset)
    new_ground_truth = load_ground_truth(dataset)

    # Every load succeeded: swap the whole state in one go.
    vectorizer, tfidf_matrix, df, original_df = (
        new_vectorizer,
        new_tfidf_matrix,
        new_df,
        new_original_df,
    )
    cluster_labels, cluster_centers = new_cluster_labels, new_cluster_centers
    ground_truth = new_ground_truth

# GUI setup: build the main window and its widgets, then lay them out.
root = tk.Tk()
root.title("Document Search with Clustering")

# Dataset selection: a read-only combobox backed by dataset_var; choosing an
# entry reloads the data and models.
dataset_var = tk.StringVar()
dataset_label = ttk.Label(root, text="Select Dataset:")
dataset_combobox = ttk.Combobox(
    root,
    textvariable=dataset_var,
    state='readonly',
    values=('Clinical Trials', 'Argsme'),
)
dataset_combobox.current(0)
dataset_combobox.bind("<<ComboboxSelected>>", update_data_and_models)

# Query input
query_label = ttk.Label(root, text="Enter Query:")
query_entry = ttk.Entry(root, width=50)

# Search button
search_button = ttk.Button(root, text="Search", command=search)

# Results listbox
results_listbox = tk.Listbox(root, width=100, height=20)

# Details text box
details_text = tk.Text(root, height=10, width=100)

# Stack the widgets top to bottom in display order, each with the same padding.
for widget in (
    dataset_label,
    dataset_combobox,
    query_label,
    query_entry,
    search_button,
    results_listbox,
    details_text,
):
    widget.pack(pady=5)

# Initialize data and models
update_data_and_models(None)

root.mainloop()