In [1]:
import os
import pandas as pd
import numpy as np
import json
import faiss
import logging
import argparse
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ==============================
# 1. LOAD AND PROCESS dataset.txt
# ==============================
def load_datasets_from_txt(file_path):
    """Reads dataset descriptions from a TXT file and stores them in a list."""
    with open(file_path, "r", encoding="utf-8") as file:
        data = file.read()

    # Split datasets based on "Name: " as each dataset starts with this
    datasets = data.split("\n\nName: ")
    dataset_list = []

    for dataset in datasets:
        dataset_info = dataset.strip().split("\n")
        if dataset_info[0].startswith("Name: "):
            dataset_name = dataset_info[0].replace("Name: ", "").strip()
        else:
            dataset_name = dataset_info[0].strip()

        dataset_list.append({"name": dataset_name, "description": "\n".join(dataset_info)})

    return dataset_list

# Load dataset descriptions
dataset_file_path = "/mnt/data1/raiyan/Mammo-Find/dataset.txt"  # Change this if needed
datasets = load_datasets_from_txt(dataset_file_path)

In [3]:
datasets

[{'name': 'Here are some description of datasets. Information regarding each dataset is provided below:',
  'description': 'Here are some description of datasets. Information regarding each dataset is provided below:'},
 {'name': 'EMBED',
  'description': 'EMBED\nInformation: Stands for EMory BrEast imaging Dataset. This dataset contains 3,383,659 screening and diagnostic mammogram images from 115,910 patients. Among these, 20% of the total 2D and C-view dataset is available for research use. This 20% contains Total 480,606 dicom images, Total 676,009 png images (20%) and Total 34,004 spot magnified images. It also has 4 files of clinical data and metadata.\nCan be used for: Breast_Cancer_Detection, Breast_Cancer_Risk_Prediction, Mammographic_Report_Generation, Breast_Cancer_Type_Classification, Breast_Tumor_Classification, Tumor_Localization, Breast_Density_Estimation, Synthetic_Data_Generation\nIs Data Available?: Available_upon_signing_agreement.\nData Link: Available at https://aws

In [6]:
import os
import numpy as np
import json
import faiss
import logging
import argparse
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# ==============================
# 1. LOAD AND PROCESS dataset.json
# ==============================
def load_datasets_from_json(file_path):
    """Reads dataset descriptions from a JSON file with variable structures and stores them in a list."""
    with open(file_path, "r", encoding="utf-8") as file:
        datasets = json.load(file)

    dataset_list = []

    for dataset in datasets:
        # Extracting the 'Name' field, assuming it's the identifier for the dataset
        dataset_name = dataset.get("Name", "Unknown Name").strip()  # Default to "Unknown Name" if not present

        # Extracting other fields if available
        dataset_info = {
            "name": dataset_name,
            "information": dataset.get("Information", "No information available").strip(),
            "can_be_used_for": dataset.get("Can be used for", "No usage information").strip(),
            "is_data_available": dataset.get("Is Data Available?", "Unknown availability").strip(),
            "data_link": dataset.get("Data Link", "No link provided").strip(),
            "associated_article": dataset.get("Associated Article", "No associated article").strip(),
            "data_article_published_on": dataset.get("Data/Article published on", "Unknown date").strip(),
            "article_available_at": dataset.get("Article available at", "No article link provided").strip(),
            "types_of_data": dataset.get("Types of data in dataset", "No data types specified").strip(),
            "types_of_files": dataset.get("Types of files in dataset", "No file types specified").strip(),
            "data_collected_from": dataset.get("Data collected from", "Unknown location").strip(),
        }

        # If the dataset contains 'Derived from', include that field if it's available
        derived_from = dataset.get("Derived from")
        if derived_from:
            dataset_info["derived_from"] = derived_from.strip()

        # Append the dataset information to the dataset list
        dataset_list.append(dataset_info)

    return dataset_list

# Load dataset descriptions
dataset_file_path = "/mnt/data1/raiyan/Mammo-Find/dataset.json"  # Change this if needed
datasets = load_datasets_from_json(dataset_file_path)

In [7]:
datasets

[{'name': 'EMBED',
  'information': 'Stands for EMory BrEast imaging Dataset. This dataset contains 3,383,659 screening and diagnostic mammogram images from 115,910 patients. Among these, 20% of the total 2D and C-view dataset is available for research use. This 20% contains Total 480,606 dicom images, Total 676,009 png images (20%) and Total 34,004 spot magnified images. It also has 4 files of clinical data and metadata.',
  'can_be_used_for': 'Breast_Cancer_Detection, Breast_Cancer_Risk_Prediction, Mammographic_Report_Generation, Breast_Cancer_Type_Classification, Breast_Tumor_Classification, Tumor_Localization, Breast_Density_Estimation, Synthetic_Data_Generation',
  'is_data_available': 'Available_upon_signing_agreement.',
  'data_link': 'Available at https://aws.amazon.com/marketplace/pp/prodview-unw4li5rkivs2#overview',
  'associated_article': 'The EMory BrEast imaging Dataset (EMBED): A Racially Diverse, Granular Dataset of 3.4 Million Screening and Diagnostic Mammographic Image

In [8]:
# ==============================
# 2. EMBEDDING & FAISS INDEXING
# ==============================

# Extract all dataset fields and concatenate them into a single string for each dataset
documents = []

for dataset in datasets:
    # Concatenate all available fields as a string (excluding None values)
    dataset_info = []
    
    # Loop over all keys in the dataset
    for key, value in dataset.items():
        if value:  # Only add non-empty values
            dataset_info.append(f"{key}: {value}")
    
    # Join all fields into a single string and add it to the documents list
    documents.append("\n".join(dataset_info))

# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the documents (dataset entries)
document_embeddings = embedding_model.encode(documents, convert_to_numpy=True)

# Initialize FAISS index
dimension = document_embeddings.shape[1]  
index = faiss.IndexFlatL2(dimension)  
index.add(np.array(document_embeddings))

# Document Retrieval
def retrieve_documents(query, top_k=15):
    """Retrieves the top-k most relevant documents using FAISS."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the most relevant documents based on the indices from FAISS
    retrieved_docs = [documents[i] for i in indices[0]]
    
    # Optionally, you can return the full dataset details (name + description) if needed
    result = "\n\n".join(retrieved_docs)
    
    return result

In [9]:
documents

['name: EMBED\ninformation: Stands for EMory BrEast imaging Dataset. This dataset contains 3,383,659 screening and diagnostic mammogram images from 115,910 patients. Among these, 20% of the total 2D and C-view dataset is available for research use. This 20% contains Total 480,606 dicom images, Total 676,009 png images (20%) and Total 34,004 spot magnified images. It also has 4 files of clinical data and metadata.\ncan_be_used_for: Breast_Cancer_Detection, Breast_Cancer_Risk_Prediction, Mammographic_Report_Generation, Breast_Cancer_Type_Classification, Breast_Tumor_Classification, Tumor_Localization, Breast_Density_Estimation, Synthetic_Data_Generation\nis_data_available: Available_upon_signing_agreement.\ndata_link: Available at https://aws.amazon.com/marketplace/pp/prodview-unw4li5rkivs2#overview\nassociated_article: The EMory BrEast imaging Dataset (EMBED): A Racially Diverse, Granular Dataset of 3.4 Million Screening and Diagnostic Mammographic Images\ndata_article_published_on: 202

In [11]:
print(documents[0])

name: EMBED
information: Stands for EMory BrEast imaging Dataset. This dataset contains 3,383,659 screening and diagnostic mammogram images from 115,910 patients. Among these, 20% of the total 2D and C-view dataset is available for research use. This 20% contains Total 480,606 dicom images, Total 676,009 png images (20%) and Total 34,004 spot magnified images. It also has 4 files of clinical data and metadata.
can_be_used_for: Breast_Cancer_Detection, Breast_Cancer_Risk_Prediction, Mammographic_Report_Generation, Breast_Cancer_Type_Classification, Breast_Tumor_Classification, Tumor_Localization, Breast_Density_Estimation, Synthetic_Data_Generation
is_data_available: Available_upon_signing_agreement.
data_link: Available at https://aws.amazon.com/marketplace/pp/prodview-unw4li5rkivs2#overview
associated_article: The EMory BrEast imaging Dataset (EMBED): A Racially Diverse, Granular Dataset of 3.4 Million Screening and Diagnostic Mammographic Images
data_article_published_on: 2023
articl

In [1]:
import os
import numpy as np
import json
import faiss
import logging
import argparse
from flask import Flask, request, jsonify
from flask_cors import CORS
from sentence_transformers import SentenceTransformer
from langchain_ollama import OllamaLLM
from langchain_core.prompts import ChatPromptTemplate

# ==============================
# 1. LOAD AND PROCESS dataset.json
# ==============================
def load_datasets_from_json(file_path):
    """Reads dataset descriptions from a JSON file with variable structures and stores them in a list."""
    with open(file_path, "r", encoding="utf-8") as file:
        datasets = json.load(file)

    dataset_list = []

    for dataset in datasets:
        # Extracting the 'Name' field, assuming it's the identifier for the dataset
        dataset_name = dataset.get("Name", "Unknown Name").strip()  # Default to "Unknown Name" if not present

        # Extracting other fields if available
        dataset_info = {
            "name": dataset_name,
            "information": dataset.get("Information", "No information available").strip(),
            "can_be_used_for": dataset.get("Can be used for", "No usage information").strip(),
            "is_data_available": dataset.get("Is Data Available?", "Unknown availability").strip(),
            "data_link": dataset.get("Data Link", "No link provided").strip(),
            "associated_article": dataset.get("Associated Article", "No associated article").strip(),
            "data_article_published_on": dataset.get("Data/Article published on", "Unknown date").strip(),
            "article_available_at": dataset.get("Article available at", "No article link provided").strip(),
            "types_of_data": dataset.get("Types of data in dataset", "No data types specified").strip(),
            "types_of_files": dataset.get("Types of files in dataset", "No file types specified").strip(),
            "data_collected_from": dataset.get("Data collected from", "Unknown location").strip(),
        }

        # If the dataset contains 'Derived from', include that field if it's available
        derived_from = dataset.get("Derived from")
        if derived_from:
            dataset_info["derived_from"] = derived_from.strip()

        # Append the dataset information to the dataset list
        dataset_list.append(dataset_info)

    return dataset_list

# Load dataset descriptions
dataset_file_path = "/mnt/data1/raiyan/Mammo-Find/dataset.json"  # Change this if needed
datasets = load_datasets_from_json(dataset_file_path)

# ==============================
# 2. EMBEDDING & FAISS INDEXING
# ==============================

# Extract all dataset fields and concatenate them into a single string for each dataset
documents = []

for dataset in datasets:
    # Concatenate all available fields as a string (excluding None values)
    dataset_info = []
    
    # Loop over all keys in the dataset
    for key, value in dataset.items():
        if value:  # Only add non-empty values
            dataset_info.append(f"{key}: {value}")
    
    # Join all fields into a single string and add it to the documents list
    documents.append("\n".join(dataset_info))

# Load the embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for the documents (dataset entries)
document_embeddings = embedding_model.encode(documents, convert_to_numpy=True)

  from .autonotebook import tqdm as notebook_tqdm


In [30]:
# Initialize FAISS index
dimension = document_embeddings.shape[1]  
index = faiss.IndexFlatL2(dimension)  
index.add(np.array(document_embeddings))

# Document Retrieval
def retrieve_documents(query, top_k):
    """Retrieves the top-k most relevant documents using FAISS."""
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    distances, indices = index.search(query_embedding, top_k)

    # Retrieve the most relevant documents based on the indices from FAISS
    retrieved_docs = [documents[i] for i in indices[0]]
    
       # Flatten the list if necessary
    flat_retrieved_docs = [item for sublist in retrieved_docs for item in sublist]
    return flat_retrieved_docs

In [25]:
# Load the dataset
df = pd.read_excel("Mammography_Dataset.xlsx")

# Extract all the columns 
documents = df.values.astype(str).tolist()  # Converts all columns to a list of strings

In [27]:
# Generate embeddings for the documents (dataset entries)
document_embeddings = embedding_model.encode(documents, convert_to_numpy=True)

In [28]:
document_embeddings

array([[ 0.08011394, -0.0805012 , -0.08694214, ..., -0.0585218 ,
        -0.06002836, -0.0234236 ],
       [ 0.07053827,  0.0040918 ,  0.00302206, ..., -0.01392684,
        -0.00061018, -0.0499643 ],
       [ 0.02994247, -0.02075702, -0.05443005, ..., -0.07054365,
         0.01951572, -0.00442925],
       ...,
       [ 0.0015144 ,  0.0180732 , -0.02235634, ..., -0.078903  ,
         0.00123597,  0.04131344],
       [-0.00195684,  0.01090793, -0.01668284, ..., -0.05575781,
         0.04531072,  0.06609486],
       [ 0.0562076 ,  0.03593158, -0.08323912, ..., -0.06491718,
         0.04104049,  0.03670458]], dtype=float32)

In [31]:
result=retrieve_documents("Which datasets available upon signing agreements?", 5)
print(result)

['BancoWeb', 'Contains 1400 images from 320 cases.', 'Breast_Cancer_Detection', 'Not available', 'Online Mammographic Images Database for Development and Comparison of CAD Schemes', 'Need to sign agreement to access', 'Mammogram_Images', 'Tiff', 'Brazil', 'KAU-BCMD', 'King Abdulaziz University Breast Cancer Mammogram Dataset. First significant mammogram dataset from Saudi Arabia.\n1416 cases each with 2 views of both breasts making a total of 5664 images.\n\nAlso contains 205 ultrasound cases corresponding to a part of the mammogram cases, with 405 images as a total.', 'BIRADS_Category_Classification, Breast_Cancer_Detection, Tumor_Localization, Breast_Tumor_Classification', 'Available at https://www.kaggle.com/datasets/asmaasaad/king-abdulaziz-university-mammogram-dataset', 'King Abdulaziz University Breast Cancer Mammogram Dataset (KAU-BCMD)', 'Accessible', 'Mammogram_Images, Metadata', 'Jpg, Xlsx', 'Saudi Arabia', 'EMBED', 'EMory BrEast imaging Dataset.\n\n3 383 659  screening and d

In [3]:
datasets

[{'name': 'EMBED',
  'information': 'Stands for EMory BrEast imaging Dataset. This dataset contains 3,383,659 screening and diagnostic mammogram images from 115,910 patients. Among these, 20% of the total 2D and C-view dataset is available for research use. This 20% contains Total 480,606 dicom images, Total 676,009 png images (20%) and Total 34,004 spot magnified images. It also has 4 files of clinical data and metadata.',
  'can_be_used_for': 'Breast_Cancer_Detection, Breast_Cancer_Risk_Prediction, Mammographic_Report_Generation, Breast_Cancer_Type_Classification, Breast_Tumor_Classification, Tumor_Localization, Breast_Density_Estimation, Synthetic_Data_Generation',
  'is_data_available': 'Available_upon_signing_agreement.',
  'data_link': 'Available at https://aws.amazon.com/marketplace/pp/prodview-unw4li5rkivs2#overview',
  'associated_article': 'The EMory BrEast imaging Dataset (EMBED): A Racially Diverse, Granular Dataset of 3.4 Million Screening and Diagnostic Mammographic Image

In [4]:
document_embeddings

array([[ 0.01494646, -0.0606608 , -0.06950665, ..., -0.10183314,
         0.00125268,  0.02828436],
       [ 0.04311208,  0.02099564, -0.06199616, ..., -0.0621012 ,
        -0.01303593, -0.00075939],
       [ 0.03194933, -0.01275825, -0.07672867, ..., -0.08739328,
        -0.01505549,  0.04357722],
       ...,
       [ 0.0418328 , -0.01025611, -0.07366028, ..., -0.08934465,
        -0.05606401,  0.00693384],
       [ 0.01521577,  0.01108274, -0.07390521, ..., -0.0857215 ,
        -0.03035916,  0.0052451 ],
       [ 0.06226624, -0.02074655, -0.10097716, ..., -0.08753823,
        -0.01808339,  0.01074308]], dtype=float32)

In [1]:
import json

# Load JSON file
def load_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)
    return data

# Count the number of questions
def count_questions(data):
    return len(data)

# Example usage
file_path = 'question.json'  # Replace with your actual file path
data = load_json(file_path)
print(f'Total number of questions: {count_questions(data)}')


Total number of questions: 216
