# Scraping

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import undetected_chromedriver as uc
import time
from selenium import webdriver

# Scraping abstracts for years

def driver_setup():
    """Start an undetected-chromedriver Chrome session configured for scraping.

    Image loading is disabled through a Chrome preference, a set of heavy
    resource types is blocked through the DevTools protocol, and the page-load
    strategy is set to ``eager``.

    Returns:
        The started ``uc.Chrome`` driver. The caller is responsible for calling
        ``quit()`` on it.
    """
    options = uc.ChromeOptions()
    options.add_experimental_option("prefs", {
        "profile.managed_default_content_settings.images": 2  # block images
    })
    options.add_argument("--no-sandbox")
    options.add_argument("--disable-dev-shm-usage")
    options.add_argument("--disable-extensions")
    options.add_argument("--disable-gpu")
    # 'eager' makes driver.get() return once the DOM is ready. It is set on the
    # options object because that is what is actually handed to the driver; a
    # separately built capabilities dict would never reach it.
    options.page_load_strategy = "eager"

    # Launch the undetected driver with the options
    driver = uc.Chrome(options=options)
    driver.execute_cdp_cmd("Network.enable", {})
    driver.execute_cdp_cmd("Network.setBlockedURLs", {
        "urls": [
            "*.png", "*.jpg", "*.jpeg", "*.gif", "*.svg",  # Images
            "*.woff", "*.woff2", "*.ttf", "*.ico",  # Fonts
            "*.mp4", "*.webm", "*.avi", "*.mov", "*.mkv",  # Videos
            "*.json", "*.xml"  # Optional: Block large API responses if not required
        ]
    })
    # NOTE: no navigation has happened yet at this point, so this script only
    # touches <video> elements of whatever document the browser starts with.
    driver.execute_script("""
            var videos = document.querySelectorAll('video');
            videos.forEach(function(video) {
                video.autoplay = false;  // Disable autoplay
                video.pause();           // Pause video if it's playing
            });
        """)

    return driver


def article_scraper(driver, link):
    """Walk the paginated listing at *link* and append article titles to a file.

    For every result page the title of each article is appended to
    ``abstracts.txt``; the "next page" control is then clicked until it can no
    longer be found. Failures are reported on stdout (including the exception
    text) instead of being raised.

    Args:
        driver: Selenium driver, e.g. the one returned by ``driver_setup``.
        link: URL of the first listing page.
    """
    try:
        driver.get(link)
        name = driver.title.strip()
        print(f"Processing site: {name}")
        wait = WebDriverWait(driver, 5)  # Timeout after 5 seconds
        wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
        print("Page Loaded")

        show_clicks = 0  # Number of articles written so far

        while True:
            articles = []
            try:
                articles = wait.until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, "article.mg-result-item.px-2.py-3.border-top"
                                                                          ".border-bottom")))
                print(f"articles length = '{len(articles)}' ")

            except Exception as e:
                print(f"Failed to locate articles: {e}")

            for article in articles:
                try:
                    article_name = article.find_element(By.CSS_SELECTOR, "a.no-underline").text.strip()
                    # NOTE: abstract extraction is currently disabled; only the
                    # title is written. The disabled steps are kept for reference.
                    # show_abstract = WebDriverWait(article, 5).until(
                    #     EC.element_to_be_clickable((By.CSS_SELECTOR, "button.bg-transparent.p-0.my-3.text-secondary"
                    #                                                  ".border-0.block.cursor-pointer"))
                    # )
                    # driver.execute_script("arguments[0].click();", show_abstract)

                    # abstract_content = WebDriverWait(article, 5).until(
                    #     EC.visibility_of_element_located((By.CSS_SELECTOR, "div.mb-3"))
                    # )

                    with open("abstracts.txt", "a", encoding="utf-8") as file:  # "a" mode ensures continuous writing
                        file.write(f"Article: {article_name}\n")
                        # file.write(f"Abstract: {abstract_content.text.strip()}\n\n")  # Double line break for separation
                    # Count only after the write succeeded.
                    show_clicks += 1
                    print(f"Abstract saved for: {article_name}")

                except Exception as e:
                    # Report the cause; the remaining articles of this page are skipped.
                    print(f"ERROR OCCURED WHILE PARSING ARTICLES: {e}")
                    break

            print(f"\n articles passed = '{show_clicks}'")

            try:
                load_next_button = WebDriverWait(driver, 5).until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, "a.border-solid.relative.inline-flex.items-center"
                                                                 ".rounded-r-md.border.border-dividers.bg-white.px-2.py-2.font-medium"))
                )
                driver.execute_script("arguments[0].scrollIntoView(true);", load_next_button)
                driver.execute_script("arguments[0].click();", load_next_button)
                time.sleep(2)  # Give the next page time to render
            except Exception:
                # The first failure to find the control ends the crawl.
                print("'Load More' button not found or no more content to load.")
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                print("No more new content loaded after several attempts. Stopping.")
                break

    except Exception as e:
        print(f"Error fetching: {e}")

# The below is code for directly scraping keywords for 2019 as its abstract data was not available
# def article_scraper(driver, link):
#     try:
#         driver.get(link)
#         name = driver.title.strip()
#         print(f"Processing site: {name}")
#         wait = WebDriverWait(driver, 5)
#         wait.until(EC.presence_of_element_located((By.TAG_NAME, "body")))
#         print("Page Loaded")

#         while True:
#             articles = driver.find_elements(By.CSS_SELECTOR, "article.mg-result-item")
#             print(f"Found {len(articles)} articles")

#             for article in articles:
#                 try:
#                     keywords_container = article.find_elements(By.CSS_SELECTOR, "div.text-gray-light span a")
#                     keywords = [keyword.text.strip() for keyword in keywords_container]
#                     if keywords:
#                         with open("keywords.txt", "a", encoding="utf-8") as file:
#                             file.write(f"Keywords: {', '.join(keywords)}\n")
#                         print(f"Keywords saved: {', '.join(keywords)}")
#                 except Exception:
#                     print("Error occurred while extracting keywords")

#             try:
#                 load_next_button = WebDriverWait(driver, 5).until(
#                     EC.element_to_be_clickable((By.CSS_SELECTOR, "a.border-solid.relative.inline-flex.items-center.rounded-r-md"))
#                 )
#                 driver.execute_script("arguments[0].click();", load_next_button)
#                 time.sleep(2)
#             except Exception:
#                 print("'Load More' button not found or no more content to load.")
#                 break
#     except Exception as e:
#         print(f"Error fetching: {e}")

def main():
    """Scrape the 2022 MRS Spring Meeting listing and always close the browser."""
    link = (
        r"https://www.mrs.org/meetings-events/annual-meetings/archive/meeting/presentations/2022-mrs-spring-meeting"
        r"?page=1&categories=&symposium=&sessiontype=&topicalcluster=&sessiondate="
    )
    driver = driver_setup()
    try:
        article_scraper(driver, link)
    finally:
        # Runs even if scraping raised, so no browser process is left behind.
        driver.quit()
        print("Driver closed.")


if __name__ == "__main__":
    main()


Error fetching
Driver closed.


# Text Cleaning

In [None]:
import re

def clean_conference_text(text):
    """Join each run of consecutive non-blank lines into one line.

    Blank lines act as separators and are kept; runs of three or more
    newlines in the result are collapsed to exactly two.
    """
    output_rows = []
    pending = []  # stripped lines of the paragraph currently being collected

    for raw in text.split('\n'):
        piece = raw.strip()
        if piece:
            pending.append(piece)
            continue
        # Blank line: flush the collected paragraph, then keep the separator.
        if pending:
            output_rows.append(' '.join(pending))
            pending = []
        output_rows.append('')

    # Flush a paragraph that was not followed by a blank line.
    if pending:
        output_rows.append(' '.join(pending))

    return re.sub(r'\n{3,}', '\n\n', '\n'.join(output_rows))

def clean_text(text):
    """Remove session-chair lines and weekday/date session headers from *text*."""
    # Disabled steps, kept for reference:
    #   Step 1: replace single line breaks (keep double line breaks for paragraphs)
    #     text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    #   Step 2 (partially disabled): remove further session metadata
    #     text = re.sub(r"Final Program\s*–\s*\d{1,2}\.\d{1,2}\.\d{2,4}", "", text)
    #     text = re.sub(r"SESSION\s+[A-Z]+\d+\.\d+\s*:\s*.*", "", text)
    #   Step 3: remove extra spaces
    #     text = re.sub(r"\s+", " ", text).strip()

    # Active part of step 2: patterns are applied in this order.
    removal_patterns = (
        r"Session Chairs?:\s*.*",
        r"[A-Za-z]+day\s*(Morning|Afternoon|Evening),?\s*[A-Za-z]+\s+\d{1,2},\s*\d{4}",
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, "", text)
    return text

def merge_lines(text):
    """Remove line breaks that directly follow a hyphen or a comma.

    Horizontal whitespace on either side of the removed break is preserved.
    """
    def _rejoin(match):
        # Keep the punctuation and both whitespace runs; only the newlines go.
        return match.group(1) + match.group(2) + match.group(3)

    return re.sub(r'([-,])([^\S\n]*)\n+([^\S\n]*)', _rejoin, text, flags=re.MULTILINE)

def process_text(text):
    """Regroup *text* into paragraphs that each start at a time-stamped line.

    Lines starting with 'Acknowledgement', 'References' or a bracketed number
    such as '[1]' are dropped, as are blank lines. A line beginning with a
    time such as '10:30 AM' opens a new paragraph; every other line is
    appended to the current one, separated by a single space. Any text that
    precedes the first time-stamped line forms a paragraph of its own.

    Returns:
        The paragraphs joined by blank lines ('' when nothing remains).
    """
    # Remove lines that start with 'Acknowledgement', 'References' or '[1]'
    text = re.sub(r'(?m)^(Acknowledgement|References|\[\d+\]).*$', '', text)

    # Only AM/PM are valid meridiem markers (not e.g. 'MM' or 'PP').
    time_pattern = re.compile(r"^\d{1,2}:\d{2} [AP]M")

    paragraphs = []  # each paragraph is a list of its stripped lines
    for raw_line in text.strip().split("\n"):
        line = raw_line.strip()
        if not line:
            continue  # Skip blank lines

        # The "not paragraphs" case covers text before the first time stamp,
        # which must not be prefixed with a stray space.
        if time_pattern.match(line) or not paragraphs:
            paragraphs.append([line])
        else:
            paragraphs[-1].append(line)

    return "\n\n".join(" ".join(parts) for parts in paragraphs)



In [None]:
# Clean one raw abstracts dump and write the result to a new file.
input_path = "2020-mrs-fall-meeting-abstracts.txt"
output_path = "2020-spring-fall-final.txt"

with open(input_path, "r", encoding="utf-8") as file:
    raw_text = file.read()

# Alternative cleaning passes (disabled):
# cleaned_text = clean_conference_text(raw_text)
# cleaned_text = clean_text(raw_text)
cleaned_text = process_text(raw_text)
# cleaned_text = merge_lines(cleaned_text)
# Save the cleaned text
with open(output_path, "w", encoding="utf-8") as file:
    file.write(cleaned_text)

# Report the file that was actually written.
print(f"Text cleaning complete. Saved as '{output_path}'.")

# Keywords Extraction

In [None]:
import re
import json
from krutrim_cloud import KrutrimCloud

# Initialize API client (shared by extract_keywords below)
client = KrutrimCloud(api_key='')  # use your api key

# File paths
# Inputs: one text file per season, read line by line.
fall_abstracts_path = 'Fall_Abstracts(2011-2018).txt'
spring_abstracts_path = 'Spring_Abstracts(2011-2018).txt'
# Output: final list of {year, country, keywords, line_number} records.
output_json_path = 'abstracts_with_5_keywords.json'
# Intermediate results, rewritten periodically while processing.
checkpoint_path = 'processed_abstracts_checkpoint.json'

# Function to extract keywords
def extract_keywords(abstract):
    """Ask the chat model for five keywords describing *abstract*.

    Returns:
        The raw text of the model's reply, or ``None`` when the request
        fails for any reason (the error is printed).
    """
    system_prompt = "You are an expert keyword extractor specialized in materials science and engineering. Your task is to analyze abstracts from the Materials Research Society (MRS) conferences and extract exactly 5 concise and specific keywords or phrases from each abstract. These keywords should: 1. Reflect the core topics, materials, techniques, or methods discussed in the abstract. 2. Be specific to the research focus of the abstract (e.g., 'perovskite solar cells' instead of 'solar cells'). 3. Avoid overly generic terms (e.g., 'materials science' or 'analysis'). 4. Be formatted as a comma-separated list without any additional text or explanation."
    user_prompt = "Abstract: " + abstract + " Extract exactly 5 concise and specific keywords or phrases that capture the core topics, techniques, materials, or methods discussed in this abstract. Ensure the keywords are relevant, specific, and formatted as a comma-separated list without any additional text or explanation. Output format: Keyword1, Keyword2, Keyword3, Keyword4, Keyword5"
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ]
    try:
        completion = client.chat.completions.create(
            model="Llama-3.3-70B-Instruct",
            messages=conversation,
        )
        answer = completion.choices[0].message.content  # type: ignore
    except Exception as exc:
        print(f"API Error: {exc}")
        return None
    return answer

# Process a single abstract
def process_abstract(line, current_year):
    """Build the keyword record for one abstract line.

    Args:
        line: Full text line containing the "Show Abstract" marker.
        current_year: Year to store in the record.

    Returns:
        A dict with ``year``, ``country`` and ``keywords`` (list of str), or
        ``None`` when keyword extraction fails or yields no keywords.
    """
    try:
        # Extract country as the last word before "Show Abstract"
        # NOTE(review): for multi-word countries this keeps only the final
        # word (e.g. "States") — confirm this is acceptable downstream.
        pre_abstract_text = line.split("Show Abstract")[0]
        country_match = re.search(r"(\b\w+)$", pre_abstract_text.strip())
        country = country_match.group(1) if country_match else "Unknown Country"

        # Extract keywords
        keywords = extract_keywords(line)
        if not keywords:
            print(f"Failed to extract keywords for line: {line}")
            return None

        # Split on a comma followed by any whitespace (including a newline),
        # so keywords with an internal comma and no space are left intact, and
        # drop the empty entries a trailing separator would otherwise create.
        keyword_list = [part.strip() for part in re.split(r",\s+", keywords.strip())]
        keyword_list = [keyword for keyword in keyword_list if keyword]
        if not keyword_list:
            print(f"Failed to extract keywords for line: {line}")
            return None

        return {
            "year": current_year,
            "country": country,
            "keywords": keyword_list,
        }
    except Exception as e:
        print(f"Error processing line: {line}\n{e}")
        return None

# Process the file
def process_abstract_file(file_path, checkpoint=None):
    """Extract keyword records for every abstract line of one file.

    A line matching "<year> Symposiums" sets the year for the lines that
    follow; every later line containing "Show Abstract" is sent through
    ``process_abstract``. Each record stores its ``line_number`` and
    ``source_file`` so an interrupted run can be resumed.

    Args:
        file_path: Text file to read.
        checkpoint: Optional list of records from an earlier run (possibly of
            other files). It is not modified.

    Returns:
        A new list holding the checkpoint records followed by the records
        produced from this file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()

    current_year = None
    # Work on a copy so the caller's checkpoint list is left untouched.
    processed_data = list(checkpoint) if checkpoint else []
    # Line numbers restart in every file, so only records of THIS file count
    # as done. Records without "source_file" (written before that key
    # existed) cannot be attributed and are treated as matching any file.
    processed_lines = {
        item["line_number"]
        for item in processed_data
        if "line_number" in item and item.get("source_file", file_path) == file_path
    }

    year_pattern = re.compile(r"(\d{4}) Symposiums")

    for line_number, line in enumerate(content):
        # Detect the year first so resuming never leaves a stale year behind.
        year_match = year_pattern.match(line.strip())
        if year_match:
            current_year = int(year_match.group(1))
            continue

        if line_number in processed_lines:
            continue

        # Process abstracts
        if "Show Abstract" in line and current_year:
            print(f"Processing line {line_number} for year {current_year}")
            result = process_abstract(line, current_year)
            if result:
                result["line_number"] = line_number
                result["source_file"] = file_path
                processed_data.append(result)

                # Save checkpoint every 100 records; only when a record was
                # added, so failures do not rewrite an unchanged file.
                if len(processed_data) % 100 == 0:
                    with open(checkpoint_path, 'w') as checkpoint_file:
                        json.dump(processed_data, checkpoint_file, indent=4)

    return processed_data

# Load checkpoint (records saved by an earlier, interrupted run; empty on a first run)
try:
    with open(checkpoint_path, 'r') as checkpoint_file:
        checkpoint_data = json.load(checkpoint_file)
except FileNotFoundError:
    checkpoint_data = []

# Process files
# The fall results are passed on as the starting point for the spring file,
# so spring_data ends up holding the records of both files.
fall_data = process_abstract_file(fall_abstracts_path, checkpoint_data)
spring_data = process_abstract_file(spring_abstracts_path, fall_data)

# Save final results
with open(output_json_path, 'w') as json_file:
    json.dump(spring_data, json_file, indent=4)

print("Processing complete.")


# Combining the keyword JSON files of 4 different years into one file for UMAP + fastcluster parameter tuning.

In [None]:
import json
import os

def combine_json_files(root_folder, combined_json_file, years=(2011, 2015, 2020, 2024)):
    """Merge the keyword records of selected years from all JSON files in a folder.

    Every ``*.json`` file directly inside *root_folder* is read, except the
    output file itself (so re-running does not feed the previous result back
    in). Files that are not valid JSON or whose top level is not a list are
    skipped, as are entries that are not dicts with both 'year' and
    'keywords'. Files are visited in sorted name order so the output is
    reproducible.

    Args:
        root_folder: Folder to scan (not recursive).
        combined_json_file: Path of the JSON file to write.
        years: Years to keep; entries of other years are ignored silently.
    """
    output_abspath = os.path.abspath(combined_json_file)
    wanted_years = tuple(years)
    combined_data = []
    count = 0

    for filename in sorted(os.listdir(root_folder)):
        if not filename.endswith(".json"):
            continue
        file_path = os.path.join(root_folder, filename)
        if os.path.abspath(file_path) == output_abspath:
            continue  # never read our own previous output

        print(f"Processing file: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {filename}")
            continue
        if not isinstance(data, list):
            print(f"Skipping file without a top-level list: {filename}")
            continue

        for entry in data:
            if not isinstance(entry, dict) or "year" not in entry or "keywords" not in entry:
                print(f"Skipping entry without 'year' or 'keywords': {entry}")
                continue
            if entry["year"] not in wanted_years:
                continue  # valid entry, just not one of the requested years
            combined_data.append({
                "year": entry["year"],
                "keywords": entry["keywords"]
            })
            count += len(entry["keywords"])

    print(f"Total keywords combined: {count}")
    combined_data.sort(key=lambda x: x["year"])

    with open(combined_json_file, 'w', encoding='utf-8') as outfile:
        json.dump(combined_data, outfile, indent=4)

# Scan the current folder and write the merged subset used for parameter tuning.
combine_json_files(".", "combined_for_param_tuning.json")

# Generating embeddings

In [None]:
##### import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader

# Load JSON and extract keywords for a given year
def load_keywords_from_json(file_path):
    """Return the unique keywords of all entries in a combined JSON file.

    Duplicates are removed while keeping first-occurrence order, so the
    result is identical on every run. This matters because embeddings saved
    to disk are matched to keywords purely by position.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    keywords = []
    for entry in data:
        keywords.extend(entry['keywords'])
    # dict.fromkeys de-duplicates deterministically (set order varies between runs).
    return list(dict.fromkeys(keywords))

# Load Sentence-BERT model, on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Embed the tuning keywords in batches of 1024 strings.
keywords = load_keywords_from_json('combined_for_param_tuning.json')
dataloader = DataLoader(keywords, batch_size=1024, shuffle=False)
embeddings = []
for batch in dataloader:
    batch_embeddings = model.encode(batch, show_progress_bar=True)
    embeddings.append(batch_embeddings)

# Stack the per-batch arrays into one matrix; row i belongs to keywords[i]
# because the loader does not shuffle.
embeddings = np.vstack(embeddings)
np.save('embeddings.npy', embeddings)

# Parameter tuning (sample code)

In [None]:
import json
import umap
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score

# Grid search over UMAP settings and Ward-linkage distance thresholds.
# Results are appended to a CSV after every combination so an interrupted
# run can be resumed.
embeddings = np.load("embeddings.npy")
print(embeddings.shape)  # Should be (n, 384)

# Cluster labels are paired with keywords by position, so both must have the
# same length; a mismatch would silently mislabel every keyword.
if len(keywords) != embeddings.shape[0]:
    raise ValueError(
        f"{len(keywords)} keywords but {embeddings.shape[0]} embeddings; "
        "regenerate 'embeddings.npy' from the current keyword list."
    )


# Define expanded parameter combinations
n_components_range = range(10, 101, 10)  # 10, 20, 30, ..., 100
n_neighbors_range = range(10, 101, 10)
min_dist_values = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5, 0.65, 0.8, 0.99]
distance_thresholds = [1.0, 1.5, 2.0, 2.5, 3.0]

# File to track progress
progress_file = "clustering_progress.json"
results_file = "clustering_results.csv"

# Generate all parameter combinations
umap_params = []
for n_comp in n_components_range:
    for n_neigh in n_neighbors_range:
        for min_dist in min_dist_values:
            umap_params.append((n_comp, n_neigh, min_dist))

# Load existing results and progress if available
results = []
completed_params = set()

if os.path.exists(results_file):
    df_results = pd.read_csv(results_file)
    results = df_results.to_dict('records')

    # Extract already completed parameter combinations
    for row in results:
        param_key = (row["n_components"], row["n_neighbors"], row["min_dist"], row["distance_threshold"])
        completed_params.add(param_key)

    print(f"Loaded {len(results)} existing results from {results_file}")

if os.path.exists(progress_file):
    with open(progress_file, 'r') as f:
        progress_data = json.load(f)
        last_index = progress_data.get('last_index', 0)
        print(f"Resuming from parameter combination index {last_index}")
else:
    last_index = 0

# Process remaining parameter combinations
try:
    for param_idx, (n_comp, n_neigh, min_dist) in enumerate(umap_params[last_index:], start=last_index):
        # Do not refit UMAP when every threshold of this combination is done.
        if all((n_comp, n_neigh, min_dist, thresh) in completed_params for thresh in distance_thresholds):
            print(f"Skipping UMAP: n_components={n_comp}, n_neighbors={n_neigh}, min_dist={min_dist} (already processed)")
            continue

        print(f"Processing UMAP: n_components={n_comp}, n_neighbors={n_neigh}, min_dist={min_dist} [{param_idx+1}/{len(umap_params)}]")

        # Apply UMAP
        umap_reducer = umap.UMAP(
            n_components=n_comp,
            n_neighbors=n_neigh,
            min_dist=min_dist,
            metric='cosine',
            random_state=42
        )
        umap_embeddings = umap_reducer.fit_transform(embeddings)

        # The linkage depends only on the embedding, not on the threshold, so
        # it is built once and cut at each threshold below.
        linkage_matrix = fastcluster.linkage_vector(umap_embeddings, method='ward')

        for dist_thresh in distance_thresholds:
            # Skip if this combination has already been processed
            param_key = (n_comp, n_neigh, min_dist, dist_thresh)
            if param_key in completed_params:
                print(f"  - Skipping AgglomerativeClustering with distance_threshold={dist_thresh} (already processed)")
                continue

            print(f"  - Applying AgglomerativeClustering with distance_threshold={dist_thresh}")

            # Cut the Ward tree at this distance
            labels = fcluster(linkage_matrix, t=dist_thresh, criterion='distance')
            num_clusters = len(set(labels))

            # Silhouette needs more than one cluster and fewer clusters than samples
            if 1 < num_clusters < len(labels):
                try:
                    sil_score = silhouette_score(umap_embeddings, labels)
                except Exception as exc:
                    # Not a bare except: Ctrl-C must reach the handler below
                    # instead of being recorded as a failed score.
                    print(f"    - Silhouette score failed: {exc}")
                    sil_score = -1  # Error in calculation
            else:
                sil_score = -1  # Invalid case (one cluster, or every point its own cluster)

            # Save cluster assignments to JSON
            clustered_keywords = {}
            for keyword, cluster in zip(keywords, labels):
                clustered_keywords.setdefault(str(cluster), []).append(keyword)

            filename = f"clustered_keywords_ncomp{n_comp}_nneigh{n_neigh}_mindist{min_dist}_dthresh{dist_thresh}.json"
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(clustered_keywords, f, indent=4, ensure_ascii=False)
            print(f"    - Saved results to {filename}")

            # Store results in table
            results.append({
                "n_components": n_comp,
                "n_neighbors": n_neigh,
                "min_dist": min_dist,
                "distance_threshold": dist_thresh,
                "num_clusters": num_clusters,
                "silhouette_score": sil_score
            })

            # Save results incrementally
            df_results = pd.DataFrame(results)
            df_results.to_csv(results_file, index=False)

            # Update progress
            with open(progress_file, 'w') as f:
                json.dump({'last_index': param_idx}, f)

            completed_params.add(param_key)

except KeyboardInterrupt:
    print("\nProcess interrupted by user. Progress has been saved.")

print(f"\nClustering completed/paused. Results saved in '{results_file}'.")


# Data analysis

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_graphs(df):
    """Plot the parameter-sweep results held in *df*.

    Produces, in order: silhouette score against each parameter (only rows
    with a positive score), number of clusters against each parameter, and a
    correlation heatmap of all columns.

    Args:
        df: DataFrame with the columns 'n_components', 'n_neighbors',
            'min_dist', 'distance_threshold', 'num_clusters' and
            'silhouette_score'.
    """
    params = ['n_components', 'n_neighbors', 'min_dist', 'distance_threshold', 'num_clusters']

    # Filter once and take x and y from the same rows, instead of relying on
    # the plotting library to align two Series of different length by index.
    scored = df[df['silhouette_score'] > 0]

    # Plot silhouette_score vs other parameters
    for param in params:
        plt.figure(figsize=(8, 5))
        sns.scatterplot(x=scored[param], y=scored['silhouette_score'])
        plt.xlabel(param)
        plt.ylabel('Silhouette Score')
        plt.title(f'Silhouette Score vs {param}')
        plt.show()

    # Plot num_clusters vs other parameters
    for param in params[:-1]:  # Excluding num_clusters itself
        plt.figure(figsize=(8, 5))
        sns.scatterplot(x=df[param], y=df['num_clusters'])
        plt.xlabel(param)
        plt.ylabel('Number of Clusters')
        plt.title(f'Number of Clusters vs {param}')
        plt.show()

    # Plot correlation heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
    plt.title('Correlation Heatmap')
    plt.show()

# Load the sweep results written by the parameter-tuning step and plot them.
# NOTE: `pd` is not imported in this cell; it must already be in scope.
df_sorted = pd.read_csv("clustering_results.csv")
plot_graphs(df_sorted)

# Combining all keywords json files to apply umap and fastcluster

In [None]:
import json
import os

def combine_json_files(root_folder, combined_json_file):
    """Merge the keyword records of all years from the JSON files in a folder.

    Every ``*.json`` file directly inside *root_folder* is read, except
    'combined_for_param_tuning.json' and the output file itself (so
    re-running does not feed the previous result back in). Files that are not
    valid JSON or whose top level is not a list are skipped, as are entries
    that are not dicts with both 'year' and 'keywords'. Files are visited in
    sorted name order so the output is reproducible.

    Args:
        root_folder: Folder to scan (not recursive).
        combined_json_file: Path of the JSON file to write.
    """
    output_abspath = os.path.abspath(combined_json_file)
    combined_data = []
    count = 0

    for filename in sorted(os.listdir(root_folder)):
        if not filename.endswith(".json") or filename == "combined_for_param_tuning.json":
            continue
        file_path = os.path.join(root_folder, filename)
        if os.path.abspath(file_path) == output_abspath:
            continue  # never read our own previous output

        print(f"Processing file: {file_path}")
        try:
            with open(file_path, 'r', encoding='utf-8') as file:
                data = json.load(file)
        except json.JSONDecodeError:
            print(f"Skipping invalid JSON file: {filename}")
            continue
        if not isinstance(data, list):
            print(f"Skipping file without a top-level list: {filename}")
            continue

        for entry in data:
            if not isinstance(entry, dict) or "year" not in entry or "keywords" not in entry:
                print(f"Skipping entry without 'year' or 'keywords': {entry}")
                continue
            combined_data.append({
                "year": entry["year"],
                "keywords": entry["keywords"]
            })
            count += len(entry["keywords"])

    print(f"Total keywords combined: {count}")
    combined_data.sort(key=lambda x: x["year"])

    with open(combined_json_file, 'w', encoding='utf-8') as outfile:
        json.dump(combined_data, outfile, indent=4)

# Scan the current folder and write the merged records of all years.
combine_json_files(".", "combined.json")

# Applying Umap + Fastcluster

In [None]:
##### import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader

# Load JSON and extract keywords for a given year
def load_keywords_from_json(file_path):
    """Return the unique keywords of all entries in a combined JSON file.

    Duplicates are removed while keeping first-occurrence order, so the
    result is identical on every run. This matters because embeddings saved
    to disk are matched to keywords purely by position.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    keywords = []
    for entry in data:
        keywords.extend(entry['keywords'])
    # dict.fromkeys de-duplicates deterministically (set order varies between runs).
    return list(dict.fromkeys(keywords))

# Load Sentence-BERT model, on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Embed the keywords of all years in batches of 1024 strings.
keywords = load_keywords_from_json('combined.json')
dataloader = DataLoader(keywords, batch_size=1024, shuffle=False)
embeddings = []
for batch in dataloader:
    batch_embeddings = model.encode(batch, show_progress_bar=True)
    embeddings.append(batch_embeddings)

# Stack the per-batch arrays into one matrix; row i belongs to keywords[i]
# because the loader does not shuffle. This overwrites any earlier
# 'embeddings.npy'.
embeddings = np.vstack(embeddings)
np.save('embeddings.npy', embeddings)

In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
from cuml.manifold import UMAP as cumlUMAP
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score
from collections import defaultdict
import pandas as pd
import torch
import gc
import os
from itertools import product

# === Config ===
INPUT_JSON = "combined.json"
OUTPUT_DIR = "./cluster_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# UMAP and clustering params
N_COMPONENTS = [40,50]
N_NEIGHBORS = [10, 20]
MIN_DISTS = [0.01, 0.05, 0.2]
DIST_THRESHOLDS = [0.5, 1, 1.5]

# All (n_components, n_neighbors, min_dist) combinations of the grid above.
# The grid of the earlier tuning step must not be reused here: doing so would
# ignore this configuration and depend on variables left over from another cell.
umap_params = list(product(N_COMPONENTS, N_NEIGHBORS, MIN_DISTS))

# NOTE(review): results and cluster files are written to the working
# directory, not to OUTPUT_DIR, and this CSV name is the same one the
# parameter-tuning step uses — confirm that overwriting it is intended.
results_file = "clustering_results.csv"
results = []

for param_idx, (n_comp, n_neigh, min_dist) in enumerate(umap_params):
    print(f"Processing UMAP: n_components={n_comp}, n_neighbors={n_neigh}, min_dist={min_dist} [{param_idx+1}/{len(umap_params)}]")

    # Apply UMAP
    # NOTE(review): this cell imports cumlUMAP but uses `umap.UMAP`, which
    # must already be in scope from an earlier cell — confirm which is wanted.
    umap_reducer = umap.UMAP(
        n_components=n_comp,
        n_neighbors=n_neigh,
        min_dist=min_dist,
        metric='cosine',
        random_state=42
    )
    umap_embeddings = umap_reducer.fit_transform(embeddings)

    # The linkage depends only on the embedding, so build it once per UMAP fit.
    linkage_matrix = fastcluster.linkage_vector(umap_embeddings, method='ward')

    for dist_thresh in DIST_THRESHOLDS:
        labels = fcluster(linkage_matrix, t=dist_thresh, criterion='distance')
        num_clusters = len(set(labels))

        # Silhouette needs more than one cluster and fewer clusters than samples
        if 1 < num_clusters < len(labels):
            try:
                sil_score = silhouette_score(umap_embeddings, labels)
            except Exception as exc:
                # Not a bare except, so Ctrl-C still interrupts the run.
                print(f"    - Silhouette score failed: {exc}")
                sil_score = -1  # Error in calculation
        else:
            sil_score = -1  # Invalid case (one cluster, or every point its own cluster)

        # Save cluster assignments to JSON
        clustered_keywords = {}
        for keyword, cluster in zip(keywords, labels):
            clustered_keywords.setdefault(str(cluster), []).append(keyword)

        filename = f"clustered_keywords_ncomp{n_comp}_nneigh{n_neigh}_mindist{min_dist}_dthresh{dist_thresh}.json"
        with open(filename, "w", encoding="utf-8") as f:
            json.dump(clustered_keywords, f, indent=4, ensure_ascii=False)
        print(f"    - Saved results to {filename}")

        # Store results in table
        results.append({
            "n_components": n_comp,
            "n_neighbors": n_neigh,
            "min_dist": min_dist,
            "distance_threshold": dist_thresh,
            "num_clusters": num_clusters,
            "silhouette_score": sil_score
        })

        # Save results incrementally
        df_results = pd.DataFrame(results)
        df_results.to_csv(results_file, index=False)


print(f"\nClustering completed/paused. Results saved in '{results_file}'.")

### The results of clustering the combined keywords of all years were poor because of UMAP, so now UMAP is skipped and only fastcluster is applied, with dist_thresh=1.0

In [None]:
import json
import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader

# Set output directory
# Relative to the working directory (a leading "/" would point at the
# filesystem root); this is the folder the cluster-labelling step reads from.
OUTPUT_DIR = "./clustering_results_new"
results_file = os.path.join(OUTPUT_DIR, "clustering_results_without_umap.csv")

# Load JSON and extract keywords for a given year
def load_keywords_from_json(file_path, target_year):
    """Return the unique keywords of the entries whose year is *target_year*.

    Duplicates are removed while keeping first-occurrence order, so the
    result is identical on every run (the order of a plain ``set`` is not).
    An empty list is returned when no entry matches.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    keywords = []
    for entry in data:
        if entry['year'] == target_year:
            keywords.extend(entry['keywords'])
    return list(dict.fromkeys(keywords))

# Load Sentence-BERT model, on the GPU when one is available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# Make sure the target folder exists before anything is written to it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Start from an empty table so rows of earlier experiments are not mixed in.
results = []
dist_thresh = 1.0

# Cluster each year's keywords separately, directly on the sentence embeddings.
# NOTE(review): the labelling step later in this notebook looks for files
# named '{year}_fastclust_dt1.json' — confirm it matches the names used here.
for year in range(2011, 2025):
    keywords = load_keywords_from_json('combined.json', target_year=year)
    if len(keywords) < 2:
        # Nothing to cluster (and stacking zero batches would fail).
        print(f"Skipping {year}: need at least 2 keywords, found {len(keywords)}")
        continue

    dataloader = DataLoader(keywords, batch_size=1024, shuffle=False)
    embeddings = []
    for batch in dataloader:
        batch_embeddings = model.encode(batch, show_progress_bar=True)
        embeddings.append(batch_embeddings)

    embeddings = np.vstack(embeddings)
    print(f"  - Applying AgglomerativeClustering with distance_threshold={dist_thresh}")
    linkage_matrix = fastcluster.linkage_vector(embeddings, method="ward")
    labels = fcluster(linkage_matrix, t=dist_thresh, criterion="distance")
    num_clusters = len(set(labels))

    # Silhouette needs more than one cluster and fewer clusters than samples
    sil_score = silhouette_score(embeddings, labels) if 1 < num_clusters < len(labels) else -1

    # Save cluster assignments
    clustered_keywords = {}
    for keyword, cluster in zip(keywords, labels):
        clustered_keywords.setdefault(str(cluster), []).append(keyword)

    filename = f"{year}_fastclustr_dthresh{dist_thresh}.json"
    filepath = os.path.join(OUTPUT_DIR, filename)
    with open(filepath, "w", encoding="utf-8") as f:
        json.dump(clustered_keywords, f, indent=4, ensure_ascii=False)
    print(f"    - Saved results to {filepath}")

    results.append({
        "year": year,
        "distance_threshold": dist_thresh,
        "num_clusters": num_clusters,
        "silhouette_score": sil_score,
        "cluster_file": filename
    })

    # Save results incrementally
    pd.DataFrame(results).to_csv(results_file, index=False)

print(f"\nClustering completed. Results saved in '{results_file}'.")


# Cluster Labelling using LLM

In [None]:
import os
import json
from krutrim_cloud import KrutrimCloud
# Krutrim Cloud credentials: paste your own API key here before running.
api_key = ""

# Initialize the API client used by label_cluster_with_llm.
# (The original `api_key=api key` was a syntax error.)
client = KrutrimCloud(api_key=api_key)

# Directory containing the per-year clustered keyword JSON files.
INPUT_DIR = "./clustering_results_new"  # Adjust if needed
# Directory that receives one "<year>_labeled_clusters.json" per year.
OUTPUT_DIR = "./cluster_labels"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Years you have cluster files for
YEARS = list(range(2011, 2025))  # 2011 to 2024

# LLM model config
MODEL_NAME = "Llama-3.3-70B-Instruct"

# Function to get representative label from LLM
def label_cluster_with_llm(cluster_keywords):
    """Ask the LLM for one representative label for a keyword cluster.

    The keywords are joined into a comma-separated list and sent, together
    with a fixed system prompt, through the module-level ``client`` using
    the model named by ``MODEL_NAME``.

    Args:
        cluster_keywords: Iterable of keyword strings forming one cluster.

    Returns:
        The model's reply with surrounding whitespace removed, or the string
        "None" when the request (or reading its reply) raises any exception;
        the error is printed in that case.
    """
    system_prompt = (
        "You are an expert in materials science. Your task is to assign a single representative keyword or phrase "
        "to a given list of related materials science research keywords. Choose a concise, specific phrase that "
        "accurately summarizes the entire cluster. If the keywords are too unrelated or vary widely, return only: None.\n\n"
        "Rules:\n"
        "1. Choose a single short phrase or keyword (no more than 4-5 words).\n"
        "2. Make it specific and relevant (e.g., 'Mott-Schottky analysis', not 'materials characterization').\n"
        "3. Output only the label or 'None' with no additional explanation."
    )
    joined_keywords = ", ".join(cluster_keywords)
    user_prompt = f"Cluster: {joined_keywords}\n\nOutput a single label or 'None':"

    try:
        reply = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
        )
        # Reading the reply stays inside the try so a malformed response
        # also falls back to "None".
        return reply.choices[0].message.content.strip()
    except Exception as e:
        print(f"❌ API Error: {e}")
        return "None"

# Label every cluster of every available year and write one JSON file per year
# mapping cluster id -> label returned by label_cluster_with_llm.
for year in YEARS:
    # Adjust the filename format if your clustering output is named differently.
    cluster_file = os.path.join(INPUT_DIR, f"{year}_fastclust_dt1.json")

    if os.path.exists(cluster_file):
        print(f"\n📦 Processing clusters from: {year}")
        with open(cluster_file, 'r', encoding='utf-8') as src:
            clusters = json.load(src)

        # One LLM request per cluster, in the order the clusters appear in the file.
        labeled_clusters = {
            cluster_id: label_cluster_with_llm(members)
            for cluster_id, members in clusters.items()
        }

        # Save labeled output
        output_path = os.path.join(OUTPUT_DIR, f"{year}_labeled_clusters.json")
        with open(output_path, 'w', encoding='utf-8') as dst:
            json.dump(labeled_clusters, dst, indent=2, ensure_ascii=False)
    else:
        print(f"❌ Missing: {cluster_file}")

print("\n✅ Done labeling all clusters.")


# Building a combined JSON file of all cluster labels with the per-year keyword count of each cluster

In [None]:
import os
import json
from collections import defaultdict

# Directories holding the per-year cluster files and the per-year label files.
INPUT_DIR1 = "./clustering_results_new"
INPUT_DIR2 = "./cluster_labels"  # Adjust if needed
OUTPUT_JSON = "keyword_timeline.json"

# Years to process
YEARS = list(range(2011, 2025))

# String the labelling step stores for clusters that were rejected by the LLM
# or whose API call failed; such clusters must not become a timeline entry.
UNLABELED = "None"


def count_keywords_per_label(cluster_data, label_data):
    """Sum the number of keywords per label for one year's clusters.

    Args:
        cluster_data: Mapping of cluster id -> list of keywords.
        label_data: Mapping of cluster id -> label string.

    Returns:
        A dict mapping each label to the total number of keywords in the
        clusters carrying that label. Clusters with no entry in
        ``label_data`` or labelled with the "None" sentinel are ignored.
    """
    counts = defaultdict(int)
    for cluster_id, keywords in cluster_data.items():
        label = label_data.get(cluster_id)
        # Previously only a missing label was skipped, so every rejected
        # cluster was pooled under a bogus label literally named "None".
        if label is None or label == UNLABELED:
            continue
        counts[label] += len(keywords)
    return dict(counts)


# label -> {year (as string) -> keyword count}
label_year_counts = defaultdict(lambda: defaultdict(int))

for year in YEARS:
    cluster_path = os.path.join(INPUT_DIR1, f"{year}_fastclust_dt1.json")
    label_path = os.path.join(INPUT_DIR2, f"{year}_labeled_clusters.json")

    # Both files are needed to pair a cluster's keywords with its label.
    if not os.path.exists(cluster_path) or not os.path.exists(label_path):
        print(f"Skipping {year} (files not found)")
        continue

    with open(cluster_path, "r", encoding='utf-8') as f:
        cluster_data = json.load(f)

    with open(label_path, "r", encoding='utf-8') as f:
        label_data = json.load(f)

    for label, count in count_keywords_per_label(cluster_data, label_data).items():
        label_year_counts[label][str(year)] += count

# Convert defaultdict to normal dict for JSON output
final_result = {label: dict(years) for label, years in label_year_counts.items()}

# Save to JSON
with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
    json.dump(final_result, f, indent=2, ensure_ascii=False)

print(f"✅ Saved keyword label year-count data to {OUTPUT_JSON}")


# Clustering the combined labels file to generate three variants of the final keyword set

In [None]:
import json
import numpy as np
from sentence_transformers import SentenceTransformer
from torch.utils.data import DataLoader
import fastcluster
from scipy.cluster.hierarchy import fcluster
from sklearn.metrics import silhouette_score
from collections import defaultdict
import pandas as pd
import torch
import os

# === Settings ===
DIST_THRESHOLDS = [0.5, 1, 1.5]  # dendrogram cut-offs, one output variant each
INPUT_JSON = "keyword_timeline.json"
OUTPUT_DIR = "./"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === Sentence-embedding model (GPU when available) ===
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
model = SentenceTransformer("all-MiniLM-L6-v2", device=device)

# === Load the label -> {year: count} mapping produced by the timeline step ===
with open(INPUT_JSON, 'r', encoding='utf-8') as f:
    keyword_to_years = json.load(f)

# The mapping's keys are the keywords to embed, in file order.
keywords = list(keyword_to_years)
print(f"Loaded {len(keywords)} unified keywords")

# === Embed keywords in batches of 1024 and stack into one float32 matrix ===
print("Encoding keywords...")
dataloader = DataLoader(keywords, batch_size=1024, shuffle=False)
all_embeddings = [
    model.encode(batch, show_progress_bar=False)
    for batch in dataloader
]

embeddings = np.vstack(all_embeddings).astype(np.float32)

# The Ward dendrogram depends only on the embeddings, not on the cut-off, so
# it is built once and then cut at every threshold.
linkage_matrix = fastcluster.linkage_vector(embeddings, method='ward')

# One summary row (a flat dict) per threshold.
all_summary = []
for dist in DIST_THRESHOLDS:
    # === Cut the dendrogram at this distance ===
    print(f"\n🔧 Fastcluster(thresh={dist})")
    labels = fcluster(linkage_matrix, t=dist, criterion='distance')
    n_clusters = len(set(labels))

    # silhouette_score requires 2 <= n_clusters <= n_samples - 1 and raises
    # ValueError otherwise (e.g. a small threshold leaving every keyword in
    # its own cluster), so use -1 for both degenerate cases.
    if 1 < n_clusters < len(labels):
        silhouette = silhouette_score(embeddings, labels)
    else:
        silhouette = -1

    # === Group keywords by cluster id ===
    clustered_keywords = defaultdict(list)
    for kw, label in zip(keywords, labels):
        clustered_keywords[f"cluster_{label}"].append(kw)

    # Final output: cluster_id -> {"keywords": [...], "years": {year: count, ...}}
    clusters = {}
    for cluster_id, kws in clustered_keywords.items():
        # Add up the per-year counts of every keyword in the cluster.
        year_counts = defaultdict(int)
        for kw in kws:
            for year, count in keyword_to_years[kw].items():
                year_counts[year] += count

        clusters[cluster_id] = {
            "keywords": kws,
            "years": dict(sorted(year_counts.items()))
        }

    # Save cluster JSON
    filename_json = f"clusters_dt{dist}_with_year_counts.json"
    with open(os.path.join(OUTPUT_DIR, filename_json), "w", encoding="utf-8") as f:
        json.dump(clusters, f, indent=2, ensure_ascii=False)

    # Append the dict itself: wrapping it in a list made all_summary a list of
    # lists, which pandas turned into a single column of dict strings.
    all_summary.append({
        "distance_threshold": dist,
        "n_clusters": n_clusters,
        "silhouette_score": silhouette,
        "output_file": filename_json
    })

df = pd.DataFrame(all_summary)
df.to_csv(os.path.join(OUTPUT_DIR, "summary_results.csv"), index=False)

print("\n✅ Done! Cluster JSON with year counts saved in:", OUTPUT_DIR)


#  Labelling clusters with LLM 

In [None]:
import os
import json
from collections import defaultdict
from krutrim_cloud import KrutrimCloud

# === Settings ===
# Thresholds whose "clusters_dt<thresh>_with_year_counts.json" files get labelled.
THRESHOLDS = [0.5, 1, 1.5]
MODEL_NAME = "Llama-3.3-70B-Instruct"

# Labelled results are written here, one file per threshold.
OUTPUT_DIR = "./labeled_cluster_results"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# === API client (fill in your Krutrim Cloud API key) ===
client = KrutrimCloud(api_key='')

def label_cluster_with_llm(cluster_keywords):
    """Return a single LLM-chosen label summarising a cluster of keywords.

    Sends the comma-joined keywords plus a fixed instruction prompt to the
    chat endpoint of the module-level ``client`` (model ``MODEL_NAME``).

    Args:
        cluster_keywords: Iterable of keyword strings belonging to one cluster.

    Returns:
        The stripped text of the first completion choice. If the call or the
        reply handling raises any exception, the error is printed and the
        string "None" is returned instead.
    """
    keywords_text = ", ".join(cluster_keywords)

    # The system message is the task description followed by the output rules.
    task_description = (
        "You are an expert in materials science. Your task is to assign a single representative keyword or phrase "
        "to a given list of related materials science research keywords. Choose a concise, specific phrase that "
        "accurately summarizes the entire cluster. If the keywords are too unrelated or vary widely, return only: None.\n\n"
    )
    rules = (
        "Rules:\n"
        "1. Choose a single short phrase or keyword (no more than 4-5 words).\n"
        "2. Make it specific and relevant (e.g., 'Mott-Schottky analysis', not 'materials characterization').\n"
        "3. Output only the label or 'None' with no additional explanation."
    )
    system_message = {"role": "system", "content": task_description + rules}
    user_message = {
        "role": "user",
        "content": f"Cluster: {keywords_text}\n\nOutput a single label or 'None':",
    }
    messages = [system_message, user_message]

    try:
        response = client.chat.completions.create(model=MODEL_NAME, messages=messages)
        first_choice = response.choices[0]
        return first_choice.message.content.strip()
    except Exception as e:
        print(f"❌ API Error: {e}")
        return "None"

# === Label the clusters of every threshold file and merge counts by label ===
for thresh in THRESHOLDS:
    input_file = f"clusters_dt{thresh}_with_year_counts.json"
    with open(input_file, 'r', encoding='utf-8') as src:
        clusters = json.load(src)

    print(f"\n🔍 Processing {input_file}...")

    # label -> {year -> summed count}; clusters sharing a label are merged.
    labeled_output = defaultdict(lambda: defaultdict(int))

    for entry in clusters.values():
        members = entry["keywords"]
        yearly_counts = entry["years"]

        label = label_cluster_with_llm(members)
        # "None" marks a cluster the LLM rejected or whose request failed.
        if label != "None":
            for year, count in yearly_counts.items():
                labeled_output[label][str(year)] += count

    # Plain dicts with the years in ascending (string) order
    final_output = {}
    for name, per_year in labeled_output.items():
        final_output[name] = {year: per_year[year] for year in sorted(per_year)}

    # Save
    output_file = os.path.join(OUTPUT_DIR, f"labeled_clusters_dt{thresh}.json")
    with open(output_file, 'w', encoding='utf-8') as dst:
        json.dump(final_output, dst, indent=2, ensure_ascii=False)

    print(f"✅ Saved labeled results to: {output_file}")


# Saving precomputed embeddings for the three variants of the final keyword set

In [None]:
# === save_embeddings.py ===
import os
import json
import numpy as np
from sentence_transformers import SentenceTransformer

# Display name of each keyword-set variant -> its labelled-cluster JSON file.
FILE_MAP = {
    "Fine-grained (0.5 threshold, ~9000)": "labeled_clusters_dt0.5.json",
    "Moderate (1.0 threshold, ~3000)": "labeled_clusters_dt1.json",
    "Broad (1.5 threshold, ~1600)": "labeled_clusters_dt1.5.json"
}
DATA_DIR = "./labeled_cluster_results"
SAVE_DIR = "./precomputed_embeddings"

os.makedirs(SAVE_DIR, exist_ok=True)

model = SentenceTransformer("all-MiniLM-L6-v2")

for label, file in FILE_MAP.items():
    # The top-level keys of each labelled file are the final keywords.
    with open(os.path.join(DATA_DIR, file), "r", encoding="utf-8") as src:
        keywords = list(json.load(src))

    embeddings = model.encode(keywords, show_progress_bar=True)

    # File stem = the part of the display name before "(", lower-cased with
    # spaces turned into underscores (e.g. "Fine-grained (...)" -> "fine-grained").
    variant_name = label.partition("(")[0].strip()
    base_name = variant_name.replace(" ", "_").lower()

    embeddings_path = os.path.join(SAVE_DIR, f"{base_name}_embeddings.npy")
    keywords_path = os.path.join(SAVE_DIR, f"{base_name}_keywords.json")

    # The keyword list is saved next to the matrix so rows can be mapped back.
    np.save(embeddings_path, embeddings)
    with open(keywords_path, "w") as dst:
        json.dump(keywords, dst)

    print(f"Saved: {label}")
