In [5]:
import json
import os

def combine_json_files(root_folder, combined_json_file):
    """Merge the ``year``/``keywords`` records of the JSON files in a folder.

    Every ``*.json`` file directly inside ``root_folder`` (no recursion) is
    loaded, and each entry that has both a ``"year"`` and a ``"keywords"`` key
    is reduced to just those two fields. The reduced records are sorted by
    year and written to ``combined_json_file`` as indented JSON.

    Files are visited in sorted filename order so that the order of records
    within a year is reproducible across runs and platforms.

    Skipped inputs:
      * ``combined_for_param_tuning.json``;
      * ``combined_json_file`` itself when it lives inside ``root_folder``,
        so re-running the cell does not re-ingest (and duplicate) the
        previous output;
      * files that are not valid JSON (a message is printed);
      * entries that are not dicts or lack ``"year"``/``"keywords"``
        (a message is printed).

    Args:
        root_folder: Directory whose JSON files are merged.
        combined_json_file: Path of the JSON file to write.

    Returns:
        None. Progress and the total keyword count are printed.
    """
    # Normalised form of the output path, used to recognise a previous output
    # sitting among the inputs (case-insensitively where the OS is).
    output_path = os.path.normcase(os.path.abspath(combined_json_file))

    combined_data = []
    count = 0
    for filename in sorted(os.listdir(root_folder)):
        if not filename.endswith(".json") or filename == "combined_for_param_tuning.json":
            continue
        file_path = os.path.join(root_folder, filename)
        if os.path.normcase(os.path.abspath(file_path)) == output_path:
            # Result of an earlier run; merging it again would duplicate data.
            continue
        with open(file_path, 'r', encoding='utf-8') as file:
            print(f"Processing file: {file_path}")
            try:
                data = json.load(file)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON file: {filename}")
                continue
        for entry in data:
            # The isinstance guard keeps a stray scalar entry from raising on
            # the membership test below.
            if isinstance(entry, dict) and "year" in entry and "keywords" in entry:
                combined_data.append({
                    "year": entry["year"],
                    "keywords": entry["keywords"]
                })
                count += len(entry["keywords"])
            else:
                print(f"Skipping entry without 'year' or 'keywords': {entry}")
    print(f"Total keywords combined: {count}")
    # list.sort is stable, so records of the same year keep file order.
    combined_data.sort(key=lambda x: x["year"])

    with open(combined_json_file, 'w', encoding='utf-8') as outfile:
        json.dump(combined_data, outfile, indent=4)

# Example usage: merge the JSON files found in the current directory into
# "combined.json". Note that the output path is relative too, so the result is
# written into the same folder that is being scanned.
combine_json_files(".", "combined.json")

Processing file: .\2011-2018-fall.json
Processing file: .\2011-2018-spring.json
Processing file: .\2019-fall-keywords.json
Processing file: .\2019-spring-keywords.json
Processing file: .\2022-spring-keywords.json
Processing file: .\keywords-fall-2021.json
Processing file: .\keywords-fall-2022.json
Processing file: .\keywords-fall-2023.json
Processing file: .\keywords-fall-2024.json
Processing file: .\keywords-spring-2021.json
Processing file: .\keywords-spring-2023.json
Processing file: .\keywords-spring-2024.json
Processing file: .\keywords-spring-fall-2020.json
Total keywords combined: 559997


In [5]:
import json
# Read the merged records, then flatten every record's keyword list into one
# flat list (duplicates are kept).
with open("combined.json", 'r', encoding='utf-8') as combined_file:
    data = json.load(combined_file)

keywords = [keyword for record in data for keyword in record["keywords"]]

In [6]:
# Report how many keywords were collected overall and how many are distinct.
total_keywords = len(keywords)
unique_keywords = len(set(keywords))
print(f"Total Keywords = {total_keywords}")
print(f"Total Unique Keywords = {unique_keywords}")

Total Keywords = 559997
Total Unique Keywords = 299751


In [5]:
import json
import os

# Number of distinct keywords per year, for each year from 2011 through 2024.
with open("combined.json", 'r', encoding='utf-8') as combined_file:
    data = json.load(combined_file)

keywords = {
    year: len({
        keyword
        for record in data
        if record["year"] == year
        for keyword in record["keywords"]
    })
    for year in range(2011, 2025)
}

print(keywords)

{2011: 30173, 2012: 30084, 2013: 39489, 2014: 40820, 2015: 40915, 2016: 37426, 2017: 38293, 2018: 37697, 2019: 354, 2020: 13900, 2021: 24907, 2022: 14765, 2023: 30961, 2024: 30991}


In [None]:
import json

# Total number of keywords (duplicates included) per year, 2011 through 2024.
with open("combined.json", 'r', encoding='utf-8') as combined_file:
    data = json.load(combined_file)

keywords = {
    year: sum(len(record["keywords"]) for record in data if record["year"] == year)
    for year in range(2011, 2025)
}

print(keywords)

{2011: 39460, 2012: 38593, 2013: 52649, 2014: 54313, 2015: 54100, 2016: 48723, 2017: 50367, 2018: 48579, 2019: 20951, 2020: 20982, 2021: 31768, 2022: 17980, 2023: 40522, 2024: 41010}


In [7]:
# Collect the keywords of every 2011 record in 2011-2018-fall.json, then print
# the total count followed by the distinct count.
with open("2011-2018-fall.json", 'r', encoding='utf-8') as fall_file:
    data = json.load(fall_file)

res = [
    keyword
    for record in data
    if record["year"] == 2011
    for keyword in record["keywords"]
]
print(len(res))
print(len(set(res)))

26772
21246


In [8]:
# Collect the keywords of every 2011 record in 2011-2018-spring.json, then
# print the total count followed by the distinct count.
with open("2011-2018-spring.json", 'r', encoding='utf-8') as spring_file:
    data = json.load(spring_file)

res = [
    keyword
    for record in data
    if record["year"] == 2011
    for keyword in record["keywords"]
]
print(len(res))
print(len(set(res)))

12688
10649


In [None]:
import json
import os

def combine_json_files(root_folder):
    """Print how many clusters each JSON file in a folder contains.

    Every ``*.json`` file directly inside ``root_folder`` (no recursion),
    except ``combined_for_param_tuning.json``, is loaded and its top-level
    entries are counted; each entry is reported as one cluster. A per-file
    count is printed for every file that was parsed, followed by the grand
    total over all parsed files.

    Files that are not JSON inputs, or that fail to parse, get no per-file
    count line, so the report only lists files that were really counted.
    Files are visited in sorted filename order for reproducible output.

    NOTE: an earlier cell of this notebook defines a two-argument function
    with the same name; whichever definition was executed last is the one a
    call resolves to.

    Args:
        root_folder: Directory holding the clustered-keyword JSON files.

    Returns:
        None. All results are printed.
    """
    count = 0
    for filename in sorted(os.listdir(root_folder)):
        if not filename.endswith(".json") or filename == "combined_for_param_tuning.json":
            continue
        file_path = os.path.join(root_folder, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            print(f"Processing file: {file_path}")
            try:
                data = json.load(file)
            except json.JSONDecodeError:
                print(f"Skipping invalid JSON file: {filename}")
                continue
        # One top-level entry == one cluster.
        count_clusters = len(data)
        count += count_clusters
        print(f"Total clusters in {filename}: {count_clusters}")
    print(f"Total clusters combined: {count}")


# Example usage: report the entry counts of the JSON files stored under
# ./Clusters_2011_Fall (one printed line per file plus a grand total).
combine_json_files("./Clusters_2011_Fall")

Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh20_mindist0.025.json
Total clusters in clustered_keywords_ncomp20_nneigh20_mindist0.025.json: 690
Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh20_mindist0.05.json
Total clusters in clustered_keywords_ncomp20_nneigh20_mindist0.05.json: 737
Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh20_mindist0.1.json
Total clusters in clustered_keywords_ncomp20_nneigh20_mindist0.1.json: 829
Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh20_mindist0.5.json
Total clusters in clustered_keywords_ncomp20_nneigh20_mindist0.5.json: 1859
Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh25_mindist0.0.025.json
Total clusters in clustered_keywords_ncomp20_nneigh25_mindist0.0.025.json: 679
Processing file: ./Clusters_2011_Fall\clustered_keywords_ncomp20_nneigh25_mindist0.05.json
Total clusters in clustered_keywords_ncomp20_nneigh25_mindist0.05.jso

: 

In [None]:
import json
with open("timeline_cluster_results/clusters_dt0.5.json", 'r', encoding='utf-8') as clusters_file:
    clusters = json.load(clusters_file)

# Map the first keyword of each non-empty cluster to the sorted, de-duplicated
# list of that cluster's years. Clusters without keywords are left out.
keyword_to_years = {
    cluster["keywords"][0]: sorted(set(cluster.get("years", [])))
    for cluster in clusters.values()
    if cluster.get("keywords")
}

# Persist the mapping as JSON.
with open("cluster_representative_keywords.json", "w") as out_file:
    json.dump(keyword_to_years, out_file, indent=2)

In [11]:
import json
# Load the 2011 cluster labels and print the number of labels followed by the
# number of distinct labels.
with open("cluster_labels/2011_labeled_clusters.json", 'r', encoding='utf-8') as labels_file:
    clusters = json.load(labels_file)

labels = list(clusters.values())
print(len(labels))
print(len(set(labels)))

1708
1620


In [18]:
import json

# === Config ===
INPUT_CLUSTER_FILE = "clusters_dt0.5_with_year_counts.json"  # Replace with your actual file path
OUTPUT_FILE = "./timeline_cluster_results/representative_keyword_timeline.json"

# === Load clustered data ===
with open(INPUT_CLUSTER_FILE, "r", encoding="utf-8") as source:
    cluster_data = json.load(source)

# === Convert to representative keyword format ===
# Each cluster that has keywords is keyed by its first keyword; the value is
# the cluster's "years" field as stored (an empty dict when the field is absent).
representative_keywords = {
    cluster_info["keywords"][0]: cluster_info.get("years", {})
    for cluster_info in cluster_data.values()
    if cluster_info.get("keywords")
}

# === Save new JSON ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as sink:
    json.dump(representative_keywords, sink, indent=2, ensure_ascii=False)

print(f"✅ Representative keyword timeline saved to: {OUTPUT_FILE}")


✅ Representative keyword timeline saved to: ./timeline_cluster_results/representative_keyword_timeline.json
