In [10]:
import json
import pandas as pd
from rapidfuzz import process
from collections import defaultdict
from flatten_json import flatten
import os

In [8]:
""" create a dict that stored list of affiliation which chula cooperate with
    , based on subject area. stored it as json file"""

def flatten(dictionary, parent_key='', separator='_'):
    """
    Collapse nested dictionaries into one flat dictionary with joined keys.

    Nested dicts are expanded recursively and their keys are joined to the
    parent key with `separator`. Every other value (lists included) is stored
    unchanged under its joined key.

    Note: the import cell of this notebook also binds the name `flatten`
    (from flatten_json); whichever definition was executed last is the one
    in effect.

    Args:
        dictionary: The mapping to flatten. A non-dict value is returned
            wrapped as ``{parent_key: dictionary}``.
        parent_key: Prefix prepended to every key produced at this level.
        separator: String placed between the prefix and each key.

    Returns:
        dict: A single-level dictionary in depth-first insertion order.
    """
    # A non-dict input cannot be walked, so hand it back under the given prefix
    if not isinstance(dictionary, dict):
        return {parent_key: dictionary}

    flat = {}
    for key, value in dictionary.items():
        # Top-level keys are kept verbatim; deeper keys get the prefix
        path = key if not parent_key else f"{parent_key}{separator}{key}"
        if isinstance(value, dict):
            flat.update(flatten(value, path, separator=separator))
        else:
            flat[path] = value
    return flat

def _as_list(value):
    """Return `value` as a list: None -> [], a list -> itself, anything else -> [value]."""
    if value is None:
        return []
    if isinstance(value, list):
        return value
    return [value]


def process_affiliation(file_path):
    """
    Extract the affiliation names listed in one abstract JSON file, excluding Chula.

    The file is expected to contain the path
    ``abstracts-retrieval-response -> item -> bibrecord -> head``; the
    ``author-group`` entry under ``head`` is then walked. ``author-group``,
    ``affiliation`` and ``organization`` may each be a single object or a list
    of objects; both shapes are handled. The structure is read directly rather
    than through `flatten`, so the result does not depend on which `flatten`
    definition is currently bound in the notebook.

    Args:
        file_path: Path to the JSON file to read (decoded as UTF-8).

    Returns:
        list[str]: One entry per affiliation, built by joining its organization
        names with ", ". Affiliations whose joined name contains "Chula", or
        that have no usable organization name, are skipped. Returns an empty
        list when ``author-group`` is absent.

    Raises:
        KeyError: If the path down to ``head`` is missing from the file.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    head = data['abstracts-retrieval-response']['item']['bibrecord']['head']
    all_affiliation = []

    for group in _as_list(head.get('author-group')):
        if not isinstance(group, dict):
            continue
        for affiliation in _as_list(group.get('affiliation')):
            if not isinstance(affiliation, dict):
                continue

            affiliation_name = []
            for org in _as_list(affiliation.get('organization')):
                # Organizations appear either as {'$': name} objects or as bare strings
                if isinstance(org, dict):
                    org = org.get('$')
                if isinstance(org, str) and org:
                    affiliation_name.append(org)

            final_name = ", ".join(affiliation_name)
            # Skip empty names and anything belonging to Chula itself
            if final_name and "Chula" not in final_name:
                all_affiliation.append(final_name)

    return all_affiliation

def process_all_file(base_dir):
    """
    Walk `base_dir` and collect, per subject area, the unique affiliations found.

    Every sub-directory of `base_dir` is scanned for ``.json`` files. For each
    file that yields at least one affiliation from `process_affiliation`, the
    ``@abbrev`` of every entry under
    ``abstracts-retrieval-response -> subject-areas -> subject-area`` is read
    and the file's affiliations are added to that area.

    Args:
        base_dir: Directory whose immediate sub-directories hold the JSON files.

    Returns:
        defaultdict(list): Maps each subject-area abbreviation to a sorted list
        of unique affiliation names.
    """
    # Accumulate into sets so de-duplication happens once per insert instead of
    # rebuilding a list from a set for every (file, area) pair.
    unique_by_area = defaultdict(set)

    # Sorted traversal keeps runs reproducible; os.listdir order is arbitrary
    for year_folder in sorted(os.listdir(base_dir)):
        year_path = os.path.join(base_dir, year_folder)
        if not os.path.isdir(year_path):
            continue

        for file_name in sorted(os.listdir(year_path)):
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(year_path, file_name)

            affiliation_list = process_affiliation(file_path)
            if not affiliation_list:
                continue

            # Extract subject areas from JSON data
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            response = data.get('abstracts-retrieval-response') or {}
            subject_areas = response.get('subject-areas') or {}
            area_list = subject_areas.get('subject-area') or []
            # A lone subject area may be stored as an object rather than a list
            if isinstance(area_list, dict):
                area_list = [area_list]

            for area in area_list:
                if not isinstance(area, dict):
                    continue
                area_abbrev = area.get('@abbrev')
                if area_abbrev:
                    unique_by_area[area_abbrev].update(affiliation_list)

    # Hand back the same container type as before: a defaultdict of lists
    combined_affiliation = defaultdict(list)
    for area_abbrev, names in unique_by_area.items():
        combined_affiliation[area_abbrev] = sorted(names)
    return combined_affiliation

# Base directory where project files are stored.
# The CHULA_DATABASE_DIR environment variable, when set, overrides the default
# so the notebook can run on machines with a different directory layout.
base_dir = os.environ.get(
    "CHULA_DATABASE_DIR",
    "/Users/im/Documents/Data Sci/Project/Project/chulaDatabase",
)

# Process all files and collect the affiliations seen for each subject area
affiliation_data = process_all_file(base_dir)

# Convert affiliation lists to sets for faster searching
affiliation_data_sets = {area: set(affiliations) for area, affiliations in affiliation_data.items()}

# Save to JSON (sets are not JSON-serialisable; sorted lists keep the file
# identical from one run to the next)
with open("affiliation_data.json", "w", encoding="utf-8") as json_file:
    json.dump({k: sorted(v) for k, v in affiliation_data_sets.items()}, json_file)

In [11]:
## Check whether Chula already cooperates with each recommended affiliation ##


# Read the saved area -> affiliations mapping back from disk
with open("affiliation_data.json", "r") as json_file:
    loaded_data = json.load(json_file)

# Area codes are upper-cased and affiliation names lower-cased so they line up
# with the normalised CSV columns below. The values remain lists.
loaded_data_sets = {
    area_code.upper(): [name.lower() for name in names]
    for area_code, names in loaded_data.items()
}

# Recommendations to be checked against the known affiliations
recommendation = pd.read_csv("top_recommendations.csv")

# Add case-normalised copies of the two lookup columns; the originals are kept
recommendation['affiliation_normalized'] = recommendation['Affiliation'].str.lower()
recommendation['area_normalized'] = recommendation['Area'].str.upper()

# Fuzzy lookup of one recommendation row against the affiliations stored for its area
def check_affiliation_in_area(row):
    """
    Report whether a row's affiliation fuzzily matches a stored one in its area.

    Reads the module-level `loaded_data_sets` mapping and uses rapidfuzz's
    `process.extractOne` with a score cutoff of 90.

    Args:
        row: A DataFrame row providing 'area_normalized' and
            'affiliation_normalized'.

    Returns:
        pd.Series: ``[True, matched_name]`` when a candidate in the row's area
        reaches the cutoff, otherwise ``[False, None]``.
    """
    area_code = row['area_normalized']
    query_name = row['affiliation_normalized']

    # No stored affiliations for this area, so there is nothing to compare against
    candidates = loaded_data_sets.get(area_code)
    if candidates is None:
        return pd.Series([False, None])

    # extractOne yields nothing when no candidate reaches the 90 cutoff
    best = process.extractOne(query_name, candidates, score_cutoff=90)
    if not best:
        return pd.Series([False, None])

    # First element of the result is the matched affiliation name
    return pd.Series([True, best[0]])

# Run the fuzzy lookup row by row and store its two results as new columns
recommendation[['exists_in_area', 'matched_affiliation']] = recommendation.apply(
    check_affiliation_in_area,
    axis=1,
)

# Keep the original labels plus the lookup results; the normalised helper
# columns are left out of this view
relevant_columns = [
    'Affiliation',
    'Area',
    'exists_in_area',
    'matched_affiliation',
]
recommendation_relevant = recommendation[relevant_columns]

# Display the resulting frame as the cell output
recommendation_relevant


Unnamed: 0,Affiliation,Area,exists_in_area,matched_affiliation
0,Jiangnan University,BIOC,True,"key laboratory of industrial biotechnology, mi..."
1,Poznan University of Medical Sciences,BIOC,True,"poznan university of medical sciences, departm..."
2,Beijing Forestry University,BIOC,False,
3,Jilin University,BIOC,True,"the first hospital of jilin university, labora..."
4,Lovely Professional University,BIOC,True,"department of forensic science, school of bioe..."
5,Jiangnan University,ENER,False,
6,Centre de Développement des Energies Renouvela...,ENER,False,
7,Beijing Forestry University,ENER,False,
8,Université de Batna 1,ENER,False,
9,Zhengzhou University,ENER,True,key laboratory of materials processing and mol...


In [12]:
# Keep only the recommendations for which no existing affiliation was matched.
# A boolean mask is used instead of groupby(...).get_group(False), which raises
# KeyError when every row matched; the mask yields an empty frame in that case.
no_match_mask = recommendation_relevant['exists_in_area'].eq(False)
false_group = recommendation_relevant.loc[no_match_mask]
false_group = false_group.drop(columns=["exists_in_area", "matched_affiliation"])
# Save the rows with False values to a new CSV
false_group.to_csv("recommendations_with_no_match.csv", index=False)

# Output the false group DataFrame (for inspection)
false_group

Unnamed: 0,Affiliation,Area
2,Beijing Forestry University,BIOC
5,Jiangnan University,ENER
6,Centre de Développement des Energies Renouvela...,ENER
7,Beijing Forestry University,ENER
8,Université de Batna 1,ENER
10,Southwest State University,ECON
11,Beijing Jiaotong University,ECON
12,Jiangnan University,ECON
13,Beijing Forestry University,ECON
14,Tianjin University of Science &amp; Technology,ECON
