In [2]:
import lux
import pandas as pd
import json
import pandas as pd

# Define the paths to your JSON files
json_files = [
    'test_processed_faculty_stats_data.json',
    'test_processed_crossref_article_stats_data.json',
    'test_processed_category_data.json',
    'test_processed_crossref_article_stats_obj_data.json'
]

# Initialize empty DataFrames
df_faculty_stats = pd.DataFrame()
df_article_stats = pd.DataFrame()
df_category_stats = pd.DataFrame()

for file in json_files:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # Process faculty stats data
    if 'faculty_stats' in str(data):
        # Flatten the JSON structure
        for category, details in data.items():
            faculty_stats = details.get('faculty_stats', {})
            df_faculty = pd.json_normalize(faculty_stats, sep='_').T.reset_index()
            df_faculty.columns = ['faculty_name'] + list(df_faculty.iloc[0, 1:])
            df_faculty = df_faculty[1:].assign(category=category)
            df_faculty_stats = pd.concat([df_faculty_stats, df_faculty], ignore_index=True)
    
    # Process article stats data (by category)
    elif 'article_citation_map' in str(data):
        for category, details in data.items():
            article_citation_map = details.get('article_citation_map', {})
            df_articles = pd.json_normalize(article_citation_map, sep='_').T.reset_index()
            df_articles.columns = ['doi'] + list(df_articles.iloc[0, 1:])
            df_articles = df_articles[1:].assign(category=category)
            df_article_stats = pd.concat([df_article_stats, df_articles], ignore_index=True)
    
    # Process article stats data (by DOI)
    elif any(key.startswith('10.') for key in data.keys()):
        df_articles_obj = pd.json_normalize(data, sep='_').T.reset_index()
        df_articles_obj.columns = ['doi'] + list(df_articles_obj.iloc[0, 1:])
        df_articles_obj = df_articles_obj[1:]
        df_article_stats = pd.concat([df_article_stats, df_articles_obj], ignore_index=True)
    
    # Process category data
    elif 'doi_list' in str(data):
        df_category = pd.json_normalize(data, sep='_').T.reset_index()
        df_category.columns = ['category'] + list(df_category.iloc[0, 1:])
        df_category = df_category[1:]
        df_category_stats = pd.concat([df_category_stats, df_category], ignore_index=True)

# Now, we need to merge all DataFrames into one comprehensive DataFrame

# Merge article stats with faculty stats on 'doi'
# First, ensure that 'doi' is present in both DataFrames
if not df_article_stats.empty and 'doi' in df_article_stats.columns:
    # Expand faculty members in df_article_stats
    df_article_stats['faculty_members'] = df_article_stats['faculty_members'].apply(lambda x: x if isinstance(x, list) else [])
    df_article_expanded = df_article_stats.explode('faculty_members').rename(columns={'faculty_members': 'faculty_name'})
else:
    df_article_expanded = pd.DataFrame()

# Merge with faculty stats
if not df_article_expanded.empty and not df_faculty_stats.empty:
    df_merged = pd.merge(df_article_expanded, df_faculty_stats, on='faculty_name', how='outer', suffixes=('_article', '_faculty'))
else:
    df_merged = pd.DataFrame()

# Merge with category stats
if not df_merged.empty and not df_category_stats.empty:
    df_final = pd.merge(df_merged, df_category_stats, left_on='category_article', right_on='category', how='outer', suffixes=('', '_category'))
else:
    df_final = df_merged if not df_merged.empty else df_category_stats

# Drop duplicates
df_final = df_final.drop_duplicates()

# Output the final DataFrame
print("Final Merged DataFrame:")
print(df_final.head())

TypeError: unhashable type: 'list'