In [None]:
import pandas as pd
import numpy as np

print("üöÄ Starting Data Integration (Final Version)...")

try:
    df_papers = pd.read_csv('CSV_Files/papers_all_years.csv')
    
    required_cols = ['title', 'journal', 'year', 'citedby_count', 'countries_str', 'subject_areas_str']
    
    available_cols = [c for c in required_cols if c in df_papers.columns]
    df_papers = df_papers[available_cols].dropna(subset=['journal'])
    
    print(f"   -> Chula Papers loaded: {len(df_papers)} rows")
    
except FileNotFoundError:
    print("‚ùå Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå papers_all_years.csv")
    exit()


try:
    df_sjr = pd.read_csv('CSV_Files\scimagojr 2023.csv', sep=';', quotechar='"', on_bad_lines='skip')
    df_sjr = df_sjr[['Title', 'SJR Best Quartile']]
    df_sjr['Title'] = df_sjr['Title'].str.replace('"', '', regex=False).str.strip()
    
    print(f"   -> SJR Global Data loaded: {len(df_sjr)} journals")
    
except FileNotFoundError:
    print("‚ùå Error: ‡πÑ‡∏°‡πà‡∏û‡∏ö‡πÑ‡∏ü‡∏•‡πå scimagojr 2023.csv")
    exit()


print("üîÑ Merging datasets...")

df_papers['journal_lower'] = df_papers['journal'].str.lower().str.strip()
df_sjr['title_lower'] = df_sjr['Title'].str.lower().str.strip()

merged_df = df_papers.merge(df_sjr, left_on='journal_lower', right_on='title_lower', how='left')

total_papers = len(df_papers)
found_match = merged_df['SJR Best Quartile'].notna().sum()
q1_papers = merged_df[merged_df['SJR Best Quartile'] == 'Q1'].shape[0]

print("\n" + "="*40)
print("üìä INTEGRATION REPORT")
print("="*40)
print(f"Total Papers (Chula):    {total_papers}")
print(f"Matched with SJR:        {found_match}")
print(f"Match Success Rate:      {found_match/total_papers*100:.2f}%")
print("-" * 40)
print(f"üèÜ Found Q1 Papers:      {q1_papers}")
print(f"ü•à Found Q2 Papers:      {merged_df[merged_df['SJR Best Quartile'] == 'Q2'].shape[0]}")
print(f"ü•â Found Q3 Papers:      {merged_df[merged_df['SJR Best Quartile'] == 'Q3'].shape[0]}")
print(f"üîª Found Q4 Papers:      {merged_df[merged_df['SJR Best Quartile'] == 'Q4'].shape[0]}")
print("="*40 + "\n")


merged_df.drop(columns=['journal_lower', 'title_lower', 'Title'], inplace=True, errors='ignore')

merged_df['is_Q1'] = merged_df['SJR Best Quartile'].apply(lambda x: 1 if x == 'Q1' else 0)

output_filename = 'chula_papers_with_quality.csv'
merged_df.to_csv(output_filename, index=False)
print(f"üíæ Saved integrated data to '{output_filename}' (with 'is_Q1' column)")

  df_sjr = pd.read_csv('CSV_Files\scimagojr 2023.csv', sep=';', quotechar='"', on_bad_lines='skip')


🚀 Starting Data Integration (Final Version)...
   -> Chula Papers loaded: 20216 rows
   -> SJR Global Data loaded: 31283 journals
🔄 Merging datasets...

📊 INTEGRATION REPORT
Total Papers (Chula):    20216
Matched with SJR:        18537
Match Success Rate:      91.69%
----------------------------------------
🏆 Found Q1 Papers:      9680
🥈 Found Q2 Papers:      4577
🥉 Found Q3 Papers:      2461
🔻 Found Q4 Papers:      1260

💾 Saved integrated data to 'chula_papers_with_quality.csv' (with 'is_Q1' column)
