In [74]:
import pandas as pd
from pybtex.database.input.bibtex import Parser
import re
import PyPDF2
import tabula

In [75]:
# Define relative file paths
pdf_path_abb = "../data/raw/Journal-Abbreviation.pdf"
if_path = "../data/raw/ImpactFactor2024.xlsx"
sheet_name = "2024最新完整版IF"

# Load Impact Factor data from the Excel file
if_2024 = pd.read_excel(if_path, sheet_name=sheet_name)

# Display first rows for initial verification
if_2024.head()


Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Category
0,CA-A CANCER JOURNAL FOR CLINICIANS,CA-CANCER J CLIN,0007-9235,1542-4863,503.1,297.0,ONCOLOGY|Q1|1/322
1,NATURE REVIEWS DRUG DISCOVERY,NAT REV DRUG DISCOV,1474-1776,1474-1784,122.7,114.9,PHARMACOLOGY & PHARMACY|Q1|1/354
2,LANCET,LANCET,0140-6736,1474-547X,98.4,106.9,"MEDICINE, GENERAL & INTERNAL|Q1|1/325"
3,NEW ENGLAND JOURNAL OF MEDICINE,NEW ENGL J MED,0028-4793,1533-4406,96.2,94.3,"MEDICINE, GENERAL & INTERNAL|Q1|2/325"
4,BMJ-British Medical Journal,BMJ-BRIT MED J,0959-535X,1756-1833,93.6,69.9,"MEDICINE, GENERAL & INTERNAL|Q1|3/325"


In [132]:
# Ensure the 'Category' column exists before splitting
if 'Category' in if_2024.columns:
    # Split the 'Category' column into three new columns
    if_2024[['Domain', 'Area', 'Rank']] = if_2024['Category'].str.split('|', expand=True)
else:
    print("Warning: 'Category' column not found in the DataFrame.")
# Display the original and newly created columns to verify the split
if_2024[['Category', 'Domain', 'Area', 'Rank']].head()
# Drop the original 'Category' column from the DataFrame
if_2024 = if_2024.drop(columns=['Category'], errors='ignore')

# Display the first few rows to confirm removal
if_2024.head()




KeyError: "['Category'] not in index"

In [None]:
# Define the path to your text file
text_file_path = "../data/raw/ABB.csv"
abb = pd.read_csv(text_file_path)

In [None]:
abb.head()

In [None]:
# Function to normalize journal names
def normalize_journal_name(name):
    if isinstance(name, str):  # Check if the value is a string
        # Convert to lowercase
        name = name.lower()
        # Remove 'the ' if it's at the start
        if name.startswith('the '):
            name = name[4:]
        # Replace hyphens with spaces or remove them
        name = name.replace('-', ' ')
        name = name.replace(':', ' ')
        name = re.sub(r'\s+', ' ', name)  # Replace multiple spaces with a single space
        # Replace '&' with 'and'
        name = name.replace('&', 'and')
        return name.strip()
    return name  # Return as-is if not a string

In [None]:
if_2024.head()

In [133]:
# Normalize journal names in the 'if_2024' DataFrame
if_2024['Normalized Journal'] = if_2024['Name'].apply(normalize_journal_name)

# Normalize journal names in the 'abb' DataFrame
abb['Normalized Journal'] = abb['Journal Name'].apply(normalize_journal_name)

# Quick verification of normalized results
print(if_2024[['Name', 'Normalized Journal']].head())
print(abb[['Journal Name', 'Normalized Journal']].head())

                                 Name                  Normalized Journal
0  CA-A CANCER JOURNAL FOR CLINICIANS  ca a cancer journal for clinicians
1       NATURE REVIEWS DRUG DISCOVERY       nature reviews drug discovery
2                              LANCET                              lancet
3     NEW ENGLAND JOURNAL OF MEDICINE     new england journal of medicine
4         BMJ-British Medical Journal         bmj british medical journal
                             Journal Name  \
0                           2D Materials    
1                               3 Biotech   
2  3D Printing and Additive Manufacturing   
3                 3D Printing in Medicine   
4              Astronomy and Astrophysics   

                       Normalized Journal  
0                            2d materials  
1                               3 biotech  
2  3d printing and additive manufacturing  
3                 3d printing in medicine  
4              astronomy and astrophysics  


In [134]:
# Define the new entries with Normalized Journal names and their IF values
new_records = pd.DataFrame({
    'Normalized Journal': [
        'materials science and engineering c',
        'applied surface science advances',
        'european biophysics journal',
        'nanotechnologies in russia'
    ],
    'JIF': [8.3, 6.2, 2.0, 0.626]
})
# Ensure new records match existing columns by adding minimal placeholders if necessary
new_records['Name'] = new_records['Normalized Journal']  # You may replace this with original names if available
# Append new records to the existing DataFrame
if_2024_updated = pd.concat([if_2024_clean, new_records], ignore_index=True)

# Verify the addition by checking the tail of the updated DataFrame
if_2024_updated.tail(10)

Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal
21796,Health Promotion Practice,HEALTH PROMOT PRACT,1524-8399,1552-6372,1.6,1.7,"PUBLIC, ENVIRONMENTAL & OCCUPATIONAL HEALTH",Q3,273/403,health promotion practice
21797,High Energy Density Physics,HIGH ENERG DENS PHYS,1574-1818,1878-0563,1.6,1.3,"PHYSICS, FLUIDS & PLASMAS",Q3,26/40,high energy density physics
21798,International Journal on Digital Libraries,INT J DIGIT LIBRARIE,1432-5012,1432-1300,1.6,1.7,INFORMATION SCIENCE & LIBRARY SCIENCE,Q2,65/160,international journal on digital libraries
21799,materials science and engineering c,,,,8.3,,,,,materials science and engineering c
21800,european biophysics journal,,,,2.0,,,,,european biophysics journal
21801,nanotechnologies in russia,,,,0.626,,,,,nanotechnologies in russia
21802,materials science and engineering c,,,,8.3,,,,,materials science and engineering c
21803,applied surface science advances,,,,6.2,,,,,applied surface science advances
21804,european biophysics journal,,,,2.0,,,,,european biophysics journal
21805,nanotechnologies in russia,,,,0.626,,,,,nanotechnologies in russia


In [135]:
initial_if_count = len(if_2024_updated)

# Drop duplicates and missing entries simultaneously
if_2024_clean = (if_2024_updated
                 .dropna(subset=['Normalized Journal'])  # remove empty rows
                 .drop_duplicates(subset='Normalized Journal', keep='first')  # remove duplicates
                 .reset_index(drop=True))  # reset index for neatness

# Display cleanup summary
print(f"IF dataset cleaned:")
print(f" - Initial records: {initial_if_count}")
print(f" - Records after cleaning: {len(if_2024_clean)}")
print(f" - Removed records: {initial_if_count - len(if_2024_clean)}")


IF dataset cleaned:
 - Initial records: 21806
 - Records after cleaning: 21802
 - Removed records: 4


In [136]:
# Define the new entries with Normalized Journal names and their IF values
new_records = pd.DataFrame({
    'Normalized Journal': [
        'materials science and engineering c',
        'applied surface science advances',
        'european biophysics journal',
        'nanotechnologies in russia'
    ],
    'JIF': [8.3, 6.2, 2.0, 0.626]
})
# Ensure new records match existing columns by adding minimal placeholders if necessary
new_records['Name'] = new_records['Normalized Journal']  # You may replace this with original names if available
# Append new records to the existing DataFrame
if_2024_updated = pd.concat([if_2024_clean, new_records], ignore_index=True)

# Verify the addition by checking the tail of the updated DataFrame
if_2024_updated.tail(10)



Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal
21796,Health Promotion Practice,HEALTH PROMOT PRACT,1524-8399,1552-6372,1.6,1.7,"PUBLIC, ENVIRONMENTAL & OCCUPATIONAL HEALTH",Q3,273/403,health promotion practice
21797,High Energy Density Physics,HIGH ENERG DENS PHYS,1574-1818,1878-0563,1.6,1.3,"PHYSICS, FLUIDS & PLASMAS",Q3,26/40,high energy density physics
21798,International Journal on Digital Libraries,INT J DIGIT LIBRARIE,1432-5012,1432-1300,1.6,1.7,INFORMATION SCIENCE & LIBRARY SCIENCE,Q2,65/160,international journal on digital libraries
21799,materials science and engineering c,,,,8.3,,,,,materials science and engineering c
21800,european biophysics journal,,,,2.0,,,,,european biophysics journal
21801,nanotechnologies in russia,,,,0.626,,,,,nanotechnologies in russia
21802,materials science and engineering c,,,,8.3,,,,,materials science and engineering c
21803,applied surface science advances,,,,6.2,,,,,applied surface science advances
21804,european biophysics journal,,,,2.0,,,,,european biophysics journal
21805,nanotechnologies in russia,,,,0.626,,,,,nanotechnologies in russia


In [137]:
# Initial record count
initial_abb_count = len(abb)

# Drop duplicates and missing entries simultaneously
abb_clean = (abb
             .dropna(subset=['Normalized Journal'])  # remove empty rows
             .drop_duplicates(subset='Normalized Journal', keep='first')  # remove duplicates
             .reset_index(drop=True))  # reset index for neatness

# Display cleanup summary
print(f"ABB dataset cleaned:")
print(f" - Initial records: {initial_abb_count}")
print(f" - Records after cleaning: {len(abb_clean)}")
print(f" - Removed records: {initial_abb_count - len(abb_clean)}")


ABB dataset cleaned:
 - Initial records: 12409
 - Records after cleaning: 12012
 - Removed records: 397


In [138]:
abb.head()

Unnamed: 0,Journal Name,ISO 4 abbreviation,Normalized Journal
0,2D Materials,2D Mater.,2d materials
1,3 Biotech,3 Biotech,3 biotech
2,3D Printing and Additive Manufacturing,3D Print. Addit. Manuf.,3d printing and additive manufacturing
3,3D Printing in Medicine,3D Print. Med.,3d printing in medicine
4,Astronomy and Astrophysics,A & A,astronomy and astrophysics


In [139]:
# Verify no duplicates remain
print("Duplicates remaining in IF:", if_2024_clean.duplicated('Normalized Journal').sum())
print("Duplicates remaining in ABB:", abb_clean.duplicated('Normalized Journal').sum())

# Verify no empty values remain
print("Empty entries remaining in IF:", if_2024_clean['Normalized Journal'].isna().sum())
print("Empty entries remaining in ABB:", abb_clean['Normalized Journal'].isna().sum())



Duplicates remaining in IF: 0
Duplicates remaining in ABB: 0
Empty entries remaining in IF: 0
Empty entries remaining in ABB: 0


In [140]:
# Merge the DataFrames on the normalized journal names
merged_df_final = pd.merge(if_2024_clean, 
                     abb_clean[['Normalized Journal', 'ISO 4 abbreviation']],
                     on='Normalized Journal',
                     how='left')  # using left join keeps all rows from if_2024

# Rename columns for clarity (optional but recommended)
merged_df_final.rename(columns={'ISO 4 abbreviation': 'Journal_Abbreviation'}, inplace=True)

# Check the result of merging
merged_df_final.head()


Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal,Journal_Abbreviation
0,CA-A CANCER JOURNAL FOR CLINICIANS,CA-CANCER J CLIN,0007-9235,1542-4863,503.1,297.0,ONCOLOGY,Q1,1/322,ca a cancer journal for clinicians,
1,NATURE REVIEWS DRUG DISCOVERY,NAT REV DRUG DISCOV,1474-1776,1474-1784,122.7,114.9,PHARMACOLOGY & PHARMACY,Q1,1/354,nature reviews drug discovery,Nat. Rev. Drug Discovery
2,LANCET,LANCET,0140-6736,1474-547X,98.4,106.9,"MEDICINE, GENERAL & INTERNAL",Q1,1/325,lancet,
3,NEW ENGLAND JOURNAL OF MEDICINE,NEW ENGL J MED,0028-4793,1533-4406,96.2,94.3,"MEDICINE, GENERAL & INTERNAL",Q1,2/325,new england journal of medicine,N. Engl. J. Med.
4,BMJ-British Medical Journal,BMJ-BRIT MED J,0959-535X,1756-1833,93.6,69.9,"MEDICINE, GENERAL & INTERNAL",Q1,3/325,bmj british medical journal,


In [141]:
# Calculate required statistics clearly
total_if_records = len(if_2024_clean)
total_abb_records = len(abb_clean)
total_merged_records = len(merged_df_final)
records_missing_abbreviation = merged_df_final['Journal_Abbreviation'].isna().sum()
records_with_abbreviation = total_merged_records - records_missing_abbreviation
records_with_if = merged_df_final['JIF'].notna().sum()
records_missing_if = merged_df_final['JIF'].isna().sum()

# Print concise summary
print("\n📌 --- Final Merge Summary --- 📌\n")
print(f"Total records in IF dataset:               {total_if_records}")
print(f"Total records in ABB dataset:              {total_abb_records}")
print(f"Total records after merging:               {total_merged_records}")
print(f"Records with abbreviation matched:         {records_with_abbreviation}")
print(f"Records missing abbreviation:              {records_missing_abbreviation}")
print(f"Records with valid Impact Factor (IF):     {records_with_if}")
print(f"Records missing Impact Factor (IF):        {records_missing_if}")




📌 --- Final Merge Summary --- 📌

Total records in IF dataset:               21802
Total records in ABB dataset:              12012
Total records after merging:               21802
Records with abbreviation matched:         6775
Records missing abbreviation:              15027
Records with valid Impact Factor (IF):     21789
Records missing Impact Factor (IF):        13


In [142]:
abb_journals_not_in_if = abb_clean[
    ~abb_clean['Normalized Journal'].isin(merged_df_final['Normalized Journal'])
]

print(f"ABB journals not present in IF database: {len(abb_journals_not_in_if)}")

additional_records = abb_journals_not_in_if[['Journal Name', 'ISO 4 abbreviation', 'Normalized Journal']].copy()

# Set IF and other related fields explicitly to NaN
additional_records['JIF'] = pd.NA
additional_records['JIF5Years'] = pd.NA
additional_records['Domain'] = pd.NA
additional_records['Area'] = pd.NA
additional_records['Rank'] = pd.NA

# Set 'Name' as the original journal name from ABB
additional_records['Name'] = additional_records['Journal Name']

# Clearly rename abbreviation column
additional_records.rename(columns={'ISO 4 abbreviation': 'Journal_Abbreviation'}, inplace=True)

# Drop redundant column
additional_records.drop(columns=['Journal Name'], inplace=True)

# View these additional records clearly
additional_records.head()


ABB journals not present in IF database: 5237


Unnamed: 0,Journal_Abbreviation,Normalized Journal,JIF,JIF5Years,Domain,Area,Rank,Name
6,AAPG Mem.,aapg memoir,,,,,,AAPG Memoir
7,AAPPS Bull.,aapps bulletin,,,,,,AAPPS Bulletin
9,AAPS Open,aaps open,,,,,,AAPS Open
12,ABB Rev.,abb review,,,,,,ABB Review
13,Abh. Akad. Wiss. DDR,abhandlungen der akademie der wissenschaften d...,,,,,,Abhandlungen der Akademie der Wissenschaften d...


In [143]:
final_complete_df = pd.concat([merged_df_final, additional_records], ignore_index=True)

# Quick verification
print(f"Final merged dataset records: {len(final_complete_df)}")
final_complete_df.tail(10)


Final merged dataset records: 27039


Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal,Journal_Abbreviation
27029,"Zentralblatt fur Pharmazie, Pharmakotherapie u...",,,,,,,,,"zentralblatt fur pharmazie, pharmakotherapie u...",Zentralbl. Pharm. Pharmakother. Laboratoriumsd...
27030,Zentralblatt fur Veterinarmedizin,,,,,,,,,zentralblatt fur veterinarmedizin,Zentralbl. Veterinarmed.
27031,Zhurnal Fizicheskoi Khimii,,,,,,,,,zhurnal fizicheskoi khimii,Zh. Fiz. Khim.
27032,Zhurnal Neorganicheskoi Khimii,,,,,,,,,zhurnal neorganicheskoi khimii,Zh. Neorg. Khim.
27033,Zhurnal Obshchei Khimii,,,,,,,,,zhurnal obshchei khimii,Zh. Obshch. Khim.
27034,Zhurnal Organicheskoi Khimii,,,,,,,,,zhurnal organicheskoi khimii,Zh. Org. Khim.
27035,Zhurnal Prikladnoi Khimii,,,,,,,,,zhurnal prikladnoi khimii,Zh. Prikl. Khim.
27036,Zoology and Ecology,,,,,,,,,zoology and ecology,Zool. Ecol.
27037,Zoologische Garten,,,,,,,,,zoologische garten,Zool. Garten
27038,Zoologicheskii Zhurnal,,,,,,,,,zoologicheskii zhurnal,Zool. Zh.


In [144]:
total_records_final = len(final_complete_df)
records_with_if_final = final_complete_df['JIF'].notna().sum()
records_missing_if_final = final_complete_df['JIF'].isna().sum()
records_with_abbreviation_final = final_complete_df['Journal_Abbreviation'].notna().sum()
records_missing_abbreviation_final = final_complete_df['Journal_Abbreviation'].isna().sum()

print("\n📌 --- Comprehensive Final Journal Database --- 📌\n")
print(f"Total records:                           {total_records_final}")
print(f"Records with valid IF:                   {records_with_if_final}")
print(f"Records without IF:                      {records_missing_if_final}")
print(f"Records with abbreviations:              {records_with_abbreviation_final}")
print(f"Records missing abbreviations:           {records_missing_abbreviation_final}")




📌 --- Comprehensive Final Journal Database --- 📌

Total records:                           27039
Records with valid IF:                   21789
Records without IF:                      5250
Records with abbreviations:              12012
Records missing abbreviations:           15027


In [145]:
# Find journals with missing abbreviation
missing_abbr = final_complete_df['Journal_Abbreviation'].isna()

# Identify single-word journal names with missing abbreviation
single_word_journals = final_complete_df[missing_abbr & final_complete_df['Name'].str.match(r'^\w+$')]

print(f"Number of single-word journals without abbreviation: {len(single_word_journals)}")
single_word_journals[['Name', 'Normalized Journal']].head(10)


Number of single-word journals without abbreviation: 1514


Unnamed: 0,Name,Normalized Journal
2,LANCET,lancet
18,NATURE,nature
26,CELL,cell
28,SCIENCE,science
30,eScience,escience
39,Joule,joule
47,CIRCULATION,circulation
52,Innovation,innovation
80,eLight,elight
89,Gastroenterology,gastroenterology


In [146]:
# Identify journals without abbreviation
missing_abbr = final_complete_df['Journal_Abbreviation'].isna()

# Identify single-word journal names with missing abbreviation
single_word_journals = final_complete_df[missing_abbr & final_complete_df['Name'].str.match(r'^\w+$')]

print(f"Single-word journals without abbreviation: {len(single_word_journals)}")
# Assign journal abbreviation as the capitalized original journal name
final_complete_df.loc[single_word_journals.index, 'Journal_Abbreviation'] = single_word_journals['Name'].str.capitalize()

# Verify the assigned abbreviations explicitly
final_complete_df.loc[single_word_journals.index, ['Name', 'Journal_Abbreviation']].head(10)
records_with_abbreviation_final = final_complete_df['Journal_Abbreviation'].notna().sum()
records_missing_abbreviation_final = final_complete_df['Journal_Abbreviation'].isna().sum()

print("\n📌 --- Final Database Summary (after abbreviation capitalization fix) --- 📌\n")
print(f"Total records:                           {len(final_complete_df)}")
print(f"Records with abbreviations:              {records_with_abbreviation_final}")
print(f"Records missing abbreviations:           {records_missing_abbreviation_final}")


Single-word journals without abbreviation: 1514

📌 --- Final Database Summary (after abbreviation capitalization fix) --- 📌

Total records:                           27039
Records with abbreviations:              13526
Records missing abbreviations:           13513


In [147]:
# Identify records without abbreviation
journals_without_abbr = final_complete_df[final_complete_df['Journal_Abbreviation'].isna()]

# Display the total count clearly
print(f"📌 Journals still missing abbreviations: {len(journals_without_abbr)}")

# Examine the first 20 records explicitly
journals_without_abbr[['Name', 'Normalized Journal', 'JIF']].head(20)



📌 Journals still missing abbreviations: 13513


Unnamed: 0,Name,Normalized Journal,JIF
0,CA-A CANCER JOURNAL FOR CLINICIANS,ca a cancer journal for clinicians,503.1
4,BMJ-British Medical Journal,bmj british medical journal,93.6
13,JAMA-JOURNAL OF THE AMERICAN MEDICAL ASSOCIATION,jama journal of the american medical association,63.1
14,World Psychiatry,world psychiatry,60.5
16,ANNALS OF ONCOLOGY,annals of oncology,56.7
22,CANCER CELL,cancer cell,48.8
32,JOURNAL OF CLINICAL ONCOLOGY,journal of clinical oncology,42.1
38,Lancet Respiratory Medicine,lancet respiratory medicine,38.7
42,EUROPEAN HEART JOURNAL,european heart journal,37.6
43,MMWR Surveillance Summaries,mmwr surveillance summaries,37.3


In [151]:
import pandas as pd

def find_abbreviation_candidates(df, name_col='Name', abbr_col='Abbr Name', journal_abbr_col='Journal_Abbreviation'):
    # Function to clearly identify acronym candidates
    def identify_acronym(row):
        if pd.isna(row[journal_abbr_col]) and isinstance(row[abbr_col], str):
            abbr_first_word = row[abbr_col].split()[0]
            name_first_word = row[name_col].split()[0]

            # Acronym conditions explicitly defined:
            if abbr_first_word.isupper() and len(abbr_first_word) >= 2 and '-' not in abbr_first_word:
                name_acronym_part = name_first_word.split('-')[0]
                if abbr_first_word == name_acronym_part:
                    return abbr_first_word
        return pd.NA

    # Apply the identification function
    df['Acronym_Candidate'] = df.apply(identify_acronym, axis=1)

    # Clearly return DataFrame with potential acronym candidates for easy review
    candidates_df = df[df['Acronym_Candidate'].notna()][[name_col, abbr_col, 'Acronym_Candidate']]

    return candidates_df.reset_index(drop=True)
# Call the function clearly on your dataset
abbr_candidates_df = find_abbreviation_candidates(final_complete_df)

# Display abbreviation candidates explicitly
print(f"Total abbreviation candidates found: {len(abbr_candidates_df)}")
abbr_candidates_df.head(30)



Total abbreviation candidates found: 1216


Unnamed: 0,Name,Abbr Name,Acronym_Candidate
0,CANCER CELL,CANCER CELL,CANCER
1,MMWR Surveillance Summaries,MMWR SURVEILL SUMM,MMWR
2,JAMA Pediatrics,JAMA PEDIATR,JAMA
3,JAMA Psychiatry,JAMA PSYCHIAT,JAMA
4,TOWN PLANNING REVIEW,TOWN PLAN REV,TOWN
5,ACTA OPTICA SINICA,ACTA OPT SIN,ACTA
6,CHINESE JOURNAL OF GEOPHYSICS-CHINESE EDITION,CHINESE J GEOPHYS-CH,CHINESE
7,GROWTH HORMONE & IGF RESEARCH,GROWTH HORM IGF RES,GROWTH
8,INDIANA LAW JOURNAL,INDIANA LAW J,INDIANA
9,MAYO CLINIC PROCEEDINGS,MAYO CLIN PROC,MAYO


In [152]:
final_complete_df[final_complete_df['Journal_Abbreviation'].isna()].to_csv("../data/processed/journals_without_iso4.csv", index=False)


In [149]:
final_complete_df.head(20)

Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal,Journal_Abbreviation
0,CA-A CANCER JOURNAL FOR CLINICIANS,CA-CANCER J CLIN,0007-9235,1542-4863,503.1,297.0,ONCOLOGY,Q1,1/322,ca a cancer journal for clinicians,
1,NATURE REVIEWS DRUG DISCOVERY,NAT REV DRUG DISCOV,1474-1776,1474-1784,122.7,114.9,PHARMACOLOGY & PHARMACY,Q1,1/354,nature reviews drug discovery,Nat. Rev. Drug Discovery
2,LANCET,LANCET,0140-6736,1474-547X,98.4,106.9,"MEDICINE, GENERAL & INTERNAL",Q1,1/325,lancet,Lancet
3,NEW ENGLAND JOURNAL OF MEDICINE,NEW ENGL J MED,0028-4793,1533-4406,96.2,94.3,"MEDICINE, GENERAL & INTERNAL",Q1,2/325,new england journal of medicine,N. Engl. J. Med.
4,BMJ-British Medical Journal,BMJ-BRIT MED J,0959-535X,1756-1833,93.6,69.9,"MEDICINE, GENERAL & INTERNAL",Q1,3/325,bmj british medical journal,
5,NATURE REVIEWS MOLECULAR CELL BIOLOGY,NAT REV MOL CELL BIO,1471-0072,1471-0080,81.3,115.5,CELL BIOLOGY,Q1,1/205,nature reviews molecular cell biology,Nat. Rev. Mol. Cell Biol.
6,Nature Reviews Clinical Oncology,NAT REV CLIN ONCOL,1759-4774,1759-4782,81.1,81.5,ONCOLOGY,Q1,2/322,nature reviews clinical oncology,Nat. Rev. Clin. Oncol.
7,Nature Reviews Materials,NAT REV MATER,2058-8437,2058-8437,79.8,85.7,NANOSCIENCE & NANOTECHNOLOGY,Q1,1/140,nature reviews materials,Nat. Rev. Mater.
8,Nature Reviews Disease Primers,NAT REV DIS PRIMERS,2056-676X,2056-676X,76.9,92.6,"MEDICINE, GENERAL & INTERNAL",Q1,4/325,nature reviews disease primers,Nat. Rev. Dis. Primers
9,NATURE REVIEWS CANCER,NAT REV CANCER,1474-175X,1474-1768,72.5,77.2,ONCOLOGY,Q1,3/322,nature reviews cancer,Nat. Rev. Cancer


In [128]:
import pandas as pd
import re

# Function to extract potential acronym
def extract_acronym(row):
    abbr_first_word = row['Abbr Name'].split()[0]
    name_first_word = row['Name'].split()[0]

    # Check conditions: first word uppercase, matches first part of name before hyphen
    if abbr_first_word.isupper() and name_first_word.startswith(abbr_first_word):
        return abbr_first_word
    else:
        return pd.NA  # Not an acronym candidate

# Apply the function explicitly to each row
final_complete_df['Acronym_Candidate'] = final_complete_df.apply(extract_acronym, axis=1)

# Inspect results
print(final_complete_df[['Name', 'Abbr Name', 'Acronym_Candidate']].head(20))


AttributeError: 'float' object has no attribute 'split'

In [124]:
import re

def iso4_abbreviation(journal_name):
    # ISO 4 standard abbreviations dictionary (common words)
    abbreviations = {
        'journal': 'J.',
        'journals': 'J.',
        'international': 'Int.',
        'reviews': 'Rev.',
        'review': 'Rev.',
        'annals': 'Ann.',
        'bulletin': 'Bull.',
        'medicine': 'Med.',
        'medical': 'Med.',
        'clinical': 'Clin.',
        'clinicians': 'Clin.',
        'american': 'Am.',
        'european': 'Eur.',
        'and': '',
        'of': '',
        'for': '',
        'the': '',
        'on': '',
        'in': '',
        'science': 'Sci.',
        'sciences': 'Sci.',
        'technology': 'Technol.',
        'engineering': 'Eng.',
        'communications': 'Commun.',
        'proceedings': 'Proc.',
        'transactions': 'Trans.',
        'letters': 'Lett.',
        'advances': 'Adv.',
        'reports': 'Rep.',
        'report': 'Rep.',
        'research': 'Res.',
        'studies': 'Stud.',
        'yearbook': 'Yearb.',
        'supplement': 'Suppl.',
        'series': 'Ser.',
        'section': 'Sect.',
        'hepatology': 'Hepatol.',
        'endocrinology': 'Endocrinol.',
        'oncology': 'Oncol.',
        'pediatrics': 'Pediatr.',
        'surgery': 'Surg.',
        'urology': 'Urol.',
        'care': 'Care',
        'intensive': 'Intensive',
        'discovery': 'Discov.',
        'surveillance': 'Surveill.',
        'summaries': 'Summ.',
        'weekly': 'Wkly.',
        'morbidity': 'Morb.',
        'mortality': 'Mortal.',
        'public': 'Public',
        'health': 'Health',
        'respiratory': 'Respir.',
        'psychiatry': 'Psychiatry',
    }

    words = re.split(r'\s+', journal_name.lower())
    abbr_words = []

    for word in words:
        if word in ['and', 'of', 'for', 'the', 'on', 'in']:
            continue
        
        if word in abbreviations and abbreviations[word]:
            abbr_words.append(abbreviations[word])
        elif '-' in word:  # Handle compound (hyphenated) words
            parts = word.split('-')
            abbr_parts = [abbreviations.get(part, part[:4].capitalize() + '.') for part in parts]
            abbr_words.append('-'.join(abbr_parts))
        elif word.endswith('ology'):
            abbr_words.append(word[:-4].capitalize() + 'ol.')
        elif word.endswith('ics'):
            abbr_words.append(word[:-3].capitalize() + '.')
        elif len(word) > 6:
            abbr_words.append(word[:4].capitalize() + '.')
        else:
            abbr_words.append(word.capitalize())

    # Join words and capitalize the first character of the abbreviation
    abbreviation = ' '.join(abbr_words)
    abbreviation = abbreviation[0].upper() + abbreviation[1:]

    return abbreviation



In [125]:
# Select rows with missing abbreviations
missing_abbr_idx = final_complete_df[final_complete_df['Journal_Abbreviation'].isna()].index

# Apply the ISO4 abbreviation function
final_complete_df.loc[missing_abbr_idx, 'Journal_Abbreviation'] = final_complete_df.loc[
    missing_abbr_idx, 'Name'
].apply(iso4_abbreviation)

# Check result
final_complete_df.loc[missing_abbr_idx, ['Name', 'Journal_Abbreviation']].head(20)



Unnamed: 0,Name,Journal_Abbreviation


In [127]:
final_complete_df.head(20)

Unnamed: 0,Name,Abbr Name,ISSN,EISSN,JIF,JIF5Years,Domain,Area,Rank,Normalized Journal,Journal_Abbreviation
0,CA-A CANCER JOURNAL FOR CLINICIANS,CA-CANCER J CLIN,0007-9235,1542-4863,503.1,297.0,ONCOLOGY,Q1,1/322,ca a cancer journal for clinicians,Ca-a Cancer J. Clin.
1,NATURE REVIEWS DRUG DISCOVERY,NAT REV DRUG DISCOV,1474-1776,1474-1784,122.7,114.9,PHARMACOLOGY & PHARMACY,Q1,1/354,nature reviews drug discovery,Nat. Rev. Drug Discovery
2,LANCET,LANCET,0140-6736,1474-547X,98.4,106.9,"MEDICINE, GENERAL & INTERNAL",Q1,1/325,lancet,Lancet
3,NEW ENGLAND JOURNAL OF MEDICINE,NEW ENGL J MED,0028-4793,1533-4406,96.2,94.3,"MEDICINE, GENERAL & INTERNAL",Q1,2/325,new england journal of medicine,N. Engl. J. Med.
4,BMJ-British Medical Journal,BMJ-BRIT MED J,0959-535X,1756-1833,93.6,69.9,"MEDICINE, GENERAL & INTERNAL",Q1,3/325,bmj british medical journal,Bmj-british Med. J.
5,NATURE REVIEWS MOLECULAR CELL BIOLOGY,NAT REV MOL CELL BIO,1471-0072,1471-0080,81.3,115.5,CELL BIOLOGY,Q1,1/205,nature reviews molecular cell biology,Nat. Rev. Mol. Cell Biol.
6,Nature Reviews Clinical Oncology,NAT REV CLIN ONCOL,1759-4774,1759-4782,81.1,81.5,ONCOLOGY,Q1,2/322,nature reviews clinical oncology,Nat. Rev. Clin. Oncol.
7,Nature Reviews Materials,NAT REV MATER,2058-8437,2058-8437,79.8,85.7,NANOSCIENCE & NANOTECHNOLOGY,Q1,1/140,nature reviews materials,Nat. Rev. Mater.
8,Nature Reviews Disease Primers,NAT REV DIS PRIMERS,2056-676X,2056-676X,76.9,92.6,"MEDICINE, GENERAL & INTERNAL",Q1,4/325,nature reviews disease primers,Nat. Rev. Dis. Primers
9,NATURE REVIEWS CANCER,NAT REV CANCER,1474-175X,1474-1768,72.5,77.2,ONCOLOGY,Q1,3/322,nature reviews cancer,Nat. Rev. Cancer


In [114]:
bib_df['Normalized Journal'] = bib_df['Journal'].apply(normalize_journal_name)

NameError: name 'bib_df' is not defined

In [73]:
# Map the Impact Factor to bib_df_no_cyrillic using the normalized names
bib_df['Impact Factor'] = bib_df['Normalized Journal'].map(merged_impact_factors)

# Check which journals don't have an IF after normalization
journals_without_if = bib_df[bib_df['Impact Factor'].isnull()]


In [77]:
def check_author_position(authors_list, name_variations):
    # Check if any of the name variations is present as the first author
    first_author_match = any(variation in authors_list[0] for variation in name_variations)
    # Check if any of the name variations is present as the last author
    last_author_match = any(variation in authors_list[-1] for variation in name_variations)
    return first_author_match, last_author_match

# List of all variations of the name
name_variations = [
    'Parakhonskiy, Bogdan V.',
    'Parakhonskiy, Bogdan',
    'Parakhonskiy, B.V.',
    'Parakhonskiy, B.',
    'Parakhonskiy, B. V.',
    'Parakhonskiy, B',
    'Bogdan V. Parakhonskiy',
    'Parakhonsky, B',
    'Parakhonskiy, Bogdan V'
    # Add any other variations if needed
]

# Applying the check_author_position function to each row in the dataframe
bib_df['Parakhonskiy_first_author'], bib_df['Parakhonskiy_last_author'] = zip(
    *bib_df['Authors'].apply(lambda x: check_author_position(x, name_variations))
)

In [78]:
def check_name_variations_absent(authors_list, name_variations):
    # Check if any of the name variations is present in the authors list
    if not any(variation in authors_list for variation in name_variations):
        return authors_list  # Return the authors list if none of the variations are present
    return None  # Return None if any variation is present



# Apply the function to the dataframe and get the rows where the name variation is not present
absent_authors_lists = bib_df['Authors'].apply(lambda x: check_name_variations_absent(x, name_variations))

# Filter out the None values to get only the rows where the name variation is absent
absent_authors_df = bib_df[absent_authors_lists.notnull()]

In [79]:
absent_authors_df['Authors']

45     [Ivanov, Aleksei, Kurtukova, Mariya, Kozadayev...
108    [Bukreeva, T. V., Dembo, K. A., Myagkov, I. V....
Name: Authors, dtype: object

In [None]:

# Display the updated DataFrame
bib_df[['Authors', 'Parakhonskiy_first_author', 'Parakhonskiy_last_author']]

In [None]:
bib_df.head()

In [None]:
# Convert the 'Year' column to integers
bib_df['Year'] = bib_df['Year'].astype(int)

In [None]:
# Convert 'Year' column to integers
bib_df['Year'] = pd.to_numeric(bib_df['Year'], errors='coerce')

In [None]:
bib_df['Parakhonskiy_last_author'][5]

In [None]:
absent_authors_df

In [None]:
journals_without_if

In [None]:
# Function to calculate the desired statistics
def calculate_author_statistics(df, current_year, years_window):
    # Filter records with DOI
    with_doi = df.dropna(subset=['DOI'])
    
    # Total amount of publications with DOI
    total_with_doi = with_doi.shape[0]
    
    # Total amount of publications with first author
    first_author_count = with_doi['Parakhonskiy_first_author'].sum()
    
    # Total amount of publications with last author
    last_author_count = with_doi['Parakhonskiy_last_author'].sum()
    first_last = first_author_count+last_author_count
    
    # Calculate the percentage of publications where the author is first or last author
    first_author_percent = round((first_author_count / total_with_doi) * 100, 1) if total_with_doi else 0
    last_author_percent = round((last_author_count / total_with_doi) * 100, 1) if total_with_doi else 0
    first_last_percentage = round((first_last/total_with_doi)*100,1) if total_with_doi else 0
    # Now, calculate statistics for the last 'years_window' years
    
    
    recent_df = with_doi[with_doi['Year'] >= (current_year - years_window + 1)]

    
    # Total amount of recent publications with DOI
    recent_total_with_doi = recent_df.shape[0]
    
    # Recent total amount of publications with first author
    recent_first_author_count = recent_df['Parakhonskiy_first_author'].sum()
    
    # Recent total amount of publications with last author
    recent_last_author_count = recent_df['Parakhonskiy_last_author'].sum()
    recent_first_last_author_count = recent_first_author_count+recent_last_author_count
    # Recent percentage of first author publications
    recent_first_author_percent = round((recent_first_author_count / recent_total_with_doi) * 100,1) if recent_total_with_doi else 0
    
    # Recent percentage of last author publications
    recent_last_author_percent = round((recent_last_author_count / recent_total_with_doi) * 100,1) if recent_total_with_doi else 0
    recent_first_last_percentage = round((recent_first_last_author_count/total_with_doi)*100,1) if total_with_doi else 0
    
    # Get counts of publications per journal and their total impact factors
    journal_stats = df.groupby('Normalized Journal').agg({
        'DOI': 'count',
        'Impact Factor': 'first'  # Assuming the impact factor is the same for all rows of the same journal
    }).rename(columns={'DOI': 'count'}).reset_index()

    # Sort the journals by Impact Factor and get the top 5
    top_journals = journal_stats.sort_values(by='Impact Factor', ascending=False).head(5)

    # Filter for publications where Parakhonskiy is first or last author
    parakhonskiy_first_last_df = df[ (df['Parakhonskiy_first_author'] == True) | (df['Parakhonskiy_last_author'] == True) ]
    
     # Get counts of publications per journal where Parakhonskiy is first or last author and their total impact factors
    parakhonskiy_journal_stats = parakhonskiy_first_last_df.groupby('Normalized Journal').agg({
        'DOI': 'count',
        'Impact Factor': 'first'  # Assuming the impact factor is the same for all rows of the same journal
    }).rename(columns={'DOI': 'count'}).reset_index()
    
    
    # Sort the journals by count of publications where Parakhonskiy is first or last author and get the top 5
    top_parakhonskiy_journals = parakhonskiy_journal_stats.sort_values(by='Impact Factor', ascending=False).head(5)
    
    # Compile statistics into a dictionary
    statistics = {
        'total_with_doi': total_with_doi,
        'first_author_count': first_author_count,
        'last_author_count': last_author_count,
        'first_last' : first_last,
        'first_author_percent': first_author_percent,
        'last_author_percent': last_author_percent,
        'first_last_percentage':first_last_percentage,
        'recent_total_with_doi': recent_total_with_doi,
        'recent_first_author_count': int(recent_first_author_count),
        'recent_last_author_count': int(recent_last_author_count),
        'recent_first_last_author_count' : int(recent_first_last_author_count),
        'recent_first_author_percent': recent_first_author_percent,
        'recent_last_author_percent': recent_last_author_percent,
        'recent_first_last_percentage' : recent_first_last_percentage,
        'top_journals': top_journals.to_dict(orient='records'),  # Convert the top 5 journals to a list of dictionaries
        'top_parakhonskiy_journals': top_parakhonskiy_journals.to_dict(orient='records')  # Convert the top 5 journals to a list of dictionaries
    }
    
    return statistics

# Assuming the current year is 2023 and we want statistics for the last 5 years
current_year = 2023
years_window = 5

# Calculate statistics
author_stats = calculate_author_statistics(bib_df, current_year, years_window)
author_stats

In [None]:
def create_author_summary(author_stats, current_year, years_window):
    # Construct the high impact journals string without the year
    high_impact_string = ', '.join([
        f"{journal['Normalized Journal'].title()} (IF {journal['Impact Factor']})"
        for journal in author_stats['top_journals']
    ])

    # Construct the Parakhonskiy journals string
    parakhonskiy_journal_string = ', '.join([
        f"{journal['Normalized Journal'].title()} (IF {journal['Impact Factor']}, {journal['count']} publications)"
        for journal in author_stats['top_parakhonskiy_journals']
    ])

    # Construct the summary string
    summary = (
        f"I have authored {author_stats['total_with_doi']} publications "
        f"({author_stats['recent_total_with_doi']} since {current_year - years_window + 1}), "
        f"including papers in such high impact journals as {high_impact_string}. "
        f"Among these, I am the first or last author on {author_stats['first_last']} papers, "
        f"with {author_stats['recent_first_last_author_count']} of those since {current_year - years_window + 1}. "
        f"I have served as the first author on {author_stats['first_author_percent']:.1f}% of my publications and as the last author on "
        f"{author_stats['last_author_percent']:.1f}%. In the past {years_window} years, "
        f"I have been the first or last author on {author_stats['recent_first_last_author_count']} publications, "
        f"{author_stats['recent_last_author_count']} of which list me as the last author. "
        f"Significant contributions include {parakhonskiy_journal_string}."
    )
    return summary

# You would then call this function with your author_stats dictionary:
author_summary = create_author_summary(author_stats, current_year, years_window)






In [None]:
# Display with increased font size
display(HTML(f"<div style='font-size: 1.25em;'>{author_summary}</div>"))

In [None]:
def create_author_summary(author_stats, current_year, years_window):
     # Check if 'top_journals' exists in author_stats and construct the high impact journals string
    if 'top_journals' in author_stats and author_stats['top_journals']:
        high_impact_string = ', '.join([
            f"{journal.get('name', 'Unknown Journal')} {journal.get('year', 'Unknown Year')} (IF {journal.get('impact_factor', 'N/A')})"
            for journal in author_stats['top_journals']
        ])
    else:
        high_impact_string = 'N/A'

    # Check if 'top_parakhonskiy_journals' exists in author_stats and construct the parakhonskiy journals string
    if 'top_parakhonskiy_journals' in author_stats and author_stats['top_parakhonskiy_journals']:
        parakhonskiy_journal_string = ', '.join([
            f"{journal.get('name', 'Unknown Journal')} (IF {journal.get('impact_factor', 'N/A')}, {journal.get('count', 'N/A')} publications {journal.get('year_range', 'Unknown Year Range')})"
            for journal in author_stats['top_parakhonskiy_journals']
        ])
    else:
        parakhonskiy_journal_string = 'N/A'

    # Construct the summary string
    summary = (
        f"I have authored {author_stats['total_with_doi']} publications "
        f"({author_stats['recent_total_with_doi']} since {current_year - years_window + 1}), "
        f"including papers in such high impact journals as {high_impact_string}. "
        f"Among these, I am the first or last author on {author_stats['first_author_count'] + author_stats['last_author_count']} papers, "
        f"with {author_stats['recent_first_author_count'] + author_stats['recent_last_author_count']} of those since {current_year - years_window + 1}. "
        f"I have served as the first author on {author_stats['first_author_percent']:.1f}% of my publications and as the last author on "
        f"{author_stats['last_author_percent']:.1f}%. In the past {years_window} years, "
        f"I have been the first or last author on {author_stats['recent_first_author_count'] + author_stats['recent_last_author_count']} publications, "
        f"{author_stats['recent_last_author_count']} of which list me as the last author. "
        f"Significant contributions include {parakhonskiy_journal_string}."
    )
    return summary

# You would then call this function with your author_stats dictionary:
author_summary = create_author_summary(author_stats, current_year, years_window)


In [None]:
author_summary

In [None]:
# Filter the DataFrame to show records where 'Parakhonskiy_first_author' or 'Parakhonskiy_last_author' is True
parakhonskiy_author_records = bib_df[
    (bib_df['Parakhonskiy_first_author']) | (bib_df['Parakhonskiy_last_author'])
]

parakhonskiy_author_records.head(30)