In [20]:
import json
import pandas as pd
from text_processing import find_value_by_key,find_values_by_key
from difflib import SequenceMatcher

with open("arxiv.json", 'r', encoding='utf-8') as file:
    json_data = json.load(file)


In [21]:
papers_df = pd.read_json('paper_2024_11_19.json', orient="records", lines=True)
authors_df = pd.read_json('author_2024_11_19.json', orient="records", lines=True)
affiliations_df = pd.read_json('affiliation_2024_11_19.json', orient="records", lines=True)

In [22]:
authors_df.columns

Index(['auid', 'given-name', 'initials', 'surname', 'indexed-name',
       'affliation', 'paper'],
      dtype='object')

In [23]:



def check_author_exist(input_name,df,new_author):
    given_name, surname = input_name.split()[0],input_name.split()[-1]
    existing = df[(df['given-name'] == given_name) & (df['surname'] == surname)]

    if existing.empty:
        new_id = f"S{new_author:09d}"  # Format new id as s00xxx
        new_row = pd.DataFrame({'auid': [new_id], 'given-name': [given_name], 'surname': [surname]})
        df = pd.concat([df, new_row], ignore_index=True)
        new_author += 1  # Increment the author number for next use
    else:
        new_id = existing['auid'].iloc[0]
        print(given_name, surname)
    return new_id,df,new_author

In [24]:
new_author = 1
new_affiliation = 1
for data in find_value_by_key(json_data, "entry"):
    paper_id = data["id"].split("/")[-1]
    authors = []
    paper_affs = []
    if not isinstance(data["author"], list):
        aus = [data["author"]]
    else:
        aus = data["author"]
    for name in aus:
        # Handle "name" and "surname" or "indexed-name"
        if "." in name["name"].split()[-1]:  # Handle the case where initials are in "name"
            indexed_name = name["name"]
            surname = indexed_name.split()[0]
            initials = indexed_name.split()[1]
            given_name = None
        else:  # Handle the standard case
            given_name, surname = name["name"].split()[0], name["name"].split()[-1]
            initials = f"{given_name[0]}."
            indexed_name = f"{surname} {given_name[0]}."

        existing = authors_df[
            (authors_df["given-name"] == given_name) & (authors_df["surname"] == surname)
        ]
        if existing.empty:
            if "arxiv:affiliation" in name:
                if not isinstance(name["arxiv:affiliation"], list):
                    affs = [name["arxiv:affiliation"]["#text"]]
                else:
                    affs = [aff["#text"] for aff in name["arxiv:affiliation"]]
                affiliations = []
                for aff_name in affs:
                    # Check if any name in the DataFrame matches aff_name (case-insensitively)
                    match_found = False
                    for _, row in affiliations_df.iterrows():
                        if "INFN" in aff_name or "Academy of Science" in aff_name:
                            if aff_name == row["name"]:
                                match_found = True
                                matched_affid = row["affid"]
                                # print(f"Match found for '{aff_name}': {row['name']}")
                                break
                        elif aff_name in row["name"]:
                            if len(aff_name) > len(row["name"]) / 2:
                                match_found = True
                                matched_affid = row["affid"]
                                break
                        elif row["name"] in aff_name:
                            if len(aff_name) / 2 < len(row["name"]):
                                matched_affid = row["affid"]
                                match_found = True
                                break

                    if not match_found:
                        # If no sufficient match, add a new row to the DataFrame
                        new_aff_id = f"S{new_affiliation:07d}"
                        new_affiliation += 1
                        new_row = {"affid": new_aff_id, "name": aff_name}
                        affiliations_df = pd.concat([affiliations_df, pd.DataFrame([new_row])], ignore_index=True)
                        matched_affid = new_aff_id
                    affiliations.append(matched_affid)
                    if matched_affid not in paper_affs:
                        paper_affs.append(matched_affid)

            new_id = f"S{new_author:09d}"
            new_row = pd.DataFrame({
                "auid": [new_id],
                "given-name": [given_name],
                "surname": [surname],
                "indexed-name": [indexed_name],
                "paper": [[paper_id]],
                "affiliation": [affiliations],
            })
            authors_df = pd.concat([authors_df, new_row], ignore_index=True)
            new_author += 1
        else:
            new_id = existing["auid"].iloc[0]
            idx = existing.index[0]
            authors_df.at[idx, "paper"] = authors_df.at[idx, "paper"] + [paper_id]
        authors.append(str(new_id))
    new_row = pd.DataFrame({
        "id": [paper_id],
        "title": [data["title"].replace("\n", " ")],
        "description": [data["summary"].replace("\n", " ")],
        "date": [data["published"]],
        "year": [data["published"].split("-")[0]],
        "authors": [authors],
        "affiliations": [paper_affs],
    })
    papers_df = pd.concat([papers_df, new_row], ignore_index=True)


In [25]:
name

{'name': 'Danny Hermelin'}

In [26]:
data["author"]

[{'name': 'Klaus Heeger'}, {'name': 'Danny Hermelin'}]

In [29]:
affiliations_df.tail(50)

Unnamed: 0,affid,city,name,country,paper_count
12245,S0000127,,Laboratoire de Physique de l'École Normale Sup...,,
12246,S0000128,,"Observatoire de Paris, Université PSL, Sorbonn...",,
12247,S0000129,,"Astrophysics Group, Blackett Laboratory, Imper...",,
12248,S0000130,,"Scuola Normale Superiore, Piazza dei Cavalieri...",,
12249,S0000131,,"Dipartimento di Fisica e Scienze della Terra, ...",,
12250,S0000132,,Dipartimento di Fisica - Sezione di Astronomia...,,
12251,S0000133,,"NASA Ames Research Center, Moffett Field, CA 9...",,
12252,S0000134,,Kavli Institute for Particle Astrophysics \& C...,,
12253,S0000135,,"Minnesota Institute for Astrophysics, Universi...",,
12254,S0000136,,"INAF, Istituto di Radioastronomia, Via Piero G...",,


In [28]:
papers_df.tail(50)

Unnamed: 0,id,title,description,publishername,copyright_type,date,year,class,affiliations,cites,authors
20490,2401.01691v1,2-Rainbow domination number of circulant graph...,Let $k$ be a positive integer. A $k$-rainbow d...,,,2024-01-03T11:49:46Z,2024,,[],,"[S000004329, S000004330, S000004331]"
20491,2401.01692v1,Predicting challenge moments from students' di...,Effective collaboration requires groups to str...,,,2024-01-03T11:54:30Z,2024,,[],,"[S000004332, S000004333, S000004334]"
20492,2401.01693v1,AID-DTI: Accelerating High-fidelity Diffusion ...,Deep learning has shown great potential in acc...,,,2024-01-03T11:54:48Z,2024,,[],,"[S000004249, S000004248, 57208500704, S0000043..."
20493,2401.01694v2,Images of black holes viewed by distant observer,We describe the possible forms of black hole i...,,,2024-01-03T11:55:03Z,2024,,[],,[S000004336]
20494,2401.01695v3,Approximation in Hölder Spaces,We introduce new vanishing subspaces of the ho...,,,2024-01-03T11:55:43Z,2024,,[],,"[S000004337, S000004338]"
20495,2401.01696v1,Von Neumann entropy of the angle operator betw...,Given a pair of intermediate $C^*$-subalgebras...,,,2024-01-03T12:00:50Z,2024,,[],,"[S000004253, S000004254, S000004339]"
20496,2401.01697v3,Consideration of non-phase-matched nonlinear e...,Femtosecond optical parametric oscillators (OP...,,,2024-01-03T12:01:42Z,2024,,[],,"[S000004340, S000004341, S000004342, S000004343]"
20497,2401.01698v2,Patterns of Persistence and Diffusibility acro...,Language similarities can be caused by genetic...,,,2024-01-03T12:05:38Z,2024,,[],,"[S000004344, S000004345]"
20498,2401.01699v2,WordArt Designer API: User-Driven Artistic Typ...,This paper introduces the WordArt Designer API...,,,2024-01-03T12:06:02Z,2024,,[],,"[S000004346, S000004347, 57204546596, S0000043..."
20499,2401.01700v1,Tulczyjew triple on the Atiyah algebroid with ...,The Tulczyjew triple on a principal bundle wit...,,,2024-01-03T12:07:35Z,2024,,[],,"[S000004356, S000004357, S000004358]"


In [27]:
authors_df.tail(50)

Unnamed: 0,auid,given-name,initials,surname,indexed-name,affliation,paper,affiliation
78757,S000004501,Menahem,,Krief,Krief M.,,[2401.01726v1],[S0000173]
78758,S000004502,Michael,,Assaf,Assaf M.,,[2401.01726v1],[S0000173]
78759,S000004503,Zeyang,,Lu,Lu Z.,,[2401.01727v1],[S0000173]
78760,S000004504,Gang,,Wang,Wang G.,,[2401.01727v1],[S0000173]
78761,S000004505,Chan,,Li,Li C.,,[2401.01727v1],[S0000173]
78762,S000004506,Zhu,,Cao,Cao Z.,,[2401.01727v1],[S0000173]
78763,S000004507,Anirudh,,Menon,Menon A.,,[2401.01728v2],[S0000173]
78764,S000004508,Unnikrishnan,,Menon,Menon U.,,[2401.01728v2],[S0000173]
78765,S000004509,Kailash,,Ahirwar,Ahirwar K.,,[2401.01728v2],[S0000173]
78766,S000004510,Chirantan,,Das,Das C.,,[2401.01729v1],[S0000173]


In [2]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

a = np.array([[1,2],[3,4],[5,6]])
a[0].reshape(1, -1)

array([[1, 2]])

In [3]:
def get_row_cosine_similarity(matrix, row_index):
    row_vector = matrix[row_index].reshape(1, -1)  # Reshape to 2D array
    similarity = cosine_similarity(row_vector, matrix)  # Compute similarity
    return similarity.flatten()  # Flatten the result to a 1D array

In [4]:
get_row_cosine_similarity(a, 0)

array([1.        , 0.98386991, 0.97341717])