In [5]:
import pandas as pd
import numpy as np
import requests
pd.set_option('max_colwidth', 1000)


In [23]:
!pip install sparqlwrapper

Collecting sparqlwrapper
  Downloading SPARQLWrapper-2.0.0-py3-none-any.whl (28 kB)
Installing collected packages: sparqlwrapper
Successfully installed sparqlwrapper-2.0.0


In [24]:
from SPARQLWrapper import SPARQLWrapper, JSON

In [160]:
# Given a DOI, get the relevant topics (concepts) and the authors from OpenAlex.
# Only concepts with a score above 0.5 are kept, so that just the most relevant ones are returned.
def getTopicsAndAuthors(doi, min_score=0.5):
    """Fetch the relevant concepts and the authors of a work from OpenAlex.

    Parameters
    ----------
    doi : str
        DOI of the work without the ``https://doi.org/`` prefix,
        e.g. ``"10.18632/aging.102901"``.
    min_score : float, optional
        A concept is kept only when its score is strictly greater than this
        value. Defaults to 0.5.

    Returns
    -------
    dict
        ``{"doi": doi, "concepts": [...], "authors": [...],
        "authors_openAlex": [...]}`` where ``concepts`` holds concept display
        names, ``authors`` holds author display names and
        ``authors_openAlex`` holds the matching OpenAlex author ids.
        The three lists are always present; they are empty when the record
        has no (or null) ``concepts`` / ``authorships``.

    Raises
    ------
    requests.HTTPError
        If OpenAlex answers with an error status (e.g. unknown DOI).
    """
    # Set the URL for the OpenAlex API
    url = "https://api.openalex.org/works/https://doi.org/"
    # Set the headers
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json',
    }
    # A timeout keeps a stalled connection from hanging the notebook forever.
    response = requests.get(url + str(doi), headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx: an error payload has no "concepts"/"authorships"
    # and would otherwise be indistinguishable from a work without any.
    response.raise_for_status()
    json_response = response.json()

    # `or []` covers both a missing key and an explicit null value.
    concepts = [
        concept["display_name"]
        for concept in (json_response.get("concepts") or [])
        if concept["score"] > min_score
    ]

    authors = []
    authors_openAlex = []
    for authorship in (json_response.get("authorships") or []):
        author = authorship.get("author")
        if author:
            authors_openAlex.append(author["id"])
            authors.append(author["display_name"])

    return {
        "doi": doi,
        "concepts": concepts,
        "authors": authors,
        "authors_openAlex": authors_openAlex,
    }
    

In [134]:
# Smoke test of getTopicsAndAuthors on a known DOI.
# NOTE(review): the output recorded below lists OpenAlex ids under 'authors' and
# names under 'authors_openAlex', the opposite of what the function defined above
# produces — it was presumably captured from an earlier version of the function;
# re-run this cell to refresh it.
print(getTopicsAndAuthors("10.18632/aging.102901"))

{'doi': '10.18632/aging.102901', 'concepts': ['White matter', 'Cognition', 'White (mutation)'], 'authors': ['https://openalex.org/A5048455627', 'https://openalex.org/A5017468786', 'https://openalex.org/A5035796732', 'https://openalex.org/A5040897654', 'https://openalex.org/A5030034440', 'https://openalex.org/A5052836448', 'https://openalex.org/A5044626751', 'https://openalex.org/A5073568638'], 'authors_openAlex': ['Cui Zhao', 'Ying Liang', 'Ting Cui', 'Yihua Zhong', 'Xianglong Li', 'Jing Wei', 'Chunlin Li', 'Xu Zhang']}


In [135]:
# Module-level SPARQL client pointing at the query endpoint of a server on localhost.
# It is shared state: get_mention_variations sets its query and return format on every call.
# NOTE(review): presumably a local triple store holding the softalias graph — confirm
# that it is running before executing the cells below.
sparql = SPARQLWrapper("http://localhost:3030/czi/query")

# given a mention, return the mention variations and the group they belong to
def get_mention_variations(mention_name):
    """Return the name variations of a software mention and their group.

    Queries the module-level ``sparql`` endpoint for every software group
    that has an alias named exactly ``mention_name`` (and that has a
    schema.org ``url``), and lists all aliases of those groups.

    Parameters
    ----------
    mention_name : str
        Exact alias name to look up, e.g. ``"PANDA"``.

    Returns
    -------
    list of dict
        One dict per (name, alias, group) binding, with keys ``"name"``
        (alias name), ``"alias"`` (alias id with the softalias alias
        namespace stripped) and ``"group"`` (group id with the softalias
        SoftwareApplication namespace stripped). At most 100 entries.
    """
    # Escape the characters that would terminate or corrupt a double-quoted
    # SPARQL string literal, so that a mention containing quotes, backslashes
    # or line breaks cannot break (or inject into) the query.
    escaped_name = (
        str(mention_name)
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        .replace("\r", "\\r")
    )
    query = """ 
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    SELECT distinct ?name ?alias ?soft WHERE {
      ?soft <https://w3id.org/softalias/alias>/<https://schema.org/name> "mention_name".
      ?soft  <https://w3id.org/softalias/alias>?alias .
          ?alias <https://schema.org/name> ?name .
      ?soft <https://schema.org/url> ?url .
    } LIMIT 100
    """
    # "mention_name" inside the template is the placeholder for the literal.
    query = query.replace("mention_name", escaped_name)
    sparql.setQuery(query)
    sparql.setReturnFormat(JSON)

    # Execute the query and convert results to JSON format
    results = sparql.query().convert()

    # Keep only the local ids: strip the namespace part of the alias/group IRIs.
    return [
        {
            "name": binding["name"]["value"],
            "alias": binding["alias"]["value"].replace("https://w3id.org/softalias/alias/", ""),
            "group": binding["soft"]["value"].replace("https://w3id.org/softalias/SoftwareApplication/", ""),
        }
        for binding in results["results"]["bindings"]
    ]



In [136]:
# Smoke test of get_mention_variations.
# Note that the result includes the individuals (aliases) of the group the mention belongs to.
# It may miss some individuals that were assigned to other, similar clusters.
print(get_mention_variations("PANDA"))

[{'name': 'PANDA (Pipeline for Analyzing braiN Diffusion imAges)', 'alias': 'SM696942', 'group': 'SM27626'}, {'name': 'panda', 'alias': 'SM586756', 'group': 'SM27626'}, {'name': 'PANDA', 'alias': 'SM27626', 'group': 'SM27626'}]


In [137]:
# Read the whole CZI dataset (gzip-compressed, tab-separated file) into memory.
# NOTE(review): the separator is the two-character string backslash + "t"; with
# engine='python' pandas presumably interprets it as a regular expression that
# matches a tab — confirm before simplifying it to a plain tab with the default engine.
raw_df = pd.read_csv('comm_raw.tsv.gz', sep = '\\t', engine = 'python', compression = 'gzip')

In [163]:
# Maximum number of sample rows written per mention alias.
# Kept small to avoid being banned by OpenAlex for sending too many requests.
limit = 5


def getSampleForMention(mention, writer, max_samples=None):
    """Write a small sample of the papers that use a mention, enriched via OpenAlex.

    For the rows of ``raw_df`` whose ``ID`` equals the mention alias, looks up
    the authors and concepts of each DOI with ``getTopicsAndAuthors`` and
    writes one row per paper to ``writer``:
    ``[name, alias, group, doi, text, authors, authors_openAlex, concepts]``
    (the three last fields are comma-joined strings).

    Parameters
    ----------
    mention : dict
        One entry returned by ``get_mention_variations``; the keys ``'name'``,
        ``'alias'`` and ``'group'`` are read.
    writer : object with a ``writerow(list)`` method
        Typically a ``csv.writer``.
    max_samples : int, optional
        Maximum number of rows to write. Defaults to the module-level ``limit``.

    Returns
    -------
    int
        Number of rows actually written.

    Notes
    -----
    Papers whose lookup or write fails are reported and skipped; they do not
    count towards ``max_samples``, so the number of OpenAlex requests can be
    higher than the number of rows written.
    """
    if max_samples is None:
        max_samples = limit
    if max_samples <= 0:
        return 0

    mention_name = mention['name']
    mention_id = mention['alias']
    mention_group = mention['group']

    written = 0
    for _, row in raw_df[raw_df['ID'] == mention_id].iterrows():
        try:
            info = getTopicsAndAuthors(row['doi'])
            authors_string = ','.join(info['authors'])
            authors_string_oa = ','.join(info['authors_openAlex'])
            concepts_string = ','.join(info['concepts'])
            writer.writerow([mention_name, mention_id, mention_group,
                             row['doi'], row['text'], authors_string, authors_string_oa, concepts_string])
        except Exception as error:
            # Best effort: one bad DOI must not abort the sampling, but the
            # failure is reported instead of being silently discarded.
            # (KeyboardInterrupt is deliberately not caught.)
            print(f"Skipping DOI {row.get('doi')!r} of mention {mention_id}: {error!r}")
            continue
        written += 1
        # Stop as soon as exactly `max_samples` rows have been written.
        if written >= max_samples:
            break
    return written

In [165]:
import csv
#raw_df[raw_df['ID'] == 'SM27626']
#getSampleForVariation("SM27626")

# Separate analysis on potential problematic mentions
# NON-PROBLEMATIC: SKLEARN
# Problematic ones are: Pandas, PRISM (done separately), Activity, sets, rhinoceros, slingshot, flip, mem, earth, eva
# Problematic ones are found because there are at least 2 package managers with that information. 
# But maybe the clustering is correct.

# Analysis:
# 1) Get the mention variations for a mention.
# 2) Sample publications for each of those mentions.
# 3) Enrich the mentions with topics and authors.
# 4) Validate and add to corpus. Maybe try and find the target URL too.

# Mention whose variations are sampled into sample_data.csv.
target_mention = "star"

# The encoding is set explicitly: author names and paper text contain non-ASCII
# characters, and with a platform-default encoding such rows could fail to be
# written (the failure would then be swallowed by getSampleForMention).
with open('sample_data.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file,quoting=csv.QUOTE_ALL, delimiter=';')
    writer.writerow(['Mention Name', 'Mention ID', 'Mention Group', 'DOI', 'Text', 'Authors', 'Authors_oa','Concepts'])
    # All known variations (aliases) of the target mention.
    mention_list = get_mention_variations(target_mention)
    # Write a sample of enriched publications for each variation.
    for mention in mention_list:
        getSampleForMention(mention, writer)