In [None]:
pip install arxiv
pip install PyPDF2

In [None]:
# import arxiv
import pandas as pd
import requests
# from PyPDF2 import PdfReader
import time
import re
import ast
import numpy as np

# pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Fetch and clean Data

## Fetch papers' data from arXiv

Define a list of search queries related to various topics in AI and ML.

In [None]:
search_query_list = ['Knowledge graph', 'Natural Language Processing','Graph Neural Network', 'Attention Networks','Contrastive Learning','Transformers','Generative Adversarial Net','Computer Vision','Artificial intelligence','Knowledge Graph Embedding']

search_list = []

# Loop through each query in the search_query_list and perform a search on arXiv.
# For each query, it fetches up to 600 results sorted by relevance and appends the results to search_list.
for elem in search_query_list:
  search = arxiv.Search(
    query=elem,
    max_results=600,
    sort_by=arxiv.SortCriterion.Relevance
  )
  search_list.append(search.results())


  search_list.append(search.results())


Stock arXiv data in a dictionary

In [None]:

# Initialize a dictionary to store information about arXiv papers, with keys for various details such as Title, Summary, 
# Published Date, Updated Date, PDF URL, DOI, Journal Reference, Primary Category, Categories, Comments, and Authors.

data_arxiv = {
    # 'Paper_ID': [],
    'Title': [],
    'Summary': [],
    'Published_Date': [],
    'Updated_Date': [],
    'PDF_URL': [],
    'DOI': [],
    'Journal_Ref': [],
    'Primary_Category': [],
    'Categories': [],
    'Comments': [],
    # 'Report_No': [],
    'Authors_arXiv': [],
}

# Iterate over each set of results from the search queries stored in search_list.
for results in search_list:
  for result in results:
  # Check if the result is not empty, then extract and append details of each paper to the data_arxiv dictionary.
    if result:
    # data_arxiv['Paper_ID'].append(result.entry_id)
      data_arxiv['Title'].append(result.title)
      data_arxiv['Summary'].append(result.summary)
      data_arxiv['Published_Date'].append(result.published)
      data_arxiv['Updated_Date'].append(result.updated)
      data_arxiv['PDF_URL'].append(result.pdf_url)
      data_arxiv['DOI'].append(result.doi if result.doi else 'N/A')
      data_arxiv['Journal_Ref'].append(result.journal_ref if result.journal_ref else 'N/A')
      data_arxiv['Primary_Category'].append(result.primary_category)
      data_arxiv['Categories'].append(', '.join(result.categories))
      data_arxiv['Comments'].append(result.comment if result.comment else 'N/A')
      # data['Report_No'].append(result.report_no if result.report_no else 'N/A')
      data_arxiv['Authors_arXiv'].append(', '.join([author.name for author in result.authors]))

      # time.sleep(2)
    else:
      continue

Stock arXiv data in a dataframe

In [None]:
df_arxiv2 = pd.DataFrame(data_arxiv)

df_arxiv2.head()

Unnamed: 0,Title,Summary,Published_Date,Updated_Date,PDF_URL,DOI,Journal_Ref,Primary_Category,Categories,Comments,Authors_arXiv
0,Knowledge Graphs: Opportunities and Challenges,With the explosive growth of artificial intell...,2023-03-24 12:10:42+00:00,2023-03-24 12:10:42+00:00,http://arxiv.org/pdf/2303.13948v1,,,cs.AI,cs.AI,"43pages, 5 figures, 3 tables","Ciyuan Peng, Feng Xia, Mehdi Naseriparsa, Fran..."
1,Assisted Knowledge Graph Authoring: Human-Supe...,"Encyclopedic knowledge graphs, such as Wikidat...",2024-01-15 13:51:00+00:00,2024-01-15 13:51:00+00:00,http://arxiv.org/pdf/2401.07683v1,10.1145/3627508.3638340,,cs.CL,cs.CL,accepted at CHIIR 2024,"Marcel Gohsen, Benno Stein"
2,Construction and Application of Teaching Syste...,Through the combination of crowdsourcing knowl...,2020-10-18 14:26:10+00:00,2020-10-18 14:26:10+00:00,http://arxiv.org/pdf/2010.08995v1,10.1007/978-981-15-1956-7_3,4th China Conference on Knowledge Graph and Se...,cs.DB,"cs.DB, cs.AI, cs.CL",Number of references:15 Classification code:90...,"Jinta Weng, Ying Gao, Jing Qiu, Guozhu Ding, H..."
3,Fast Knowledge Graph Completion using Graphics...,Knowledge graphs can be used in many areas rel...,2023-07-22 12:00:54+00:00,2023-07-22 12:00:54+00:00,http://arxiv.org/pdf/2307.12059v1,,,cs.AI,"cs.AI, cs.DB, cs.LG",,"Chun-Hee Lee, Dong-oh Kang, Hwa Jeon Song"
4,Joint Embedding Learning of Educational Knowle...,As an efficient model for knowledge organizati...,2019-11-20 09:05:11+00:00,2019-12-23 14:52:03+00:00,http://arxiv.org/pdf/1911.08776v2,10.1007/978-3-030-41099-5_12,Artificial Intelligence Supported Educational ...,cs.CL,"cs.CL, cs.AI, cs.LG",,"Siyu Yao, Ruijie Wang, Shen Sun, Derui Bu, Jun..."


In [None]:
len(set(df_arxiv2["PDF_URL"]))

5715

Remove duplicate rows in df_arxiv2 based on the 'PDF_URL' column, keeping only unique entries and storing the result in df_arxiv_dup.

In [None]:
df_arxiv_dup = df_arxiv2.drop_duplicates(subset='PDF_URL')

In [None]:
df_arxiv_dup.shape

(5715, 11)

Split the dataframe df_arxiv_dup into smaller dataframes to manage large data chunks for further processing.


In [None]:
df1 = df_arxiv_dup[:1000]
df2 = df_arxiv_dup[1000:2000]
df3 = df_arxiv_dup[2000:3000]
df4 = df_arxiv_dup[3000:4000]
df5 = df_arxiv_dup[4000:5000]
df6 = df_arxiv_dup[5000:]

Save each split dataframe to separate CSV files for easy data handling and storage

In [None]:
df1.to_csv('df1.csv')
df2.to_csv('df2.csv')
df3.to_csv('df3.csv')
df4.to_csv('df4.csv')
df5.to_csv('df5.csv')
df6.to_csv('df6.csv')

## Fetch additional papers' details from Semantic Scholar and full paper pdf

Building a comprehensive dataset of research papers by merging data from a local DataFrame and external sources, enriching it with additional information like citation counts, references, full-text content, and keywords, making it suitable for further analysis or machine learning tasks.


Functions used in process

In [None]:
# Function to fetch paper details from Semantic Scholar
def fetch_from_semantic_scholar(title):

    url = f"https://api.semanticscholar.org/graph/v1/paper/search?query={title}&fields=authors,venue,fieldsOfStudy,citationCount,citations,references,referenceCount"
    response = requests.get(url)

    i=0
    while response.status_code != 200 or i<=5:
      response = requests.get(url)
      i+=1

    if response.status_code == 200:
        papers = response.json().get('data', [])
        if papers:
            return papers[0]
    return None

# Function to extract keywords from full paper
def extract_keywords(full_text):

    keywords_pattern = re.compile(r'\bKeywords\b\s*(.*)', re.IGNORECASE)
    keywords_match = keywords_pattern.search(full_text[:5000])
    keywords = 'N/A'
    if keywords_match:
        start = keywords_match.start()
        end = start + len(keywords_match.group(0))
        lines = full_text[start:].split('\n')
        keywords = lines[0]
        if len(lines) > 1:
            keywords += ' ' + lines[1]
            if 'Introduction' not in lines[2] or 'INTRODUCTION' not in lines[2]:
              keywords += ' ' + lines[2]

    return keywords

**The 3 codes below must be executed one time for each different DataFrame segment (df1 to df6), to ensure that all papers are processed and stored in the data dictionary.** 
*  Only the first (df1), second (df2), and sixth (df6) DataFrames are used in this code. This selection was made due to time constraints, as processing all DataFrames would take too much time.*

In [None]:
# The data dictionary is initialized with various keys corresponding to different attributes of research papers
data = {
    'Paper_ID': [],
    'Title': [],
    'Summary': [],
    'Published_Date': [],
    'Updated_Date': [],
    'PDF_URL': [],
    'DOI': [],
    'Journal_Ref': [],
    'Primary_Category': [],
    'Categories': [],
    'Comments': [],
    'Full_Text': [],
    'Citation_Count': [],
    # 'Citations': [],
    'References': [],
    'Venue': [],
    'Keywords': [],
    'Authors_sccho':[],
    'nbPages' : []
}

# The code iterates over each row of the specific DataFrame segment (e.g., df6) using iterrows(). 
# For each row, it appends relevant values (e.g., title, summary, DOI) from the DataFrame to the corresponding lists in the data dictionary.
for index, row in df6.iterrows():
    
    data['Title'].append(row['Title'])
    data['Summary'].append(row['Summary'])
    data['Published_Date'].append(row['Published_Date'])
    data['Updated_Date'].append(row['Updated_Date'])
    data['PDF_URL'].append(row['PDF_URL'])
    data['DOI'].append(row['DOI'])
    data['Journal_Ref'].append(row['Journal_Ref'])
    data['Primary_Category'].append(row['Primary_Category'])
    data['Categories'].append(row['Categories'])
    data['Comments'].append(row['Comments'])


    ##  The function is called with the paper's title to retrieve additional information about the paper (e.g., citation count, references, authors).
    paper_details = fetch_from_semantic_scholar(row['Title'])
    if paper_details:
        data['Paper_ID'].append(paper_details.get('paperId'))
        data['Citation_Count'].append(paper_details.get('citationCount', 0))
        # data['Citations'].append([
        #     {'citing_paper_id': citation['paperId'], 'citing_paper_title': citation['title']}
        #     for citation in paper_details.get('citations', [])
        # ])
        data['References'].append([
            {'reference_id': reference['paperId'], 'reference_title': reference['title']}
            for reference in paper_details.get('references', [])
        ])
        data['Venue'].append(paper_details.get('venue', {}))
        data['Authors_sccho'].append([
            {'authorId': author['authorId'],'name': author['name'], 'affiliation': author.get('affiliation', 'N/A')}
            for author in paper_details.get('authors', [])
        ])
    else:
        data['Paper_ID'].append('N/A')
        data['Citation_Count'].append(0)
        data['References'].append([])
        data['Venue'].append('N/A')
        data['Authors_sccho'].append([])

    ## Fetch full text from PDF URL
    pdf_url = row['PDF_URL'].replace('abs', 'pdf')
    full_text = ''
    nb_page = 0
    if pdf_url:
        try:
            response = requests.get(pdf_url, stream=True)
            if response.status_code == 200:
                with open("temp.pdf", "wb") as f:
                    f.write(response.content)
                with open("temp.pdf", "rb") as f:
                    pdf_reader = PdfReader(f)
                    nb_page = len(pdf_reader.pages)
                    for page in pdf_reader.pages:
                        full_text += page.extract_text() or ''
        except Exception as e:
            print(f"Error fetching PDF from {pdf_url}: {e}")

    ## Extract keywords from beginning of the full text 
    keywords = extract_keywords(full_text)
    data['Keywords'].append(keywords)
    data['nbPages'].append(nb_page)
    data['Full_Text'].append(full_text)

    # Sleep for 1 seconds to avoid hitting API rate limits
    time.sleep(1)





In [None]:
# Create a DataFrame
df = pd.DataFrame(data)

df.head()

Unnamed: 0,Paper_ID,Title,Summary,Published_Date,Updated_Date,PDF_URL,DOI,Journal_Ref,Primary_Category,Categories,Comments,Full_Text,Citation_Count,References,Venue,Keywords,Authors_sccho
0,8423b0efb07abbb121678556901119203308c1c3,Graph Entropy Guided Node Embedding Dimension ...,Graph representation learning has achieved gre...,2021-05-07 11:40:29+00:00,2021-08-31 08:07:41+00:00,http://arxiv.org/pdf/2105.03178v5,,,cs.LG,"cs.LG, cs.AI",,Graph Entropy Guided Node Embedding Dimension ...,21,[{'reference_id': '5ca519392bdd6b0536d1ac5a66a...,International Joint Conference on Artificial I...,,"[{'authorId': '2142400413', 'name': 'Gongxu Lu..."
1,87d3b93d74876383d1a3e603aa6e115ac0e96e37,Individual and Structural Graph Information Bo...,Out-of-distribution (OOD) graph generalization...,2023-06-28 03:52:41+00:00,2023-06-28 03:52:41+00:00,http://arxiv.org/pdf/2306.15902v1,,,cs.LG,"cs.LG, cs.AI, cs.CV",Accepted by IEEE Transactions on Knowledge and...,IEEE TRANSACTIONS ON KNOWLEDGE AND DATA ENGINE...,5,[{'reference_id': '4a3e6608cb75efdff66dc38ae97...,IEEE Transactions on Knowledge and Data Engine...,,"[{'authorId': '2155557947', 'name': 'Ling Yang..."
2,7bb4cd36de648ca44cc390fe886ee70a4b2ad1ac,Knowledge Graph Embedding using Graph Convolut...,Knowledge graph embedding methods learn embedd...,2021-02-14 17:19:44+00:00,2021-02-14 17:19:44+00:00,http://arxiv.org/pdf/2102.07200v1,,,cs.LG,"cs.LG, cs.AI",,Knowledge Graph Embedding using Graph Convolut...,8,[{'reference_id': '3cf68c49d5745cdcbdefe6f637b...,arXiv.org,"KEYWORDS Knowledge Graphs, Embedding, Graph At...","[{'authorId': '32216985', 'name': 'Nasrullah S..."
3,faff441c3b17abb7ff70f019dcb8b9e38cce70f6,Simple and Effective Relation-based Embedding ...,Relational graph neural networks have garnered...,2022-05-13 06:02:13+00:00,2022-05-13 06:02:13+00:00,http://arxiv.org/pdf/2205.06456v1,,,cs.CL,cs.CL,Accepted by IJCAI 2022,Simple and Effective Relation-based Embedding ...,8,[{'reference_id': '858e8322acf7a59c3919d354dde...,International Joint Conference on Artificial I...,,"[{'authorId': '2109616592', 'name': 'Huijuan W..."
4,4159c233b9c94062c25d21f3869cf6dafd6822a9,Inductive Link Prediction in Knowledge Graphs ...,Link prediction is a crucial research area in ...,2023-12-16 02:26:09+00:00,2023-12-16 02:26:09+00:00,http://arxiv.org/pdf/2312.10293v1,,,cs.LG,cs.LG,,Pre-print version\nInductive Link Prediction i...,0,[{'reference_id': '8c720eb939259971bcb87f46c92...,arXiv.org,,"[{'authorId': '1390468139', 'name': 'Canlin Zh..."


In [None]:
# Save DataFrame to CSV

df.to_csv('df6_papers.csv', encoding='utf-8-sig',  escapechar='\\', sep=',',errors='ignore')


( if needed !) Function to clean text by removing non-ASCII characters. 

In [None]:
# Function to replace non-UTF-8 characters with a placeholder
def clean_text(text):
    if isinstance(text, str):
        return ''.join(char if ord(char) < 128 else '?' for char in text)
    else:
        return text

# Clean the DataFrame, excluding columns with Timestamp objects
df_cleaned = df.applymap(clean_text)

# Save the cleaned DataFrame to CSV with specified encoding
df_cleaned.to_csv('df6_papers.csv', encoding='utf-8-sig', index=False, escapechar='\\', sep=',')


Load specific DataFrames from CSV files.

In [None]:
df1_full = pd.read_csv('df1_papers.csv', delimiter=',', index_col=0)
df2_full = pd.read_csv('df2_papers.csv', delimiter=',', index_col=0)
df6_full = pd.read_csv('df6_papers.csv', delimiter=',', index_col=0)

Concatenate the selected DataFrames (df1, df2, and df6) along the rows (axis=0).


In [None]:
df = pd.concat([df1_full,df2_full,df6_full], axis = 0)

Reset the index of the concatenated DataFrame, dropping the old index to maintain a clean, sequential order.


In [None]:
df.reset_index(drop=True, inplace=True)

Save the concatenated DataFrame to a CSV file named 'df1_2_6_full.csv'.


In [None]:
df.to_csv('df1_2_6_full.csv', encoding='utf-8-sig', index=False, escapechar='\\', sep=',')

## Clean Keywords column and Extract keywords

In [3]:
df = pd.read_csv("df1_2_6_full.csv", index_col=0)
df.shape

(1706, 21)

The following code cleans the Keywords column of a the dataFrame (df) by removing unwanted keywords and setting specific rows to NaN if they are in the null_rows list or contain keywords that need to be excluded.

In [None]:
# null_rows contains the indices of rows where the 'Keywords' column either contains 'NaN' strings or where the keywords were mistakenly 
# extracted from sections like the introduction, rather than being proper keywords from the text.

null_rows = [17,21, 22, 27, 51, 59, 65, 119, 933,982,121,2135,2402,1982, 123, 126, 149, 150,2670, 153, 160, 167, 179, 188, 190, 195, 198, 207, 220, 224, 228, 233, 239, 249, 257, 282,307,319,320,329,361,380, 427, 454, 488, 514, 529, 550, 572, 592, 601, 602, 634, 641, 644, 646, 657, 663, 673, 674, 690, 703, 713, 717, 718, 739, 743, 747, 748, 751, 753, 762, 768, 791, 796, 798, 800, 810, 829, 830, 834, 838, 846, 853, 855, 862, 876, 879, 905, 930, 944, 950, 953, 957, 957, 958, 981, 984, 1017, 1021, 1022, 1027, 1051, 1059, 1065, 1119, 1121, 1123, 1126, 1149, 1150, 1153, 1160, 1167, 1179, 1188, 1190, 1195, 1198, 1207, 1220, 1224, 1228, 1233, 1239, 1257, 1282, 1307, 1319, 1320, 1329, 1354, 1361, 1380, 1427, 1454, 1488, 1529, 1550, 1571, 1572, 1592, 1601, 1634, 1646, 1657, 1663, 1673, 1674, 1690, 1703, 1706,1713, 1717, 1718, 1739, 1743, 1747, 1748, 1751, 1753, 1755, 1762, 1768, 1791, 1796, 1798, 1800, 1810, 1829, 1830, 1834, 1838, 1846, 1853, 1855, 1862, 1876, 1879, 1905, 1930, 1933, 1950, 1953, 1957, 1958, 1968, 1981, 1984, 2050, 2077, 2079, 2120, 2121, 2165, 2268, 2335, 2367, 2449, 2467, 2478, 2491, 2501, 2584, 2634, 2651, 2695]

# Keywords that need to be removed from the 'Keywords' column if found.
keywords_to_remove = [
    'Introduction', 'Email', 'Abstract', 'MOTS-CLÉS', 'DOI',
    'Corresponding', 'arXiv', 'https', 'Preprint', 'This',
    'ACM Reference Format', 'Received', 'ACMReference Format', '1','1.','*','Please', 'I. INTRODUCTION','This version','the study',
    "∗this",'∗we thank','∗Electronic', '*these'
]

keywords_to_remove = [kw.lower() for kw in keywords_to_remove]

# Function to clean the 'Keywords' column by checking for null rows and removing unwanted keywords.
def clean_keywords(keyword, index):
    if pd.isnull(keyword) or index in null_rows:
        return np.nan
    keyword_lower = keyword.lower()
    for kw in keywords_to_remove:
        if kw in keyword_lower:
            keyword_lower = keyword_lower.split(kw)[0].strip()
            if not keyword_lower:  # If keyword becomes empty, set to NaN
                return np.nan
            return keyword_lower
    return keyword

df['Keyword_semi'] = df.apply(lambda row: clean_keywords(row['Keywords'], row.name), axis=1)


In [None]:
df[~df["Keywords"].isna()][["Keywords","Keyword_semi"]]

Unnamed: 0,Keywords,Keyword_semi
0,"Keywords: Knowledge graphs, articial intellig...","Keywords: Knowledge graphs, articial intellig..."
1,"KEYWORDS Knowledge Graph Construction, Semanti...","KEYWORDS Knowledge Graph Construction, Semanti..."
2,Keywords: Educational K nowledge Graph; learn...,Keywords: Educational K nowledge Graph; learn...
3,Keywords Knowledge Graph Embedding ·Knowedge G...,keywords knowledge graph embedding ·knowedge g...
4,Keywords Educational Technologies Knowledge G...,keywords educational technologies knowledge g...
...,...,...
2701,Keywords : Scholarly Knowledge Graph Knowledge...,Keywords : Scholarly Knowledge Graph Knowledge...
2706,Keywords: Event-centric knowledge graphs ·Repr...,keywords: event-centric knowledge graphs ·repr...
2711,KEYWORDS knowledge graph completion; relation ...,keywords knowledge graph completion; relation ...
2713,Keywords: Knowledge Representation and Reasoni...,keywords: knowledge representation and reasoni...


In [None]:
# A little overview to check for further cleaning
df.iloc[[864, 933, 982, 1864, 1891, 1982, 2135, 2143, 2157, 2402, 2561, 2666, 2670]][["Keyword_semi"]]

Unnamed: 0,Keyword_semi
864,keywords: process mining ·natural language gen...
933,
982,
1864,keywords: process mining ·natural language gen...
1891,"keywords: dependency distance, natural languag..."
1982,
2135,
2143,"keywords: artiﬁcial intelligence, deep neural ..."
2157,"keywords: quantitative causality, information ..."
2402,


This regex pattern identifies and removes occurrences of the word 'keywords' (case-insensitive) followed by optional punctuation (like colons, dashes, or em dashes) from the 'Keyword_semi' column, and then trims any leading or trailing spaces from the cleaned keyword strings.


In [None]:
# Define a regex to Clean the keywords column
pattern = r'(?i)\bkeywords\s*[:\-—]*\s*\b'

df['Keywords_cleaned'] = df['Keyword_semi'].str.replace(pattern, '', regex=True).str.strip()


This code removes any occurrences of the character '1' and the letter 'i' from the 'Keywords_cleaned' column, and then filters the DataFrame to display only the non-null values from both 'Keyword_semi' and 'Keywords_cleaned' columns.


In [None]:
df['Keywords_cleaned'] = df['Keywords_cleaned'].str.replace('1','')
df['Keywords_cleaned'] = df['Keywords_cleaned'].str.replace('i','')
df[~df["Keywords_cleaned"].isna()][["Keyword_semi","Keywords_cleaned"]]

Unnamed: 0,Keyword_semi,Keywords_cleaned
0,"Keywords: Knowledge graphs, articial intellig...","Knowledge graphs, artcal ntellgence, graph em..."
1,"KEYWORDS Knowledge Graph Construction, Semanti...","Knowledge Graph Constructon, Semantc Web, Info..."
2,Keywords: Educational K nowledge Graph; learn...,Educatonal K nowledge Graph; learn ng analyss ...
3,keywords knowledge graph embedding ·knowedge g...,knowledge graph embeddng ·knowedge graph compl...
4,keywords educational technologies knowledge g...,educatonal technologes knowledge graph embedd...
...,...,...
2701,Keywords : Scholarly Knowledge Graph Knowledge...,Scholarly Knowledge Graph Knowledge Graph Embe...
2706,keywords: event-centric knowledge graphs ·repr...,event-centrc knowledge graphs ·representaton l...
2711,keywords knowledge graph completion; relation ...,knowledge graph completon; relaton extrapolato...
2713,keywords: knowledge representation and reasoni...,"knowledge representaton and reasonng, robotcs,..."


Spliting the 'Keywords_cleaned' string into a list of individual keywords based on specified delimiters (comma, semicolon, middle dot, or double space). It also removes any leading or trailing whitespace from each keyword. The resulting list is stored in a new column 'Keywords_list'.


In [None]:
# Define a function to split each string in the column
def split_keywords(keyword):
    if pd.isnull(keyword):
        return np.nan
    # Use regular expression to split by , ; · or double space
    split_pattern = re.compile(r',|;|·|  ')
    keywords_list = split_pattern.split(keyword)
    # Remove leading and trailing whitespace from each keyword
    keywords_list = [kw.strip() for kw in keywords_list]
    return keywords_list

df['Keywords_list'] = df['Keywords_cleaned'].apply(split_keywords)

In [None]:
df[~df["Keywords_cleaned"].isna()][["Keywords_cleaned","Keywords_list"]]

Unnamed: 0,Keywords_cleaned,Keywords_list
0,"Knowledge graphs, artcal ntellgence, graph em...","[Knowledge graphs, artcal ntellgence, graph e..."
1,"Knowledge Graph Constructon, Semantc Web, Info...","[Knowledge Graph Constructon, Semantc Web, Inf..."
2,Educatonal K nowledge Graph; learn ng analyss ...,"[Educatonal K nowledge Graph, learn ng analyss..."
3,knowledge graph embeddng ·knowedge graph compl...,"[knowledge graph embeddng, knowedge graph comp..."
4,educatonal technologes knowledge graph embedd...,[educatonal technologes knowledge graph embed...
...,...,...
2701,Scholarly Knowledge Graph Knowledge Graph Embe...,[Scholarly Knowledge Graph Knowledge Graph Emb...
2706,event-centrc knowledge graphs ·representaton l...,"[event-centrc knowledge graphs, representaton ..."
2711,knowledge graph completon; relaton extrapolato...,"[knowledge graph completon, relaton extrapolat..."
2713,"knowledge representaton and reasonng, robotcs,...","[knowledge representaton and reasonng, robotcs..."


Creating a flattened list of all keywords from the 'Keywords_list' column, by iterating through each sublist (which represents individual keyword lists) and extracting each keyword, while excluding any NaN values.


In [None]:
all_keywords = [item for sublist in df['Keywords_list'].dropna() for item in sublist]

In [None]:
all_keywords = set(all_keywords)

In [45]:
keywords_final = ['knowledge graph','neural-symbolic integraton','recommender system','explainable predicton','uzbek language',
                  'embedding','completion','knowledge graph reasoning','reinforcement learnng','causal graph',
                  'natural language processing','knowledge graph completion','emotional analysis','prompt-tuning','literate programmng',
                  'financial regulaton','entity retrieval','graph mining','large computing systems','human-robot interacton',
                  'probabilistic medical knowledge graph','hausa language','biomedical graph','link prediction','clustering',
                  'meta-knowledge transfer','low resource language','spatio-temporal data','reasoning','similarity join','neural networks',
                  'graph convolutional networks','ontology learning','dialogue systems','Cross-domain recommendation','language models',
                  'word similarity','knowledge enhanced recommendation','representation learning','ontologies','artificial intelligence',
                  'contrastve learnng','visualization','interface design','timestamp distribution','natural language generation',
                  'knowledge graph embeddings','neural networks','text classification','protégé', 'security data','information extraction',
                  'automatic programming','random walk','software engineering','knowledge bases','semantic query','language acquisition',
                  'meta learning','medical expert systems','large language models','generative ai','negative sampling','human language technologies',
                  'semantic web','semantic data integration','recursive neural networks','event prediction','knowledge transfer',
                  'graph summarization','dynamic graph embedding','classification','probabilistic', 'bipartite heterogeneous graph','stop words' ,
                  'bayesian networks', 'biomedical knowledge graphs','multi-task learning','language processng','time-aware knowledge graph',
                  'question answering','graph neural networks','pretrained language models','process mining','hierarchical clustering',
                  'intelligence modelling','database model', 'time series forecasting','relational learning','autonomous intelligence',
                  'collaboratve filtering','alignment relationship','systems security','explainable ai','retrieval augmentation',
                  'smart manufacturing','machine translation','turkish natural language processing','informaton theory','entity alignment',
                  'attention network','ontology embedding','word embedding','owl2vec','artificial intelligence','neo4j','lstm','chatgpt',
                  'entity linking','knowledge base question answering','transformer','entity clustering','triple embedding',
                  'graph representaton learnng','inductive knowledge graph','cyber security','reinforcement learning','machine learning',
                  'event extraction','natural language inference','social network analysis','quantum computing','kolmo gorov complexity',
                  'negative sampling','chatbot','deep learning','sentiment analysis','prediction','meta learning','interaction graph','rdf2vec',
                  'computer vision','representation learning','natural language','non-sampling machne learnng','graph reasoning','text-to-speech',
                  'pattern recognition','healthcare systems','digital marketing','translators','node embedding','deep neural networks',
                  'tensor decomposition','label classification','sql databases','stemmer','temporal knowledge graph','covid 19','text2kg',
                  'federated learning','internet of things','multi-agent systems','encoders','data science','markov model','term memory',
                  'decision making','data augmentaton','adversarial transfer learning','similarity','label propagation','visual encoding',
                  'graph augmentation','causal inference','data Annotation','retrieval augmented generation','uncertainty', 'graph kernels',
                  'named entity recognition','hyperparameter tuning','commonsense reasoning','link prediction','construction','message passing',
                  'node embedding', 'information extraction','word2vec','self-supervised learning','support vector machine','robotics',
                  'augmented llms','knowledge acquisition','text classification','unsupervised learning','tensor factorization',
                  'document classification','graph convolutional network','causality','temporal','extraction','prediction','tensor',
                  'semantic', 'language model','data extraction','question answering','query','link prediction','human-robot','interaction',
                  'graph augmentation'
                  ]

commun_keywords = [
    'Ontology', 'Semantic Web', 'Triple Store', 'Resource Description Framework', 'Linked Data', 'Entity Linking', 'Relation Extraction',
  'Graph Embedding', 'Text Mining', 'Sentiment Analysis', 'Named Entity Recognition ', 'Machine Translation', 'Text Classification',
  'Word Embedding', 'Language Modeling', 'Text Generation', 'Tokenization', 'Node Classification', 'Graph Convolutional Networks',
  'Graph Attention Networks', 'Graph Embedding', 'Graph Representation Learning', 'Link Prediction', 'Graph Clustering', 'Network Embedding',
  'Message Passing', 'Self-Attention', 'Multi-Head Attention', 'Transformer', 'Attention Mechanism', 'Sequence-to-Sequence', 'Context Vector',
  'Neural Machine Translation', 'Attention Weights', 'Scaled Dot-Product Attention', 'Self-Supervised Learning', 'Representation Learning',
  'Contrastive Loss', 'Positive Pairs', 'Negative Pairs', 'Data Augmentation', 'Embedding Space', 'Siamese Networks', 'Mutual Information',
  'Transformer Architecture', 'Encoder-Decoder', 'Self-Attention', 'Generative Pre-trained Transformer', 'Pre-training', 'Fine-Tuning', 'Sequence Modeling',
  'Attention Head', 'Generator', 'Discriminator', 'Adversarial Loss', 'Data Augmentation', 'Image Synthesis', 'Style Transfer', 'GAN', 'Latent Space', 'Conditional GANs',
  'Object Detection', 'Image Segmentation', 'Convolutional Neural Networks' , 'Image Classification', 'Feature Extraction', 'Image Recognition',
  'Face Detection', 'Image Augmentation', 'Optical Flow', 'Image Generation', 'Machine Learning', 'Deep Learning', 'Reinforcement Learning', 'Neural Networks',
  'Autonomous Systems', 'AI Applications', 'Intelligent Systems', 'AI Algorithms', 'Predictive Modeling', 'Embedding Space', 'Link Prediction',
  'Triple Embedding', 'Translational Models', 'Graph Neural Networks', 'Relational Learning', 'Knowledge Base Completion', 'Entity Embedding',
  'Relation Embedding', 'Matrix Factorization', 'Tensor Decomposition','generative adversarial network', 'graph', 'graphs', 'markov',
    'knowledge', 'language', 'neurotechnologies'
]

commun_keywords = [kw.lower() for kw in commun_keywords]

keywords_final.extend(commun_keywords)


In [46]:
len(set(keywords_final))

262

In [48]:
# Clean the keywords List
keywords_final = set(keywords_final)


Cleaning up the DataFrame structure by Removing the 'Extracted_Keywords' column from the DataFrame 'df'

In [49]:
df.drop(columns=["Extracted_Keywords"], inplace=True)

Extracting keywords from the abstracts of papers that **do not have any keywords**.

It takes an abstract and a list of keywords, checks for each keyword's presence in the abstract, and collects those that match into a set to avoid duplicates. The extracted keywords are then returned as a list. The function is applied to the 'Summary' column of the DataFrame 'df', and the results are stored in a new column named 'Extracted_Keywords'.


In [50]:
# Extract from the abstract of the papers with none keywords

def extract_keywords(abstract, keywords_list):
    if pd.isna(abstract):
        return []
    abstract_keywords = set()
    for keyword in keywords_list:
        if keyword.lower() in abstract.lower():
            abstract_keywords.add(keyword.lower())
    return list(abstract_keywords)

# Apply the function to the 'Abstracts' column
df['Extracted_Keywords'] = df['Summary'].apply(lambda x: extract_keywords(x, keywords_final))


Dropping rows from 'df' that have NaN values in the 'Paper_ID' and 'Title' columns. The index is reset to ensure a clean sequential index

In [None]:
df_na = df.dropna(subset=["Paper_ID"])
df_na = df_na.dropna(subset=["Title"])

df_na.reset_index(drop=True, inplace=True)

df_na.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2704 entries, 0 to 2703
Data columns (total 21 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Paper_ID            2704 non-null   object 
 1   Title               2704 non-null   object 
 2   Summary             2704 non-null   object 
 3   Published_Date      2704 non-null   object 
 4   Updated_Date        2704 non-null   object 
 5   PDF_URL             2704 non-null   object 
 6   DOI                 328 non-null    object 
 7   Journal_Ref         254 non-null    object 
 8   Primary_Category    2704 non-null   object 
 9   Categories          2704 non-null   object 
 10  Comments            1650 non-null   object 
 11  Full_Text           2685 non-null   object 
 12  Citation_Count      2704 non-null   float64
 13  References          2704 non-null   object 
 14  Venue               2595 non-null   object 
 15  Keywords            1059 non-null   object 
 16  Author

In [51]:
df = df_na

df.columns

Index(['Paper_ID', 'Title', 'Summary', 'Published_Date', 'Updated_Date',
       'PDF_URL', 'DOI', 'Journal_Ref', 'Primary_Category', 'Categories',
       'Comments', 'Full_Text', 'Citation_Count', 'References', 'Venue',
       'Keywords', 'Authors_sccho', 'Keyword_semi', 'Keywords_cleaned',
       'Keywords_list', 'Extracted_Keywords'],
      dtype='object')

Removing duplicate rows from the DataFrame 'df', keeping only the first occurrence of each unique value in the 'PDF_URL' column. 

In [None]:
df.drop_duplicates(subset=['PDF_URL'], inplace=True)

In [None]:
df.shape

(1708, 21)

In [None]:
df[df.duplicated(subset=["Paper_ID"], keep=False)]

Unnamed: 0,Paper_ID,Title,Summary,Published_Date,Updated_Date,PDF_URL,DOI,Journal_Ref,Primary_Category,Categories,Comments,Full_Text,Citation_Count,References,Venue,Keywords,Authors_sccho,Keyword_semi,Keywords_cleaned,Keywords_list,Extracted_Keywords


In [None]:
df.reset_index(drop=True, inplace=True)

Saving the cleanded file 

In [36]:
df.to_csv("df1_2_6_full.csv", sep=',')

# Preparing Data for Knowledge Graph construction

This section prepares data for constructing a knowledge graph based on the defined ontology.
The ontology outlines the relationships and properties of entities involved in academic research papers, including classes such as Paper, Author, Venue, Category, and Keyword. 
Each entity is represented in its own DataFrame, capturing relevant attributes and their relationships, 
enabling the representation of academic papers and their metadata in a structured format for further analysis.


"""
The ontology defines six main entities: 
- **Paper**: Represents academic research papers.
- **Author**: Represents the authors of the papers.
- **Venue**: Represents the publication venues for the papers.
- **Category**: Represents the categories under which the papers are classified.
- **Keyword**: Represents keywords associated with the papers.
- **VenueType**: Represents different types of venues.

Additionally, it specifies relationships between these entities, such as:
- Papers being published in venues.
- Authors writing papers.
- Papers belonging to categories.
- Papers having associated keywords.

DataFrames have been created for each entity and relationship, facilitating the construction of the knowledge graph.
"""


## create category_df by webscrapping https://arxiv.org/category_taxonomy

In [None]:
from bs4 import BeautifulSoup
import requests

In [None]:
url = 'https://arxiv.org/category_taxonomy'

response = requests.get(url)

categoryID = []
categoryName = []
categoryDescription = []

# Check if the request was successful
if response.status_code == 200:
    html_content = response.content

    soup = BeautifulSoup(html_content, 'html.parser')

    title = soup.title.string
    print(f"Title of the webpage: {title}")

    divs = soup.find_all('div', class_='column is-one-fifth')
    for div in divs:
      h4 = div.find('h4')
      if h4:
        category_id = h4.text.split(' ')[0]
        category_name = h4.find('span').text.strip('()')
        category_description = div.find_next_sibling('div').find('p').text

        categoryID.append(category_id)
        categoryName.append(category_name)
        categoryDescription.append(category_description)
else:
    print(f"Failed to retrieve the webpage. Status code: {response.status_code}")

category_df = pd.DataFrame({
    'categoryID': categoryID,
    'categoryName': categoryName,
    'categoryDescription': categoryDescription
})


Title of the webpage: Category Taxonomy


In [None]:
category_df.tail()

Unnamed: 0,categoryID,categoryName,categoryDescription
150,stat.CO,Computation,"Algorithms, Simulation, Visualization"
151,stat.ME,Methodology,"Design, Surveys, Model Selection, Multiple Tes..."
152,stat.ML,Machine Learning,"Covers machine learning papers (supervised, un..."
153,stat.OT,Other Statistics,Work in statistics that does not fit into the ...
154,stat.TH,Statistics Theory,"stat.TH is an alias for math.ST. Asymptotics, ..."


In [None]:
category_df.to_csv("category_df.csv")

## Create entities and relationships entities

**Papers Dataframe**

In [None]:
paper_df = df.copy()
paper_df.head()

Unnamed: 0,Paper_ID,Title,Summary,Published_Date,Updated_Date,PDF_URL,DOI,Journal_Ref,Primary_Category,Categories,Comments,Full_Text,Citation_Count,References,Venue,Keywords,Authors_sccho,Keyword_semi,Keywords_cleaned,Keywords_list,Extracted_Keywords
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,Knowledge Graphs: Opportunities and Challenges,With the explosive growth of artificial intell...,2023-03-24 12:10:42+00:00,2023-03-24 12:10:42+00:00,http://arxiv.org/pdf/2303.13948v1,,,cs.AI,cs.AI,"43pages, 5 figures, 3 tables",Knowledge Graphs: Opportunities and\nChallenge...,60.0,[{'reference_id': 'ca89c3461a35e1341c48e43aa7b...,Artificial Intelligence Review,"Keywords: Knowledge graphs, articial intellig...","[{'authorId': '1726114642', 'name': 'Ciyuan Pe...","Keywords: Knowledge graphs, articial intellig...","Knowledge graphs, artcal ntellgence, graph em...","[Knowledge graphs, artcal ntellgence, graph e...","[embedding, artificial intelligence, knowledge..."
1,af063194baee08931f12facdffc573e4bd29c463,Assisted Knowledge Graph Authoring: Human-Supe...,"Encyclopedic knowledge graphs, such as Wikidat...",2024-01-15 13:51:00+00:00,2024-01-15 13:51:00+00:00,http://arxiv.org/pdf/2401.07683v1,10.1145/3627508.3638340,,cs.CL,cs.CL,accepted at CHIIR 2024,Assisted Knowledge Graph Authoring: Human-Supe...,0.0,[{'reference_id': '3c331785f7f5629563fbc4aabfe...,Conference on Human Information Interaction an...,"KEYWORDS Knowledge Graph Construction, Semanti...","[{'authorId': '22247218', 'name': 'Marcel Gohs...","KEYWORDS Knowledge Graph Construction, Semanti...","Knowledge Graph Constructon, Semantc Web, Info...","[Knowledge Graph Constructon, Semantc Web, Inf...","[knowledge graph, natural language, construction]"
2,f2aca403781821618d7a69d98a9fbddccb236081,Construction and Application of Teaching Syste...,Through the combination of crowdsourcing knowl...,2020-10-18 14:26:10+00:00,2020-10-18 14:26:10+00:00,http://arxiv.org/pdf/2010.08995v1,10.1007/978-981-15-1956-7_3,4th China Conference on Knowledge Graph and Se...,cs.DB,"cs.DB, cs.AI, cs.CL",Number of references:15 Classification code:90...,Construction and Application of Teaching Syste...,10.0,[{'reference_id': '2582ab7c70c9e7fcb84545944eb...,China Conference on Knowledge Graph and Semant...,Keywords: Educational K nowledge Graph; learn...,"[{'authorId': '1473373066', 'name': 'Jinta Wen...",Keywords: Educational K nowledge Graph; learn...,Educatonal K nowledge Graph; learn ng analyss ...,"[Educatonal K nowledge Graph, learn ng analyss...","[knowledge graph, construction]"
3,9921e8a2ed1d7ddaf6fdc6491d7cea321073c0b3,Fast Knowledge Graph Completion using Graphics...,Knowledge graphs can be used in many areas rel...,2023-07-22 12:00:54+00:00,2023-07-22 12:00:54+00:00,http://arxiv.org/pdf/2307.12059v1,,,cs.AI,"cs.AI, cs.DB, cs.LG",,FAST KNOWLEDGE GRAPH COMPLETION USING GRAPHICS...,0.0,[{'reference_id': '9e712910ff1b81b6fe2a5e71515...,J. Parallel Distributed Comput.,Keywords Knowledge Graph Embedding ·Knowedge G...,"[{'authorId': '2118658200', 'name': 'Chun-Hee ...",keywords knowledge graph embedding ·knowedge g...,knowledge graph embeddng ·knowedge graph compl...,"[knowledge graph embeddng, knowedge graph comp...","[embedding, knowledge graph completion, knowle..."
4,fb08c6d5f8fe80cfbd9efa75b710f326f4846a0f,Joint Embedding Learning of Educational Knowle...,As an efficient model for knowledge organizati...,2019-11-20 09:05:11+00:00,2019-12-23 14:52:03+00:00,http://arxiv.org/pdf/1911.08776v2,10.1007/978-3-030-41099-5_12,Artificial Intelligence Supported Educational ...,cs.CL,"cs.CL, cs.AI, cs.LG",,Noname manuscript No.\n(will be inserted by th...,6.0,[{'reference_id': '697b988f8ad6337a979b9ec3475...,Advances in Analytics for Learning and Teaching,Keywords Educational Technologies Knowledge G...,"[{'authorId': '2067824542', 'name': 'Siyu Yao'...",keywords educational technologies knowledge g...,educatonal technologes knowledge graph embedd...,[educatonal technologes knowledge graph embed...,"[embedding, knowledge graph, gan, graph embedd..."


In [None]:
paper_df.columns

Index(['Paper_ID', 'Title', 'Summary', 'Published_Date', 'Updated_Date',
       'PDF_URL', 'DOI', 'Journal_Ref', 'Primary_Category', 'Categories',
       'Comments', 'Full_Text', 'Citation_Count', 'References', 'Venue',
       'Keywords', 'Authors_sccho', 'Keyword_semi', 'Keywords_cleaned',
       'Keywords_list', 'Extracted_Keywords'],
      dtype='object')

In [None]:
columns_to_drop = ["Updated_Date", "Journal_Ref","Primary_Category", "Categories", "Comments", "References", "Venue","Keywords", "Authors_sccho",'Keyword_semi', 'Keywords_cleaned',
       'Keywords_list', 'Extracted_Keywords']

paper_df.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
paper_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2704 entries, 0 to 2703
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Paper_ID        2704 non-null   object 
 1   Title           2704 non-null   object 
 2   Summary         2704 non-null   object 
 3   Published_Date  2704 non-null   object 
 4   PDF_URL         2704 non-null   object 
 5   DOI             328 non-null    object 
 6   Full_Text       2685 non-null   object 
 7   Citation_Count  2704 non-null   float64
dtypes: float64(1), object(7)
memory usage: 169.1+ KB


In [None]:
paper_df["yearPublication"] = pd.to_datetime(paper_df["Published_Date"]).dt.year
paper_df.head()

Unnamed: 0,Paper_ID,Title,Summary,Published_Date,PDF_URL,DOI,Full_Text,Citation_Count,yearPublication
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,Knowledge Graphs: Opportunities and Challenges,With the explosive growth of artificial intell...,2023-03-24 12:10:42+00:00,http://arxiv.org/pdf/2303.13948v1,,Knowledge Graphs: Opportunities and\nChallenge...,60.0,2023
1,af063194baee08931f12facdffc573e4bd29c463,Assisted Knowledge Graph Authoring: Human-Supe...,"Encyclopedic knowledge graphs, such as Wikidat...",2024-01-15 13:51:00+00:00,http://arxiv.org/pdf/2401.07683v1,10.1145/3627508.3638340,Assisted Knowledge Graph Authoring: Human-Supe...,0.0,2024
2,f2aca403781821618d7a69d98a9fbddccb236081,Construction and Application of Teaching Syste...,Through the combination of crowdsourcing knowl...,2020-10-18 14:26:10+00:00,http://arxiv.org/pdf/2010.08995v1,10.1007/978-981-15-1956-7_3,Construction and Application of Teaching Syste...,10.0,2020
3,9921e8a2ed1d7ddaf6fdc6491d7cea321073c0b3,Fast Knowledge Graph Completion using Graphics...,Knowledge graphs can be used in many areas rel...,2023-07-22 12:00:54+00:00,http://arxiv.org/pdf/2307.12059v1,,FAST KNOWLEDGE GRAPH COMPLETION USING GRAPHICS...,0.0,2023
4,fb08c6d5f8fe80cfbd9efa75b710f326f4846a0f,Joint Embedding Learning of Educational Knowle...,As an efficient model for knowledge organizati...,2019-11-20 09:05:11+00:00,http://arxiv.org/pdf/1911.08776v2,10.1007/978-3-030-41099-5_12,Noname manuscript No.\n(will be inserted by th...,6.0,2019


In [None]:
paper_df.rename(columns={"Paper_ID": "paperID", "Title": "title", "Summary": "abstract", "Published_Date":"dateSubmission", "PDF_URL":"url", "Full_Text":"fullText", "Citation_Count":"nbCitations"}, inplace=True)

In [None]:
# add the columns nbPages
def add_nbPages(url):
    pdf_url = url.replace('abs', 'pdf')
    nb_page =0
    if pdf_url:
        try:
            response = requests.get(pdf_url, stream=True)
            if response.status_code == 200:
                with open("temp.pdf", "wb") as f:
                    f.write(response.content)
                with open("temp.pdf", "rb") as f:
                    pdf_reader = PdfReader(f)
                    nb_page = len(pdf_reader.pages)
        except Exception as e:
            print(f"Error fetching PDF from {pdf_url}: {e}")

    return nb_page

In [None]:
paper_df["nbPages"] = paper_df["url"].apply(add_nbPages)



In [None]:
paper_df.tail()

Unnamed: 0,paperID,title,abstract,dateSubmission,url,DOI,fullText,nbCitations,yearPublication,nbPages
2699,4d336e6510d96daef3339e9a4d5323af367d9ea2,Zero-Shot Relational Learning for Multimodal K...,Relational learning is an essential task in th...,2024-04-09 11:14:45+00:00,http://arxiv.org/pdf/2404.06220v1,,Zero-Shot Relational Learning for Multimodal K...,0.0,2024,11
2700,2583b76e110b76860a0f0f87c3b65d44b9fc243a,Hierarchical Aggregations for High-Dimensional...,We investigate the problem of multiplex graph ...,2023-12-28 05:39:33+00:00,http://arxiv.org/pdf/2312.16834v1,10.1109/TKDE.2023.3305809,"JOURNAL OF LATEX CLASS FILES, VOL. X, NO. X, M...",0.0,2023,12
2701,1319080d08861fbf20ba8c6e3157223e43d7b09d,Table-Top Scene Analysis Using Knowledge-Super...,"In this paper, we propose a probabilistic meth...",2020-02-19 20:10:38+00:00,http://arxiv.org/pdf/2002.08417v1,,arXiv:2002.08417v1 [cs.CV] 19 Feb 2020Table-...,5.0,2020,17
2702,d64befb74b871231418a0e4dd1bd2b5ae0eb9e0e,A First Experiment on Including Text Literals ...,Graph embedding models produce embedding vecto...,2018-07-31 11:18:18+00:00,http://arxiv.org/pdf/1807.11761v1,,A First Experiment on Including Text Literals\...,12.0,2018,4
2703,09a416c88b9fb9b090cd82e883d46a53b4a8cece,Improving Molecule Generation and Drug Discove...,Recent advancements in generative models have ...,2024-02-13 20:58:36+00:00,http://arxiv.org/pdf/2402.08790v1,,Improving Molecule Generation and Drug Discove...,0.0,2024,12


In [None]:
paper_df.drop_duplicates(inplace=True)

In [None]:
paper_df[paper_df.duplicated(keep=False)]

Unnamed: 0,paperID,title,abstract,dateSubmission,url,DOI,fullText,nbCitations,yearPublication,nbPages


In [None]:
paper_df.shape

(1708, 10)

In [None]:
paper_df.to_csv("paper_df.csv", sep=',')

**Author Dataframe**

In [None]:
authors_df = df[["Authors_sccho"]]

In [None]:
authors_df.head()

Unnamed: 0,Authors_sccho
0,"[{'authorId': '1726114642', 'name': 'Ciyuan Pe..."
1,"[{'authorId': '22247218', 'name': 'Marcel Gohs..."
2,"[{'authorId': '1473373066', 'name': 'Jinta Wen..."
3,"[{'authorId': '2118658200', 'name': 'Chun-Hee ..."
4,"[{'authorId': '2067824542', 'name': 'Siyu Yao'..."


In [None]:
# ast.literal_eval safely evaluates a string containing a Python literal or container display to convert it to a list

authors_df['Authors_sccho'] = authors_df['Authors_sccho'].apply(ast.literal_eval)

df_exploded = authors_df.explode('Authors_sccho')

author_df = pd.json_normalize(df_exploded['Authors_sccho'])

author_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  authors_df['Authors_sccho'] = authors_df['Authors_sccho'].apply(ast.literal_eval)


Unnamed: 0,authorId,name,affiliation
0,1726114642,Ciyuan Peng,
1,2143633281,Feng Xia,
2,2149932639,Mehdi Naseriparsa,
3,2052329,Francesco Osborne,
4,22247218,Marcel Gohsen,


In [None]:
author_df.rename(columns={"authorId": "authorID", "name":"authorName", "affiliation":"authorAffiliation"}, inplace=True)
author_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6351 entries, 0 to 6350
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   authorID           6317 non-null   object
 1   authorName         6350 non-null   object
 2   authorAffiliation  6350 non-null   object
dtypes: object(3)
memory usage: 149.0+ KB


In [None]:
author_df.drop_duplicates(subset=['authorName'],inplace=True)

In [None]:
author_df[author_df['authorName']=='Hao Peng']

Unnamed: 0,authorID,authorName,authorAffiliation
43,2138443697,Hao Peng,


In [None]:
author_df[author_df.duplicated(subset=['authorName'])]

Unnamed: 0,authorID,authorName,authorAffiliation


In [None]:
author_df.reset_index(drop=True, inplace=True)
author_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5133 entries, 0 to 5132
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   authorID           5108 non-null   object
 1   authorName         5132 non-null   object
 2   authorAffiliation  5132 non-null   object
dtypes: object(3)
memory usage: 120.4+ KB


In [None]:
author_df.to_csv("author_df.csv", sep=',')

**Written_by Dataframe**

In [None]:
written_df = df[["Paper_ID", "Authors_sccho","PDF_URL"]]

In [None]:
written_df.shape

(1706, 3)

In [None]:
written_df.head()

Unnamed: 0,Paper_ID,Authors_sccho,PDF_URL
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,"[{'authorId': '1726114642', 'name': 'Ciyuan Pe...",http://arxiv.org/pdf/2303.13948v1
1,af063194baee08931f12facdffc573e4bd29c463,"[{'authorId': '22247218', 'name': 'Marcel Gohs...",http://arxiv.org/pdf/2401.07683v1
2,f2aca403781821618d7a69d98a9fbddccb236081,"[{'authorId': '1473373066', 'name': 'Jinta Wen...",http://arxiv.org/pdf/2010.08995v1
3,9921e8a2ed1d7ddaf6fdc6491d7cea321073c0b3,"[{'authorId': '2118658200', 'name': 'Chun-Hee ...",http://arxiv.org/pdf/2307.12059v1
4,fb08c6d5f8fe80cfbd9efa75b710f326f4846a0f,"[{'authorId': '2067824542', 'name': 'Siyu Yao'...",http://arxiv.org/pdf/1911.08776v2


In [None]:
written_df['Authors_sccho'] = written_df['Authors_sccho'].apply(ast.literal_eval)

# Explode the 'Authors_sccho' column to have one author per row
written_df_exploded = written_df.explode('Authors_sccho').reset_index(drop=True)

# Normalize the 'Authors_sccho' column to separate the dictionary into columns
authors_df = pd.json_normalize(written_df_exploded['Authors_sccho'])

# Combine the original 'Paper_ID' with the normalized authors DataFrame
written_by_df = pd.concat([written_df_exploded[['Paper_ID','PDF_URL']], authors_df], axis=1)

written_by_df = written_by_df[['Paper_ID','PDF_URL', 'authorId', 'name']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  written_df['Authors_sccho'] = written_df['Authors_sccho'].apply(ast.literal_eval)


In [None]:
written_by_df.rename(columns={"Paper_ID": "paperID", "name":"authorName","PDF_URL":"url"}, inplace=True)
written_by_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6345 entries, 0 to 6344
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   paperID     6345 non-null   object
 1   url         6345 non-null   object
 2   authorId    6311 non-null   object
 3   authorName  6344 non-null   object
dtypes: object(4)
memory usage: 198.4+ KB


In [None]:
written_by_df[written_by_df['authorName']=='Hao Peng']

Unnamed: 0,paperID,url,authorId,authorName
43,9ea5874d261359e287eabb735de38a8edba1e091,http://arxiv.org/pdf/2302.05019v1,2138443697.0,Hao Peng
126,ef1ef85adc38356023ada0b4abc3db4d395587dd,http://arxiv.org/pdf/2105.07615v2,,Hao Peng
3836,a5a13071e3f834ec90c0c69087b84363abc2fb38,http://arxiv.org/pdf/2205.01757v1,1818378366.0,Hao Peng
5496,8423b0efb07abbb121678556901119203308c1c3,http://arxiv.org/pdf/2105.03178v5,49349645.0,Hao Peng


In [None]:
written_by_df.drop_duplicates(subset=["url","authorName"], inplace=True)

In [None]:
written_by_df[written_by_df.duplicated(subset=["paperID","authorName"],keep=False)]

Unnamed: 0,paperID,url,authorId,authorName


In [None]:
written_by_df.reset_index(drop=True, inplace=True)
written_by_df.to_csv("written_by_df.csv", sep=',')

**Belongs_to Dataframe**

In [None]:
belongs_to_df = df[["Paper_ID", "Primary_Category"]]

In [None]:
belongs_to_df.rename(columns={"Paper_ID": "paperID", "Primary_Category":"categoryID"}, inplace=True)
belongs_to_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1706 entries, 0 to 1706
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   paperID     1706 non-null   object
 1   categoryID  1706 non-null   object
dtypes: object(2)
memory usage: 40.0+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  belongs_to_df.rename(columns={"Paper_ID": "paperID", "Primary_Category":"categoryID"}, inplace=True)


In [None]:

belongs_to_df.to_csv("belongs_to_df.csv", sep=',')

**Cites Dataframe**

In [None]:
cites = df[["Paper_ID", "References"]]

In [None]:
cites.head(1)

Unnamed: 0,Paper_ID,References
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,[{'reference_id': 'ca89c3461a35e1341c48e43aa7b...


In [None]:
def test_literal_eval(val):
    try:
        ast.literal_eval(val)
        return True
    except (ValueError, SyntaxError):
        return False

# Identify problematic rows
problematic_rows = cites[~cites['References'].apply(test_literal_eval)]
problematic_rows

Unnamed: 0,Paper_ID,References
81,6a4717480671318c55a4ea21844f77e1567ecb0c,[{'reference_id': '102110351b44dfa8164ab020bb9...
97,f68e10067596e87c16c7ba383468077f77ed8fc1,[{'reference_id': '6a4717480671318c55a4ea21844...
364,102110351b44dfa8164ab020bb98e6ba6936b7fd,[{'reference_id': '07679278169720ce0385a169801...
667,9c62b72cfcb44a509af0c9401648f31c21e794e6,[{'reference_id': '8bb1a24840c6a31ec6bf3b88f26...
672,02773bd1d9b2c832f33de207082aeb6e98a67d22,[{'reference_id': '1c6dcfbc059feededfac9163890...
796,042764240d8047b7df96fd6e3519c17076250b9b,[{'reference_id': '0271547a46c1aecdd69b6825597...
875,ae2e4d847334966e3b9404f40debc8be81547c60,[{'reference_id': '9536904b4d9e10e386a71bde487...
1007,eafc3160627576c2cd419c7ceb4b25ff38f94953,[{'reference_id': '44b00080c36822b83f91cbe1c13...
1031,b4bf49605f0c4c250c95a1da58b70b0e93a248b2,[{'reference_id': '873259438b927a32db66682f346...
1045,7fdf6b191c6bb665dcf9dabce61b924e067a8a65,[{'reference_id': 'e1d2f2a717aa03280126f87c8e5...


In [None]:
def safe_literal_eval(val):
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        ast.Continue


In [None]:
cites['References'] = cites['References'].apply(safe_literal_eval)

cites_df_exploded = cites.explode('References').reset_index(drop=True)

references_df = pd.json_normalize(cites_df_exploded['References'])

cites_df = pd.concat([cites_df_exploded['Paper_ID'], references_df], axis=1)

In [None]:
cites_df.columns = ['citingPaperID', 'citedPaperID', 'citedPaperName']
cites_df.head()

Unnamed: 0,citingPaperID,citedPaperID,citedPaperName
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,ca89c3461a35e1341c48e43aa7bcc4b8671c23c3,ScheRe: Schema Reshaping for Enhancing Knowled...
1,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,f19405b6237b988aacaa6384f68140e723d32d1b,Relational Structure-Aware Knowledge Graph Rep...
2,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,8e5598257e1abae4c22e6834e75376940ed2d84e,Step by step: A hierarchical framework for mul...
3,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,0b4fe03cbc7a51d706b68a0ac6600482e54bc344,Constructing social media links to formal lear...
4,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,da6245620d3b68bdc2ad902e3d8dc1b5425b226f,"Knowledge Graphs: Introduction, History and, P..."


In [None]:
cites_df[cites_df["citingPaperID"]=='6a4717480671318c55a4ea21844f77e1567ecb0c']

Unnamed: 0,citingPaperID,citedPaperID,citedPaperName


In [None]:
cites_df.dropna(subset=["citedPaperID"], inplace=True)

In [None]:
cites_df.shape

(66791, 3)

In [None]:
cites_df.reset_index(drop=True, inplace=True)
cites_df.to_csv("cites_df.csv", sep=',')

**Venue Dataframe**

In [None]:
df.columns

Index(['Paper_ID', 'Title', 'Summary', 'Published_Date', 'Updated_Date',
       'PDF_URL', 'DOI', 'Journal_Ref', 'Primary_Category', 'Categories',
       'Comments', 'Full_Text', 'Citation_Count', 'References', 'Venue',
       'Keywords', 'Authors_sccho', 'Keyword_semi', 'Keywords_cleaned',
       'Keywords_list', 'Extracted_Keywords'],
      dtype='object')

In [None]:
venue_df = df[["Venue","Journal_Ref"]]

In [None]:
venue_df.shape

(1706, 2)

In [None]:
venue_df.drop_duplicates(subset=["Venue"], inplace=True)
venue_df.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  venue_df.drop_duplicates(subset=["Venue"], inplace=True)


(427, 2)

In [None]:
venue_df.dropna(subset=["Venue"], inplace=True)
venue_df.reset_index(drop=True, inplace=True)
venue_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 426 entries, 0 to 425
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Venue        426 non-null    object
 1   Journal_Ref  86 non-null     object
dtypes: object(2)
memory usage: 6.8+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  venue_df.dropna(subset=["Venue"], inplace=True)


In [None]:
venue_df["Venue"].value_counts().reset_index()

Unnamed: 0,Venue,count
0,Artificial Intelligence Review,1
1,Biotechnology,1
2,The international journal of intelligence and ...,1
3,iScience,1
4,Algorithms,1
5,IEEE Vehicular Technology Magazine,1
6,Artificial Life,1
7,IEEE Transactions on Human-Machine Systems,1
8,Journal of New Music Research,1
9,EthNLP@EACL,1


In [None]:
def update_type(row):
    if 'journal' in row['Venue'].lower() or 'transaction' in row['Venue'].lower():
        return 'journal'
    elif 'conference' in row['Venue'].lower():
        return 'conference'
    elif 'workshop' in row['Venue'].lower():
        return 'workshop'
    else:
        if row['venueType'] == 'journal':
          return 'conference'
        else:
          return 'journal'

# Apply the function to the DataFrame
venue_df['venueType'] = venue_df.apply(update_type, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  venue_df['venueType'] = venue_df.apply(update_type, axis=1)


In [None]:
venue_df

Unnamed: 0,Venue,Journal_Ref,venueType,impactFactor
0,Artificial Intelligence Review,,journal,
1,Conference on Human Information Interaction an...,,conference,
2,China Conference on Knowledge Graph and Semant...,4th China Conference on Knowledge Graph and Se...,conference,
3,J. Parallel Distributed Comput.,,conference,
4,Advances in Analytics for Learning and Teaching,Artificial Intelligence Supported Educational ...,journal,
5,arXiv.org,,conference,
6,Fusion,,journal,
7,IEEE International Conference on Data Engineering,,conference,
8,North American Chapter of the Association for ...,,conference,
9,ACM Computing Surveys,,journal,


In [None]:
venue_df[["Venue","venueType"]].to_csv("venue_df.csv", sep=',')

**Keyword Dataframe**

In [37]:
keywords_final = list(set(keywords_final))

In [38]:
keyword_df = pd.DataFrame({'keywordName':keywords_final})

In [39]:
keyword_df.to_csv("keyword_df.csv", sep=',')

**VenueType Dataframe**

In [None]:
venueType_df = pd.DataFrame({
    "typeName" : ['Journal', 'Conference', 'arXiv','Workshop']
})

In [None]:
venueType_df.to_csv("venueType_df.csv", sep=',')

**hasKeyword Dataframe**

In [52]:
hasKeyword_df = df[["Paper_ID","Extracted_Keywords"]]
hasKeyword_df.head()

Unnamed: 0,Paper_ID,Extracted_Keywords
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,"[knowledge acquisition, graphs, knowledge grap..."
1,af063194baee08931f12facdffc573e4bd29c463,"[graphs, knowledge, natural language, graph, k..."
2,f2aca403781821618d7a69d98a9fbddccb236081,"[graphs, knowledge, graph, knowledge graph, co..."
3,9921e8a2ed1d7ddaf6fdc6491d7cea321073c0b3,"[graphs, knowledge graph completion, knowledge..."
4,fb08c6d5f8fe80cfbd9efa75b710f326f4846a0f,"[graphs, knowledge, graph, embedding, gan, kno..."


In [56]:
# hasKeyword_df['Extracted_Keywords'] = hasKeyword_df['Extracted_Keywords'].apply(ast.literal_eval)

hasKeyword_df = hasKeyword_df.explode('Extracted_Keywords').reset_index(drop=True)

In [57]:
hasKeyword_df.head()

Unnamed: 0,Paper_ID,Extracted_Keywords
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,knowledge acquisition
1,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,graphs
2,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,knowledge graph completion
3,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,knowledge
4,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,knowledge graph embeddings


In [58]:
hasKeyword_df.columns = ['paperID', 'keywordName']
hasKeyword_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10834 entries, 0 to 10833
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   paperID      10834 non-null  object
 1   keywordName  10787 non-null  object
dtypes: object(2)
memory usage: 169.4+ KB


In [63]:
hasKeyword_df.dropna(subset=["keywordName"], inplace=True)

In [64]:
hasKeyword_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10787 entries, 0 to 10833
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   paperID      10787 non-null  object
 1   keywordName  10787 non-null  object
dtypes: object(2)
memory usage: 252.8+ KB


In [65]:
hasKeyword_df.reset_index(drop=True, inplace=True)
hasKeyword_df.to_csv("hasKeyword_df.csv", sep=',')

**publishedIn Dataframe**

In [66]:
df.columns

Index(['Paper_ID', 'Title', 'Summary', 'Published_Date', 'Updated_Date',
       'PDF_URL', 'DOI', 'Journal_Ref', 'Primary_Category', 'Categories',
       'Comments', 'Full_Text', 'Citation_Count', 'References', 'Venue',
       'Keywords', 'Authors_sccho', 'Keyword_semi', 'Keywords_cleaned',
       'Keywords_list', 'Extracted_Keywords'],
      dtype='object')

In [67]:
publishedIn_df = df[["Paper_ID", "Venue"]]
publishedIn_df.head()

Unnamed: 0,Paper_ID,Venue
0,42b323b6df79e49c9bf5cee2a91398a7fa3d594d,Artificial Intelligence Review
1,af063194baee08931f12facdffc573e4bd29c463,Conference on Human Information Interaction an...
2,f2aca403781821618d7a69d98a9fbddccb236081,China Conference on Knowledge Graph and Semant...
3,9921e8a2ed1d7ddaf6fdc6491d7cea321073c0b3,J. Parallel Distributed Comput.
4,fb08c6d5f8fe80cfbd9efa75b710f326f4846a0f,Advances in Analytics for Learning and Teaching


In [69]:
publishedIn_df.to_csv("publishedIn_df.csv", sep=',')

**Venue Dataframe and hasType Dataframe**

In [71]:
venue_type_df = pd.read_csv("venue_df.csv", index_col=0)
venue_type_df.head()

Unnamed: 0,Venue,venueType;;;
0,Artificial Intelligence Review,journal;;;
1,Conference on Human Information Interaction an...,conference;;;
2,China Conference on Knowledge Graph and Semant...,conference;;;
3,J. Parallel Distributed Comput.,journal;;;
4,Advances in Analytics for Learning and Teaching,journal;;;


In [86]:
venue_type_df.drop_duplicates(subset=['Venue'],inplace=True)

In [91]:
venue_type_df.reset_index(drop=True, inplace=True)

In [88]:
venue_type_df.shape

(405, 2)

In [92]:
venue_df = venue_type_df[["Venue"]]
venue_df.columns=['venueName']
venue_df.head()

Unnamed: 0,venueName
0,Artificial Intelligence Review
1,Conference on Human Information Interaction an...
2,China Conference on Knowledge Graph and Semant...
3,J. Parallel Distributed Comput.
4,Advances in Analytics for Learning and Teaching


In [93]:
venue_df.to_csv('venue_df.csv', sep=',')

In [94]:
venueType_df = venue_type_df.copy()

venueType_df.head()

Unnamed: 0,Venue,venueType;;;
0,Artificial Intelligence Review,journal;;;
1,Conference on Human Information Interaction an...,conference;;;
2,China Conference on Knowledge Graph and Semant...,conference;;;
3,J. Parallel Distributed Comput.,journal;;;
4,Advances in Analytics for Learning and Teaching,journal;;;


In [95]:
venueType_df.columns = ['venueName','typeName']
venueType_df['typeName'] = venueType_df['typeName'].str.replace(";","")
venueType_df.head()

Unnamed: 0,venueName,typeName
0,Artificial Intelligence Review,journal
1,Conference on Human Information Interaction an...,conference
2,China Conference on Knowledge Graph and Semant...,conference
3,J. Parallel Distributed Comput.,journal
4,Advances in Analytics for Learning and Teaching,journal


In [96]:
venueType_df.to_csv("hasType_df.csv",sep=',')