In [1]:
import pandas as pd
import json
import ast
import csv
from pprint import pprint
from tqdm.notebook import tqdm, trange

import re
from datetime import datetime
import numpy as np
import faker
import yake

In [None]:
np.random.seed(42)

## Initial Data Collection

In [None]:
df = pd.read_csv('all_data.csv')
df.info()

In [None]:
language = "en"
max_ngram_size = 2
deduplication_threshold = 0.5
deduplication_algo = "seqm"
windowSize = 5
numOfKeywords = 3

kw_extractor = yake.KeywordExtractor(
    lan=language,
    n=max_ngram_size,
    dedupLim=deduplication_threshold,
    dedupFunc=deduplication_algo,
    windowsSize=windowSize,
    top=numOfKeywords,
)

keywords = []
for abstract in tqdm(df['abstract'].tolist()):
    results = kw_extractor.extract_keywords(abstract)
    keywords.append([result[0] for result in results])
df['keywords'] = keywords

In [None]:
df.to_csv('all_data_with_keywords.csv', index=False)

In [4]:
df = pd.read_csv('all_data_with_keywords.csv')
# Convert string representations to actual data structures
df['publicationVenue'] = df['publicationVenue'].apply(ast.literal_eval)
df['journal'] = df['journal'].apply(ast.literal_eval)
df['authors'] = df['authors'].apply(ast.literal_eval)
df['keywords'] = df['keywords'].apply(ast.literal_eval)

In [5]:
df.drop_duplicates(subset=['paperId'], inplace=True)

In [6]:
# trim to 100 characters
df['abstract'] = df['abstract'].apply(lambda x: x[:100])
df.dropna(inplace=True)

In [7]:
papers = df[['paperId', 'title', 'abstract', 'publicationDate', 'year']]
papers.rename(columns={'paperId': 'paperID'}, inplace=True)
papers.loc[:,'abstract'] = papers['abstract'].str.replace(r'[^\w\s]', '', regex=True)
papers.loc[:, 'title'] = papers['title'].str.replace(r'[^\w\s]', '', regex=True)
papers.dropna(inplace=True)
papers.to_csv('papers.csv', index=False)
papers = pd.read_csv('papers.csv')
papers.dropna(inplace=True)
papers.to_csv('papers.csv', index=False)
papers.info()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.rename(columns={'paperId': 'paperID'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  papers.dropna(inplace=True)


<class 'pandas.core.frame.DataFrame'>
Index: 120789 entries, 0 to 120813
Data columns (total 5 columns):
 #   Column           Non-Null Count   Dtype 
---  ------           --------------   ----- 
 0   paperID          120789 non-null  object
 1   title            120789 non-null  object
 2   abstract         120789 non-null  object
 3   publicationDate  120789 non-null  object
 4   year             120789 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 5.5+ MB


In [8]:
keywords = df['keywords'].explode().dropna().unique()
with open("keywords.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(['keyword'])
    for keyword in keywords:
        writer.writerow([keyword])

In [9]:
keyword_mapping = df[['paperId', 'keywords']].explode('keywords').dropna()
keyword_mapping.info()
keyword_mapping.to_csv('keyword_mapping.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 362301 entries, 0 to 276083
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   paperId   362301 non-null  object
 1   keywords  362301 non-null  object
dtypes: object(2)
memory usage: 8.3+ MB


In [10]:
# # Function to get random cited paper IDs
# def get_random_cited_papers(paper_id, all_paper_ids, min_citations=1, max_citations=10):
#     # Ensure the number of citations does not exceed the number of available papers
#     max_citations = min(max_citations, len(all_paper_ids) - 1)
#     num_citations = np.random.randint(min_citations, max_citations + 1)
#     # Randomly select cited papers, excluding the current paper
#     cited_papers = np.random.choice([pid for pid in all_paper_ids if pid != paper_id], num_citations, replace=False)
#     return cited_papers

# # List to hold tuples of (paperId, citedPaperId)
# citations_list = []

# # Extract all paper IDs as a list
# all_paper_ids = df['paperId'].tolist()

# # Generate cited papers for each paperId
# for paper_id in tqdm(all_paper_ids):
#     cited_paper_ids = get_random_cited_papers(paper_id, all_paper_ids)
#     for cited_paper_id in cited_paper_ids:
#         citations_list.append((paper_id, cited_paper_id))

# # Create a new dataframe for the citations
# citations_df = pd.DataFrame(citations_list, columns=['paperId', 'citedPaperId'])

# # Write the dataframe to a CSV file
# citations_df.to_csv('citations.csv', index=False)
# citations_df.sample(5)

In [11]:
with open('references.json') as f:
    references = json.load(f)

citations = []
for paper in references:
    paper_id = paper['paperId']
    for reference in paper['references']:
        if reference['paperId'] is not None:
            citations.append((paper_id, reference['paperId']))
    for citation in paper['citations']:
        if citation['paperId'] is not None:
            citations.append((citation['paperId'], paper_id))
references_df = pd.DataFrame(citations, columns=['paperID', 'referenceID'])
references_df.drop_duplicates(inplace=True)
references_df.info()
references_df.to_csv('citations.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 404217 entries, 0 to 436911
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   paperID      404217 non-null  object
 1   referenceID  404217 non-null  object
dtypes: object(2)
memory usage: 9.3+ MB


In [12]:
from pandas import json_normalize

# Extract the authors column, which contains nested JSON-like data
# Normalize this data to create a separate DataFrame for authors
authors_expanded = json_normalize(df['authors'].explode())  # Normalize and explode the nested data

# Create the authors DataFrame with unique authors (drop duplicates)
authors_df = authors_expanded[['authorId', 'name']].drop_duplicates().reset_index(drop=True)
authors_df.dropna(inplace=True)
authors_df.rename(columns={'authorId': 'authorID'}, inplace=True)

authors_df.info()
authors_df.to_csv('authors.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 241051 entries, 0 to 241947
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   authorID  241051 non-null  object
 1   name      241051 non-null  object
dtypes: object(2)
memory usage: 5.5+ MB


In [13]:
# Step 1: Expand the DataFrame to include one row per author per paper
expanded_df = df.explode('authors')

# Extract paperID and authorID
expanded_df['paperID'] = expanded_df['paperId']
expanded_df['authorID'] = expanded_df['authors'].apply(lambda x: x['authorId'])

# Initialize 'corresponds' to False
expanded_df['corresponds'] = False

expanded_df.loc[expanded_df.groupby('paperId').cumcount() == 0, 'corresponds'] = True

# Now, we can create the 'writes' DataFrame with just the necessary columns
writes_df = expanded_df[['paperId', 'authorID', 'corresponds']].rename(columns={'paperId': 'paperID'}).reset_index(drop=True)
writes_df.drop_duplicates(subset=['paperID', 'authorID'], inplace=True)
writes_df.dropna(inplace=True)
writes_df.to_csv('writes.csv', index=False)

writes_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 540478 entries, 0 to 543131
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   paperID      540478 non-null  object
 1   authorID     540478 non-null  object
 2   corresponds  540478 non-null  bool  
dtypes: bool(1), object(2)
memory usage: 12.9+ MB


Use 'journal' and 'publicationVenue' fields. if 'publicationVenue'.name.lower() == arxiv then ignore

1. Journal: use publicationVenue.'name' and journal.'volume'. if either not present then ignore; publicationVenue.type == 'journal'
2. Conference: use 'name'. publicationVenue.type == 'conference'
3. Same as conference but either journal.name or publicationVenue.name contains 'workshop'

In [14]:
# Initialize a Faker generator
fake = faker.Faker()

# Generate a random list of 50 names
random_names = [fake.name() for _ in range(100)]
random_cities = [fake.city() for _ in range(100)]

In [15]:
# Journals
# name is neither empty nor is equal to arxiv
# volume should not be None and it should be a numbeer
journals_cond = (df["publicationVenue"].apply(lambda x: x.get("type")) == "journal") & (
    df["journal"].apply(
        lambda x: x.get("name", "") != ""
        and x.get("name").lower() != "arxiv"
        and x.get("volume", "") != ""
        and x.get("volume").isdigit()
    )
)
journals_data = df[journals_cond]

# Extracting journal information
journals = (
    pd.DataFrame(
        {
            "journalID": journals_data["publicationVenue"].apply(lambda x: x.get("id")),
            "name": journals_data["journal"].apply(lambda x: x.get("name")),
            "issn": journals_data["publicationVenue"].apply(lambda x: x.get("issn")),
        }
    )
    .drop_duplicates(subset=["journalID"])
    .reset_index(drop=True)
)
journals.dropna(inplace=True)
journals["editor"] = np.random.choice(random_names, size=len(journals))
journals.info()
journals.to_csv("journals.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 1479 entries, 0 to 1491
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   journalID  1479 non-null   object
 1   name       1479 non-null   object
 2   issn       1479 non-null   object
 3   editor     1479 non-null   object
dtypes: object(4)
memory usage: 57.8+ KB


In [16]:
# Conferences
# name is neither empty nor contains 'workshop' or 'symposium' or 'arxiv'
conferences_cond = (
    df["publicationVenue"].apply(lambda x: x.get("type")) == "conference"
) & df["journal"].apply(
    lambda x: x.get("name") is not None
    and "workshop" not in x.get("name").lower()
    and "symposium" not in x.get("name").lower()
    and "arxiv" not in x.get("name").lower()
)
conferences_data = df[conferences_cond]

# Extracting conference information
conferences = (
    pd.DataFrame(
        {
            "conferenceID": conferences_data["publicationVenue"].apply(
                lambda x: x.get("id")
            ),
            "name": conferences_data["publicationVenue"].apply(lambda x: x.get("name")),
        }
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
conferences["chair"] = np.random.choice(random_names, size=len(conferences))
conferences.info()
conferences.to_csv("conferences.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1242 entries, 0 to 1241
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   conferenceID  1242 non-null   object
 1   name          1242 non-null   object
 2   chair         1242 non-null   object
dtypes: object(3)
memory usage: 29.2+ KB


In [17]:
workshops_cond = (
    df["publicationVenue"].apply(lambda x: x.get("type")) == "conference"
) & df["journal"].apply(
    lambda x: x.get("name") is not None
    and ("workshop" in x.get("name").lower()
    or "symposium" in x.get("name").lower())
)
workshops_data = df[workshops_cond]
# select indices not in conferences
workshops_data = workshops_data[~workshops_data.index.isin(conferences_data.index)]

# Extracting workshop information
workshops = (
    pd.DataFrame(
        {
            "workshopID": workshops_data["publicationVenue"].apply(
                lambda x: x.get("id")
            ),
            "name": workshops_data["publicationVenue"].apply(lambda x: x.get("name")),
        }
    )
    .drop_duplicates()
    .reset_index(drop=True)
)
workshops["chair"] = np.random.choice(random_names, size=len(workshops))
workshops.to_csv('workshops.csv', index=False)
workshops.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 326 entries, 0 to 325
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   workshopID  326 non-null    object
 1   name        326 non-null    object
 2   chair       326 non-null    object
dtypes: object(3)
memory usage: 7.8+ KB


In [18]:
# function to generate a random publication date in a given year in the format YYYY-MM-DD
def random_date(year):
    start_date = datetime(year, 1, 1)
    end_date = datetime(year, 12, 31)
    time_between_dates = end_date - start_date
    days_between_dates = time_between_dates.days
    random_number_of_days = np.random.randint(0, days_between_dates)
    random_date = start_date + pd.Timedelta(days=random_number_of_days)
    return random_date.strftime("%Y-%m-%d")

In [19]:
journals_volume_data = journals_data.copy()
journals_volume_data["volNumber"] = journals_volume_data["journal"].apply(
    lambda x: x.get("volume")
)
journals_volume_data["journalID"] = journals_volume_data["publicationVenue"].apply(
    lambda x: x.get("id")
)
journals_volume_data["volID"] = (
    journals_volume_data["volNumber"].astype(str)
    + "_"
    + journals_volume_data["journalID"]
)
journals_volume_data["publicationDate"] = journals_volume_data["year"].apply(
    lambda x: random_date(int(x))
)

volume_df = journals_volume_data[["volID", "volNumber", "journalID"]].reset_index(
    drop=True
)
paper_volume_df = (
    journals_volume_data[["paperId", "volID"]]
    .rename(columns={"paperId": "paperID"})
    .reset_index(drop=True)
)
volume_df.drop_duplicates(subset=["volID"], inplace=True)

volume_df.info()
paper_volume_df.info()

volume_df.to_csv("volumes.csv", index=False)
paper_volume_df.to_csv("paper_published_in_volume.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 5446 entries, 0 to 25948
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   volID      5446 non-null   object
 1   volNumber  5446 non-null   object
 2   journalID  5446 non-null   object
dtypes: object(3)
memory usage: 170.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25952 entries, 0 to 25951
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   paperID  25952 non-null  object
 1   volID    25952 non-null  object
dtypes: object(2)
memory usage: 405.6+ KB


In [20]:
conferences_cond = (
    df["publicationVenue"].apply(lambda x: x.get("type")) == "conference"
) & df["journal"].apply(
    lambda x: x.get("name") is not None
    and "workshop" not in x.get("name").lower()
    and "symposium" not in x.get("name").lower()
    and "arxiv" not in x.get("name").lower()
    and "corr" not in x.get("name").lower()
    and "abs/" not in x.get("volume", "").lower()
)
conferences_data = df[conferences_cond]

workshops_cond = (
    df["publicationVenue"].apply(lambda x: x.get("type")) == "conference"
) & df["journal"].apply(
    lambda x: x.get("name") is not None
    and ("workshop" in x.get("name").lower()
    or "symposium" in x.get("name").lower())
)
workshops_data = df[workshops_cond]
workshops_data = workshops_data[~workshops_data.index.isin(conferences_data.index)]

# Proceedings
proceedings_data = pd.concat([conferences_data, workshops_data], ignore_index=True)
proceedings_data['conferenceID'] = proceedings_data['publicationVenue'].apply(lambda x: x.get('id'))

In [21]:
# Function to extract edition using regex
def extract_edition(name):
    match = re.search(r'(\d+)(st|nd|rd|th)', name)
    if match:
        return int(match.group(1))
    else:
        match = re.search(r'\b\d{4}\b', name)
        if match:
            return int(match.group())
    return np.nan

# Function to generate random start and end dates within the given year
def generate_dates(year):
    start_date = datetime.strptime(f"{year}-01-01", "%Y-%m-%d") + pd.to_timedelta(np.random.randint(0, 365), 'D')
    end_date = start_date + pd.to_timedelta(np.random.randint(1, 5), 'D')  # End date is 1 to 5 days after start date
    return start_date, end_date


In [22]:
proceedings_data['edition'] = proceedings_data['journal'].apply(lambda x: extract_edition(x.get('name')) if x and 'name' in x else np.nan)
proceedings_data.loc[proceedings_data['edition'].isna(), 'edition'] = proceedings_data['journal'].apply(lambda x: int(x.get('volume')) if x and 'volume' in x and x.get('volume').isdigit() else np.nan)

proceedings_data = proceedings_data.dropna(subset=['edition'])
proceedings_data['proceedingsID'] = proceedings_data['edition'].astype(int).astype(str) + "_" + proceedings_data['conferenceID']
proceedings_data.loc[0:conferences_data.shape[0], 'type'] = 'conference'
proceedings_data.loc[conferences_data.shape[0]:, 'type'] = 'workshop'
proceedings_data['edition'] = proceedings_data['edition'].astype(int)
paper_in_proceedings = proceedings_data[['paperId', 'proceedingsID']].rename(columns={'paperId': 'paperID'}).reset_index(drop=True)

paper_in_proceedings.info()
paper_in_proceedings.to_csv('paper_presented_at_proceedings.csv', index=False)

proceedings_data.drop_duplicates(subset=['proceedingsID'], inplace=True)
proceedings_data['venue'] = np.random.choice(random_cities, size=len(proceedings_data))
proceedings_data[['startDate', 'endDate']] = proceedings_data.apply(lambda x: generate_dates(x['year']), axis=1, result_type="expand")
proceedings_data = proceedings_data[['proceedingsID', 'edition', 'conferenceID', 'type', 'venue', 'startDate', 'endDate']]
proceedings_data.info()
proceedings_data.to_csv('proceedings.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39026 entries, 0 to 39025
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   paperID        39026 non-null  object
 1   proceedingsID  39026 non-null  object
dtypes: object(2)
memory usage: 609.9+ KB
<class 'pandas.core.frame.DataFrame'>
Index: 5007 entries, 0 to 41374
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   proceedingsID  5007 non-null   object        
 1   edition        5007 non-null   int64         
 2   conferenceID   5007 non-null   object        
 3   type           5007 non-null   object        
 4   venue          5007 non-null   object        
 5   startDate      5007 non-null   datetime64[ns]
 6   endDate        5007 non-null   datetime64[ns]
dtypes: datetime64[ns](2), int64(1), object(4)
memory usage: 312.9+ KB


## Sampling

In [23]:
writes = pd.read_csv('writes.csv')
paper_presented_at_proceedings = pd.read_csv('paper_presented_at_proceedings.csv')
proceedings = pd.read_csv('proceedings.csv')
paper_published_in_volume = pd.read_csv('paper_published_in_volume.csv')
authors = pd.read_csv('authors.csv')
papers = pd.read_csv('papers.csv')

filtered_paper_published_in_volume = paper_published_in_volume.sample(5000)

print('Writes', writes.shape[0])
print('Paper in Proc', paper_presented_at_proceedings.shape[0])
print('Paper in Vol', paper_published_in_volume.shape[0])
print('Proc', proceedings.shape[0])
print('Authors', authors.shape[0])
print('Papers', papers.shape[0])

Writes 540478
Paper in Proc 39026
Paper in Vol 25952
Proc 5007
Authors 241051
Papers 120789


In [25]:
# Step 1: Count the number of editions each author has published in for each conference
author_editions_count = (
    writes.merge(paper_presented_at_proceedings, on="paperID")
    .merge(proceedings, on="proceedingsID")
    .groupby(["authorID", "conferenceID"])
    .agg({"edition": "nunique"})
    .reset_index()
)
# Step 2: Filter out authors who have published in less than 4 editions of any conference
author_editions_count_filtered = author_editions_count.groupby('authorID').filter(lambda x: (x['edition'] >= 4).any())
# Step 3: Get the list of authors to keep
authors_to_keep = author_editions_count_filtered['authorID'].unique()
# Step 4: Filter out data points associated with authors not in authors_to_keep
papers_to_keep_from_editions = writes[writes['authorID'].isin(authors_to_keep)]['paperID']
# New Step 5: Include papers that have been published in a volume
papers_to_keep_from_volumes = paper_published_in_volume['paperID'].unique()
# New Step 6: Combine papers to keep from both criteria
papers_to_keep = pd.Series(np.union1d(papers_to_keep_from_editions, papers_to_keep_from_volumes))
# Step 7: Filter out data points associated with papers to keep
proceedings_to_keep = paper_presented_at_proceedings[paper_presented_at_proceedings['paperID'].isin(papers_to_keep)]['proceedingsID']
# Now you can filter your original data frames using papers_to_keep and proceedings_to_keep
filtered_writes = writes[writes['paperID'].isin(papers_to_keep)]
filtered_proceedings = proceedings[proceedings['proceedingsID'].isin(proceedings_to_keep)]
filtered_paper_presented_at_proceedings = paper_presented_at_proceedings[paper_presented_at_proceedings['proceedingsID'].isin(proceedings_to_keep)]

print(filtered_writes.shape[0])
print(filtered_proceedings.shape[0])
print(filtered_paper_presented_at_proceedings.shape[0])


256566
1137
31233


In [26]:
filtered_writes = filtered_writes.sample(10000)
# filtered_paper_presented_at_proceedings = filtered_paper_presented_at_proceedings.sample(10000)

In [27]:
filtered_papers = filtered_writes['paperID'].unique()
filtered_authors = filtered_writes['authorID'].unique()

papers_new = papers[papers['paperID'].isin(filtered_papers)]
authors_new = authors[authors['authorID'].isin(filtered_authors)]

filtered_paper_presented_at_proceedings = filtered_paper_presented_at_proceedings[
    filtered_paper_presented_at_proceedings["paperID"].isin(filtered_papers)
]

print(papers.shape[0], papers_new.shape[0])
print(authors.shape[0], authors_new.shape[0])
print(paper_presented_at_proceedings.shape[0], filtered_paper_presented_at_proceedings.shape[0])

papers_new.to_csv('papers_new.csv', index=False)
authors_new.to_csv('authors_new.csv', index=False)
filtered_writes.to_csv('writes_new.csv', index=False)
filtered_proceedings.to_csv('proceedings_new.csv', index=False)
filtered_paper_presented_at_proceedings.to_csv('paper_presented_at_proceedings_new.csv', index=False)
filtered_paper_published_in_volume.to_csv('paper_published_in_volume_new.csv', index=False)


120789 8984
241051 8344
39026 2680


In [28]:
# Create a map of paperID to its authors to speed up lookup
paper_authors_map = filtered_writes.groupby('paperID')['authorID'].apply(set).to_dict()

# Prepare a list to collect review assignments
review_assignments = []

# Iterate over paperIDs to assign reviewers
for paper_id, authors in tqdm(paper_authors_map.items()):
    # Find eligible reviewers: all authors excluding those who wrote the paper
    eligible_reviewers = authors_new[~authors_new['authorID'].isin(authors_new)]
    num_reviewers = np.random.choice([2, 3, 4, 5])
    # Ensure there are enough eligible reviewers
    if len(eligible_reviewers) >= num_reviewers:
        # Randomly select 3 reviewers without replacement
        selected_reviewers = eligible_reviewers.sample(n=num_reviewers, random_state=1)['authorID'].tolist()
        for author_id in selected_reviewers:
            review_assignments.append({'paperID': paper_id, 'authorID': author_id})

# Convert the list of review assignments into a DataFrame
reviews = pd.DataFrame(review_assignments)
reviews.to_csv('reviews_new.csv', index=False)
reviews.info()

# Display the head of the optimized reviews DataFrame
reviews.head()

  0%|          | 0/8986 [00:00<?, ?it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31428 entries, 0 to 31427
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   paperID   31428 non-null  object
 1   authorID  31428 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 491.2+ KB


Unnamed: 0,paperID,authorID
0,000bc5921e71c5d3e1c7b39364244987c109a212,2237630226
1,000bc5921e71c5d3e1c7b39364244987c109a212,120625177
2,000bc5921e71c5d3e1c7b39364244987c109a212,2166262174
3,00102916233d568656659b8263efb6d8fb4a46d6,2237630226
4,00102916233d568656659b8263efb6d8fb4a46d6,120625177


In [29]:
years = papers_new['year'].unique()
with open("years.csv", "w") as f:
    f.write("year\n")
    for year in years:
        f.write(str(year) + "\n")

In [30]:
# filter citations to only keep those for which paperId is in filtered_papers
citations = pd.read_csv("citations.csv")
filtered_citations = citations[
    citations["paperID"].isin(papers_new["paperID"])
    & citations["referenceID"].isin(papers_new["paperID"])
]
print(citations.shape, filtered_citations.shape)
filtered_citations.to_csv("citations_new.csv", index=False)

(404217, 2) (3288, 2)


In [31]:
keyword_mapping = pd.read_csv('keyword_mapping.csv')
filtered_keyword_mapping = keyword_mapping[keyword_mapping['paperId'].isin(papers_new['paperID'])]
filtered_keyword_mapping['keywords'] = filtered_keyword_mapping['keywords'].str.lower()
print(keyword_mapping.shape, filtered_keyword_mapping.shape)
filtered_keyword_mapping.to_csv('keyword_mapping_new.csv', index=False)

(362301, 2) (26950, 2)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_keyword_mapping['keywords'] = filtered_keyword_mapping['keywords'].str.lower()


## Evolution

In [32]:
# Sample review contents to choose from
review_contents = [
    "This paper presents a novel approach that shows promising results.",
    "The methodology is sound, but the paper lacks sufficient experimental validation.",
    "Excellent work! The results are well presented and clearly support the conclusions.",
    "The paper is well-written, but the relevance to the field is not clearly established.",
    "Significant contribution to the field, but the study lacks originality.",
    "The experimental section is thorough, but the paper is overly verbose.",
    "Interesting approach, but the paper does not provide enough context for the results.",
    "The paper could be improved by adding more details about the data collection process.",
    "Solid work, but the analysis lacks depth in some areas.",
    "The paper addresses an important problem, but the solution is not convincingly better than existing methods."
]

reviews['content'] = np.random.choice(review_contents, size=len(reviews))
reviews['decision'] = np.random.choice([True, False], size=len(reviews))

reviews.head()
reviews.to_csv('reviews_new.csv', index=False)

In [33]:
conferences = pd.read_csv('conferences.csv')
workshops = pd.read_csv('workshops.csv')
journals = pd.read_csv('journals.csv')

reviewer_policy_choices = [2, 3, 4, 5]

conferences['reviewerPolicy'] = np.random.choice(reviewer_policy_choices, size=len(conferences))
workshops['reviewerPolicy'] = np.random.choice(reviewer_policy_choices, size=len(workshops))
journals['reviewerPolicy'] = np.random.choice(reviewer_policy_choices, size=len(journals))

conferences.to_csv('conferences.csv', index=False)
workshops.to_csv('workshops.csv', index=False)
journals.to_csv('journals.csv', index=False)

In [34]:
# Generate a synthetic dataset for organizations
organization_names = [
    "Tech Innovations Inc.", "Global Research Ltd.", "Future Technologies LLC",
    "Quantum Computing Corp.", "AI Solutions Inc.", "Renewable Energy Systems Ltd.",
    "Deep Learning Technologies LLC", "Space Exploration Corp.", "Biotech Innovations Inc.",
    "Cyber Security Solutions Ltd.", "University of Science and Technology", "Global University of Engineering",
    "Institute of Advanced Studies", "National University of Arts and Sciences", 
    "International College of Information Technology", "University of Renewable Energies",
    "Institute for Space Research", "University of Biotech Innovations",
    "College of Quantum Computing", "Academy of Cyber Security"
]
organization_types = ["company"] * 10 + ["university"] * 10
organizations_data = {
    'orgID': [f"O{i+1}" for i in range(len(organization_names))],
    'name': organization_names,
    'type': organization_types
}
organizations_df = pd.DataFrame(organizations_data)

authors = pd.read_csv('authors_new.csv')
# Assign a random organization to each author
authors['affiliation'] = np.random.choice(organizations_df['orgID'], size=len(authors))

authors.to_csv('authors_new.csv', index=False)
organizations_df.to_csv('organizations.csv', index=False)

organizations_df.head()

Unnamed: 0,orgID,name,type
0,O1,Tech Innovations Inc.,company
1,O2,Global Research Ltd.,company
2,O3,Future Technologies LLC,company
3,O4,Quantum Computing Corp.,company
4,O5,AI Solutions Inc.,company


In [35]:
import pandas as pd

# Read the CSV file into a DataFrame
df = pd.read_csv("keyword_mapping_new.csv")

# Assuming the column containing keywords is named 'keywords'
# Split the keywords column into a list of keywords
keywords_list = df['keywords'].str.split(',')

# Flatten the list of lists into a single list of keywords
all_keywords = [keyword.strip() for sublist in keywords_list for keyword in sublist]

# Create a frequency table of keywords containing the word "graph"
graph_keywords = [keyword for keyword in all_keywords if 'graph' in keyword.lower()]
graph_freq_table = pd.Series(graph_keywords).value_counts()

# Display the frequency table
print(graph_freq_table)


graph                     107
graph neural               52
knowledge graph            20
knowledge graphs           18
graph convolutional        11
                         ... 
subgraph pattern            1
graph structured            1
subgraph queries            1
geographic information      1
graph theories              1
Name: count, Length: 214, dtype: int64


In [36]:
print(graph_freq_table.head(6).index.tolist())

['graph', 'graph neural', 'knowledge graph', 'knowledge graphs', 'graph convolutional', 'graph embedding']
