In [2]:
import ast
import io
import os
import re

import networkx as nx
import openai
import PyPDF2
import requests
from openai import OpenAI

In [3]:
# Load the two GML graphs from local disk. Of the two, only chiwork_abstracts
# is referenced by the cells visible in this notebook (it feeds get_papers_dict).
# NOTE(review): these are absolute, user-specific paths; the notebook will not
# run on another machine without editing them.
chiwork_base = nx.read_gml("/Users/parthgoel/Desktop/AuthorName/data/chiwork_base.gml")
chiwork_abstracts = nx.read_gml("/Users/parthgoel/Desktop/AuthorName/data/chiwork_snowball_abstracts.gml")

In [4]:
def get_emails(url, timeout=30):
    """Download a PDF and return the email addresses printed on its first page.

    Args:
        url: Address expected to serve a PDF document.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        False if the request fails or the server answers with a non-200 status
        (callers treat this as "blocked, stop and resume later");
        [] if the payload is not a PDF or no address is found;
        otherwise a list of address strings in order of appearance.
    """
    try:
        response = requests.get(url, stream=True, timeout=timeout)
    except requests.RequestException as exc:
        # A dropped or stalled connection is reported like a block so the
        # caller can record where to resume instead of crashing mid-run.
        print(f"Request failed ({url}): {exc}")
        return False

    try:
        if response.status_code != 200:
            print("Blocked!")
            return False
        content = response.content
    finally:
        # stream=True keeps the connection checked out until the body is read
        # or the response is closed; always release it, including when blocked.
        response.close()

    if not content.startswith(b'%PDF-'):
        print(f"Skipping URL ({url}): Not a PDF (based on file signature)\n")
        return []

    pdf_reader = PyPDF2.PdfReader(io.BytesIO(content))
    if len(pdf_reader.pages) == 0:
        print("No Emails Found")
        return []

    # Only the first page is searched.
    text = pdf_reader.pages[0].extract_text() or ""

    email_regex = r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+"

    # The domain part of the pattern also accepts '.' and '-', so an address at
    # the end of a sentence is captured with its trailing period; trim it.
    email_matches = [match.rstrip(".-") for match in re.findall(email_regex, text)]

    acm_emails = ["permissions@acm.org", "permissions@acm.org."]
    email_matches = [email for email in email_matches if email not in acm_emails]

    if email_matches == []:
        print("No Emails Found")
        return []

    return email_matches

In [5]:
def _parse_author_email_reply(reply):
    """Parse the model's reply into a dict without executing it; None if it is not a dict literal."""
    text = (reply or "").strip()
    # Tolerate code fences or stray prose around the dictionary, and the
    # brace-less "'a': 'b', 'c': 'd'" form shown in the prompt's example.
    braced = re.search(r"\{.*\}", text, re.DOTALL)
    candidate = braced.group(0) if braced else "{" + text + "}"
    try:
        parsed = ast.literal_eval(candidate)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return None
    return parsed if isinstance(parsed, dict) else None


def query_openai(emails, author_names, api_key, model="gpt-3.5-turbo"):
    """Ask an OpenAI chat model to pair author names with email addresses.

    Args:
        emails: Email addresses extracted from a paper.
        author_names: Names of the paper's authors.
        api_key: OpenAI API key handed to the client.
        model: Chat model to query.

    Returns:
        A dict mapping author name to email, or False if the request fails or
        the reply cannot be read as a dictionary literal.
    """
    client = OpenAI(api_key=api_key)

    query = f"""
    The list of author emails is: {emails}
    The list of author names is: {author_names}

    From the above list of emails and names, return a python dictionary of authors and their corresponding emails in this form:
    'author1': 'email1', 'author2': 'email2' etc.

    If there are no or fewer emails than necessary, return a dictionary with as many elements as reasonable.
    Respond with a single dictionary ONLY, no other text or words
  """

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are an email analyser"},
                {"role": "user", "content": query},
            ],
        )
        reply = response.choices[0].message.content
    except Exception as exc:
        # Best effort: one failed lookup must not abort a long scraping run,
        # but say why it failed instead of hiding it.
        print(f"OpenAI query failed: {exc}")
        return False

    # The reply is untrusted text. It is parsed as a literal only; eval() here
    # would run whatever code the model (or a crafted PDF) produced.
    parsed = _parse_author_email_reply(reply)
    if parsed is None:
        print("OpenAI reply was not a dictionary literal; ignoring it")
        return False
    return parsed

In [6]:
def get_papers_dict(graph):
    """Index the papers attached to a graph's edges, keyed by DOI.

    Each edge carries a 'paperinfo' sequence whose first three items are read
    as DOI, title and URL; anything after that is stored under 'misc' (taken
    from the first edge seen for that DOI). Both endpoints of every edge are
    recorded once per paper as {'url': str(node), 'name': <node authorname>}.

    Returns:
        dict mapping DOI -> {'title', 'url', 'misc', 'coauthors'}.
    """
    papers = {}

    for source, target, edge_data in graph.edges(data=True):
        doi, title, url, *extra = edge_data['paperinfo']

        entry = papers.setdefault(
            doi, {'title': title, 'url': url, 'misc': extra, 'coauthors': []}
        )
        coauthors = entry['coauthors']

        # Source first, then target, so coauthor order follows edge order.
        for node in (source, target):
            node_url = str(node)
            already_listed = any(known['url'] == node_url for known in coauthors)
            if not already_listed:
                coauthors.append({'url': node_url, 'name': graph.nodes[node]['authorname']})

    return papers

# Build the DOI-keyed paper index from chiwork_abstracts; later cells trim this
# dict and look papers up in it by DOI.
papers_dict = get_papers_dict(chiwork_abstracts)

In [7]:
def trim_papers_dict(papers_dict, excluded_tags=('CLOSED',)):
    """Drop papers that add no new author, or that carry an excluded tag.

    Papers are visited in dict order. A paper is removed when every one of its
    coauthors already appears in a previously kept paper, or when any value in
    `excluded_tags` is present in its 'misc' list. Authors of a removed paper
    do not count as covered.

    NOTE(review): the original condition was `'CLOSED' in misc or 'OPEN'`,
    which is always true because a non-empty string is truthy, so every paper
    was deleted. Only 'CLOSED' is excluded by default here; if papers tagged
    'OPEN' were meant to be dropped as well, pass
    excluded_tags=('CLOSED', 'OPEN'). Confirm the intended rule.

    Args:
        papers_dict: DOI -> paper record with 'coauthors' (dicts holding a
            'url') and 'misc' (list of extra values). Modified in place.
        excluded_tags: 'misc' values that disqualify a paper.

    Returns:
        The same `papers_dict` object, with the removed entries deleted.
    """
    covered_authors = set()
    dois_to_remove = []

    for doi, paper_data in papers_dict.items():
        authors_in_this_paper = {author['url'] for author in paper_data['coauthors']}

        # Nothing new to learn from this paper: all its authors are covered.
        if authors_in_this_paper.issubset(covered_authors):
            dois_to_remove.append(doi)
            continue

        if any(tag in paper_data['misc'] for tag in excluded_tags):
            dois_to_remove.append(doi)
            continue

        covered_authors.update(authors_in_this_paper)

    # Delete after the scan; a dict cannot change size while being iterated.
    for doi in dois_to_remove:
        del papers_dict[doi]

    return papers_dict

# trim_papers_dict deletes entries from papers_dict itself and returns that same
# object, so `trimmed` and `papers_dict` refer to one dict after this line.
trimmed = trim_papers_dict(papers_dict)

In [8]:
# The 'url' field of every paper that survived trimming.
doi_list = [item['url'] for item in trimmed.values()]
# Swap a leading https://doi.org/ for the dl.acm.org PDF path. URLs without that
# prefix pass through unchanged, and DOIs from other publishers are rewritten to
# dl.acm.org as well.
pdf_url_list = [re.sub(r'^https://doi\.org/', r'https://dl.acm.org/doi/pdf/', url) for url in doi_list]

In [70]:
def load_resume_index(index_path):
    """Return the position in the URL list at which scraping should resume.

    The value is the integer on the last non-blank line of `index_path`.
    A missing or empty file means nothing was saved yet, so 0 is returned.
    """
    try:
        with open(index_path) as index_store:
            saved_lines = [line.strip() for line in index_store if line.strip()]
    except FileNotFoundError:
        return 0

    return int(saved_lines[-1]) if saved_lines else 0


index_start = load_resume_index("/Users/parthgoel/Desktop/AuthorName/data/temp_index.txt")

print(index_start)

105


In [71]:
master_emails_dict = dict()
# api_key = "YOUR API KEY HERE"
# Read the key from the environment so this cell runs on a fresh kernel and the
# secret is never stored in the notebook.
api_key = os.environ.get("OPENAI_API_KEY")

# Enumerate from index_start so each URL's position in pdf_url_list is known
# without searching the list again (a search is wrong if a URL appears twice).
for url_index, url in enumerate(pdf_url_list[index_start:], start=index_start):
    print(f"Processing {url}")

    emails = get_emails(url)

    # get_emails returns False (as opposed to an empty list) when the download
    # was refused: remember where to pick up and stop hitting the server.
    if emails is False:
        index_of_break = url_index
        print(f"\n Scrapping Blocked. Resume from: {index_of_break}")
        with open("/Users/parthgoel/Desktop/AuthorName/data/temp_index.txt", "w") as index_store:
            index_store.write(f"{index_of_break}")

        break

    if emails != []:
        # The part of the PDF URL after "pdf/" is the DOI that keys papers_dict.
        names = [author['name'] for author in papers_dict[url.split("pdf/")[-1]]['coauthors']]

        names_to_emails = query_openai(emails, names, api_key)
        # query_openai returns False on failure; dict.update(False) would raise.
        if isinstance(names_to_emails, dict):
            master_emails_dict.update(names_to_emails)
        else:
            print(f"Could not match names to emails for {url}")


print(master_emails_dict)

Processing https://dl.acm.org/doi/pdf/10.1145/3604254
Processing https://dl.acm.org/doi/pdf/10.1145/3643558
Processing https://dl.acm.org/doi/pdf/10.1145/3596671.3598576
Processing https://dl.acm.org/doi/pdf/10.1145/3491102.3501831
Processing https://dl.acm.org/doi/pdf/10.1145/3582269.3615595
Processing https://dl.acm.org/doi/pdf/10.1145/3491102.3517546
Processing https://dl.acm.org/doi/pdf/10.1145/3610092
Processing https://dl.acm.org/doi/pdf/10.1145/3563657.3595982
Processing https://dl.acm.org/doi/pdf/10.1145/3555202
No Emails Found
Processing https://dl.acm.org/doi/pdf/10.1145/3544548.3580882
Processing https://dl.acm.org/doi/pdf/10.1145/3531146.3533110
Processing https://dl.acm.org/doi/pdf/10.1145/3479535
Processing https://dl.acm.org/doi/pdf/10.1145/3531146.3533113
Processing https://dl.acm.org/doi/pdf/10.1145/3610925
Processing https://dl.acm.org/doi/pdf/10.1002/asi.24205
Skipping URL (https://dl.acm.org/doi/pdf/10.1002/asi.24205): Not a PDF (based on file signature)

Processing

In [72]:
import json

# Append-only log: every run adds one line holding that run's
# name -> email mapping serialised as JSON.
dict_path = "/Users/parthgoel/Desktop/AuthorName/data/intermediate_data/emails_dict.txt"

with open(dict_path, mode="a") as d_path:
    json_str = json.dumps(master_emails_dict)
    # print() terminates the record with the newline itself.
    print(json_str, file=d_path)