Install dependencies

In [None]:
%%capture
# Install the third-party packages imported below; %%capture hides pip's output.
! pip install markdown2 beautifulsoup4 tqdm

Import dependencies

In [15]:
import markdown2
from bs4 import BeautifulSoup
import re
from tqdm import tqdm
import datetime

Define two functions: one that extracts the text visible to the user, and one that additionally includes the href value of each link

In [8]:
def extract_raw_text_from_markdown(markdown_content):
  """Return the user-visible text of a Markdown document as a single line.

  The Markdown is rendered to HTML with markdown2, the tags are dropped with
  BeautifulSoup, surrounding whitespace is stripped, and every run of line
  breaks is replaced by the two-character sequence backslash + "n", so the
  result never spans several physical lines.

  Args:
    markdown_content: Markdown source text. ``None`` or an empty string
      (presumably a missing README in the input data) yields ``''``.

  Returns:
    The flattened plain text.
  """
  if not markdown_content:
    # Nothing to render; None cannot be passed on to markdown2.
    return ''

  html_content = markdown2.markdown(markdown_content)
  soup = BeautifulSoup(html_content, 'html.parser')

  stripped_text = soup.get_text().strip()
  # The replacement is a literal backslash followed by "n", not a newline.
  return re.sub(r'(\n|\r)+', r'\\n', stripped_text)

In [None]:
def extract_raw_text_from_markdown_with_links(markdown_content):
  """Return the visible text of a Markdown document with link targets inlined.

  The Markdown is rendered to HTML with markdown2 and walked in document
  order. Plain text is copied as is; every link is written as its visible
  text followed by ``(href) `` (note the trailing space). Line breaks are
  kept unchanged.

  Args:
    markdown_content: Markdown source text. ``None`` or an empty string
      yields ``''``.

  Returns:
    The extracted text, with each link's text appearing exactly once.
  """
  if not markdown_content:
    # Nothing to render; None cannot be passed on to markdown2.
    return ''

  html_content = markdown2.markdown(markdown_content)
  soup = BeautifulSoup(html_content, 'html.parser')

  pieces = []
  for element in soup.descendants:
    # Anything nested in a link is emitted once, by the branch that handles
    # the outermost <a>; the traversal still visits those nodes afterwards,
    # so they are skipped here to avoid duplicating the link text.
    inside_link = element.find_parent('a') is not None
    if isinstance(element, str):
      if not inside_link:
        pieces.append(element)
    elif element.name == 'a' and not inside_link:
      pieces.append(element.get_text())
      link_href = element.get('href')
      # Anchors without an href (e.g. named anchors) contribute text only.
      if link_href is not None:
        pieces.append(f'({link_href}) ')

  return ''.join(pieces)

Run the code against an example file

In [None]:
# Smoke test: read a local Markdown file and print its flattened text.
input_file = 'example.md'
with open(input_file, 'r', encoding='utf-8') as f:
  markdown_content = f.read()
# Line breaks in the result appear as the literal two characters "\n".
raw_text = extract_raw_text_from_markdown(markdown_content)

print(raw_text)

Load from the fetched JSON files

In [6]:
import os
import json
import csv
from tqdm import tqdm

In [2]:
def create_repo_to_row_func(year, field, term):
  """Build a converter that turns one repository record into a CSV row.

  Args:
    year: Value written to the row's second column.
    field: Value written to the row's third column.
    term: Value written to the row's fourth column.

  Returns:
    A function taking a repository mapping and returning a 12-item list:
    full_name, year, field, term, url, homepage, programming_language,
    forks, stars, description, contents, and the README reduced to plain
    text by extract_raw_text_from_markdown. A missing key raises KeyError.
  """
  # Repository keys copied unchanged, in column order, after the term column.
  copied_keys = (
    'url',
    'homepage',
    'programming_language',
    'forks',
    'stars',
    'description',
    'contents',
  )

  def repo_to_row(repo):
    row = [repo['full_name'], year, field, term]
    for key in copied_keys:
      row.append(repo[key])
    # The README is the only column that is transformed before export.
    row.append(extract_raw_text_from_markdown(repo['readme']))
    return row

  return repo_to_row

In [None]:
# Flatten every fetched file out/<year>/<field>/<term>.json into out2.csv.
#
# The files are collected first so the progress bar total reflects what is
# actually on disk. Entries that are not directories, and files that do not end
# in ".json", are skipped; this keeps the cell re-runnable after other cells
# have written out/csv, out/csv_combined and out/full.csv under the same root.
# Sorting makes the row order reproducible between runs.
json_files = []
for year in sorted(os.listdir('out')):
  if not os.path.isdir(f'out/{year}'):
    continue
  for field in sorted(os.listdir(f'out/{year}')):
    if not os.path.isdir(f'out/{year}/{field}'):
      continue
    for term_file in sorted(os.listdir(f'out/{year}/{field}')):
      if term_file.endswith('.json'):
        json_files.append((year, field, term_file))

# UTF-8 is set explicitly so the export does not depend on the platform's
# default encoding. No escapechar is configured: with the default
# doublequote=True the writer already escapes embedded quotes by doubling them,
# and an escapechar equal to the quote character is rejected by the csv dialect
# validation of newer Python releases.
# No header row is written to this file.
with open('out2.csv', 'w', newline='', encoding='utf-8') as csv_file:
  csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)

  # The context manager closes the bar even if a file fails to load.
  with tqdm(total=len(json_files), desc="Processing", dynamic_ncols=True) as progress_bar:
    for year, field, term_file in json_files:
      # NOTE(review): assumes the fetched JSON is UTF-8 (or plain ASCII) - confirm.
      with open(f'out/{year}/{field}/{term_file}', encoding='utf-8') as json_file:
        repos = json.load(json_file)
      # The term is the file name without its ".json" suffix.
      term = os.path.splitext(term_file)[0]
      csv_writer.writerows(map(create_repo_to_row_func(year, field, term), repos))
      progress_bar.update(1)

In [19]:
# Search terms grouped by field: each key is a field name, each value the list
# of terms belonging to it. The conversion loop in this notebook uses the key
# and the term string verbatim when it builds out/{year}/{field}/{term}.json
# paths, so the exact spelling matters - including the trailing space that some
# terms carry.
# NOTE(review): presumably the fetched files were saved under these exact
# names - confirm before normalizing any of the strings.
fields = {
  "mathematics": ["Applied Math", "Combinatorics", "Number Theory", "Financial Math",
                  "Geometry", "Probability", "Representation Theory", "Symplectic Geometry", "Topology"],
  "chemistry": ["analytical spectroscopy", "electrochemistry ", "mass spectrometry", "separation science", "chemical biology",
                "enzymology", "bioinorganic chemistry ", "inorganic materials ", "physical inorganic chemistry", "synthetic inorganic chemistry"],
  "biology": ["Biochemistry", "Biophysics", "Structural Biology", "Cell Biology",
              "Cancer", "Genetics", "Genomics", "Microbiology", "Virology", "Neuroscience"],
  "computer_sciences": ["Systems", "Networking", "Security", "Privacy", "Artificial Intelligence", "Theoretical Computer Science", "Machine Learning",
                        "Human-Computer Interaction", "Information Visualization", "Vision", "Graphics", "Robotics", "Computer Engineering", "Software Engineering"],
  "physics": ["planetary astronomy", "infrared astronomy", "theoretical astrophysics", "radio astronomy", "Plasma physics", "Nanoscience", "Nanotechnology",
              "Condensed Matter", "Materials Physics", "Energy Systems", "Biophysics", "Microfluidics", "Microsystems", "Optical Physics", "Quantum Information Science"],
  "medicine": ["Cardiology", "Endocrinology", "Diabetes", "Metabolism", "Gastroenterology", "Hepatology", "Internal Medicine", "Clinical Innovation ", "Geriatric Medicine ",
               "Palliative Care ", "Hematology", "Infectious Diseases", "Immunology", "Nephrology", "Precision Medicine", "Pulmonary", "Critical Care", "Rheumatology"]
}

def store_state(year, field_index, term_index):
  """Write the current loop position to state.json in the working directory.

  Args:
    year: Year currently being processed.
    field_index: Index of the current field.
    term_index: Index of the current term within that field.

  The file is overwritten on every call and holds a single JSON object with
  the keys "year", "field_index" and "term_index".
  """
  with open("state.json", "w") as state_file:
    json.dump(
      {
        "year": year,
        "field_index": field_index,
        "term_index": term_index,
      },
      state_file,
    )

def load_state():
  """Read the saved loop position from state.json in the working directory.

  Returns:
    The JSON content of state.json. When the file does not exist, a fresh
    starting position: year 2023, field_index 0, term_index 0.
  """
  try:
    state_file = open("state.json", "r")
  except FileNotFoundError:
    # No checkpoint yet: start from the beginning.
    return {"year": 2023, "field_index": 0, "term_index": 0}

  with state_file:
    return json.load(state_file)

def log_message(message):
  """Print message prefixed with the local time as [YYYY-MM-DD HH:MM:SS]."""
  now = datetime.datetime.now()
  # format() on a datetime applies the spec exactly like strftime does.
  timestamp = format(now, "%Y-%m-%d %H:%M:%S")
  print(f"[{timestamp}] {message}")

In [26]:
# Resumable conversion of out/{year}/{field}/{term}.json into
# out/csv/{year}/{field}/{term}.csv, walking the years downwards until 2009.
# The position is checkpointed in state.json, so re-running the cell after an
# interruption continues with the term that was in progress.
state = load_state()

year = state['year']
field_index = state['field_index']
term_index = state['term_index']

# Header row written at the top of every per-term CSV file.
csv_header = [
  'full_name',
  'year',
  'field',
  'term',
  'url',
  'homepage',
  'programming_language',
  'forks',
  'stars',
  'description',
  'contents',
  'readme'
]

keys_list = list(fields.keys())
while year > 2008:
  log_message(f'Processing year {year}')
  while field_index < len(keys_list):
    field = keys_list[field_index]
    terms = fields[field]
    log_message(f'Processing field: {field}')
    while term_index < len(terms):
      # Checkpoint before doing the work: an interrupted run redoes this term.
      store_state(year, field_index, term_index)
      term = terms[term_index]
      log_message(f'Processing term: {term}')
      # NOTE(review): assumes the fetched JSON is UTF-8 (or plain ASCII) - confirm.
      with open(f'out/{year}/{field}/{term}.json', 'r', encoding='utf-8') as json_file:
        repos = json.load(json_file)
      relative_path = f'out/csv/{year}/{field}/{term}.csv'
      directory = os.path.dirname(relative_path)
      # exist_ok avoids the separate existence check before creating the folder.
      os.makedirs(directory, exist_ok=True)
      log_message(f'Saving results to {relative_path}')
      # UTF-8 is set explicitly so the output does not depend on the platform's
      # default encoding. No escapechar is configured: with the default
      # doublequote=True the writer already escapes embedded quotes by doubling
      # them, and an escapechar equal to the quote character is rejected by the
      # csv dialect validation of newer Python releases.
      with open(relative_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file, quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(csv_header)
        csv_writer.writerows(map(create_repo_to_row_func(year, field, term), repos))
      log_message(f'Results saved for {term} in {field} for {year}')
      term_index += 1
    field_index += 1
    term_index = 0
  year -= 1
  field_index = 0
  term_index = 0

[2023-10-17 13:16:10] Processing year 2023
[2023-10-17 13:16:10] Processing field: mathematics
[2023-10-17 13:16:10] Processing term: Applied Math
[2023-10-17 13:16:10] Saving results to out/csv/2023/mathematics/Applied Math.csv
[2023-10-17 13:17:25] Results saved for Applied Math in mathematics for 2023
[2023-10-17 13:17:25] Processing term: Combinatorics
[2023-10-17 13:17:25] Saving results to out/csv/2023/mathematics/Combinatorics.csv
[2023-10-17 13:17:26] Results saved for Combinatorics in mathematics for 2023
[2023-10-17 13:17:26] Processing term: Number Theory
[2023-10-17 13:17:26] Saving results to out/csv/2023/mathematics/Number Theory.csv
[2023-10-17 13:19:09] Results saved for Number Theory in mathematics for 2023
[2023-10-17 13:19:09] Processing term: Financial Math
[2023-10-17 13:19:09] Saving results to out/csv/2023/mathematics/Financial Math.csv
[2023-10-17 13:20:27] Results saved for Financial Math in mathematics for 2023
[2023-10-17 13:20:27] Processing term: Geometry
[

In [27]:
import os

# Function to join file contents from a folder recursively
def join_files_in_folder(root_folder, output_file):
    """Concatenate every file below root_folder into output_file.

    Args:
        root_folder: Directory that is walked recursively with os.walk.
        output_file: Path of the file to create or overwrite.

    Each file is read completely as text and appended unchanged, in the
    order os.walk reports the files.
    """
    with open(output_file, 'w') as combined:
        for current_dir, _subdirs, names in os.walk(root_folder):
            source_paths = [os.path.join(current_dir, name) for name in names]
            for source_path in source_paths:
                with open(source_path, 'r') as source:
                    content = source.read()
                combined.write(content)

# Specify the root folder and the output file
root_folder = 'out/csv/2009'  # Folder whose files, at any depth, are concatenated
output_file = 'output.csv'  # Destination file, relative to the current working directory

# Call the function to join files. With the join_files_in_folder defined in
# this cell every file is copied whole, first line included.
join_files_in_folder(root_folder, output_file)


In [29]:
import os
import glob
import fileinput

def join_files_in_folder(folder_path, output_file_path, skip_header=True):
  """Concatenate all .csv files below folder_path into one file.

  Args:
    folder_path: Directory searched recursively for ``*.csv`` files.
    output_file_path: Path of the file to create or overwrite.
    skip_header: When true (the default), the first line of every input file
      is dropped. Pass ``False`` for inputs that have no header row;
      otherwise the first record of each such file is lost.

  The inputs are processed in sorted path order so the result is
  reproducible. Data is copied as bytes: the content is not decoded, so the
  result does not depend on the platform's default encoding, and line endings
  are preserved exactly as they are in the inputs.
  """
  file_pattern = os.path.join(folder_path, '**/*.csv')
  file_list = sorted(glob.glob(file_pattern, recursive=True))

  with open(output_file_path, 'wb') as output_file:
    for file_path in file_list:
      with open(file_path, 'rb') as input_file:
        if skip_header:
          # Consume the first line; everything after it is copied unchanged.
          input_file.readline()
        output_file.writelines(input_file)

In [33]:
# Combine the per-term CSV files of each year (2009 through 2023) into
# out/csv_combined/{year}.csv. The call drops the first line of every input
# file, so the combined per-year files carry no header row.
for year in range(2009, 2024):
  folder_path = f'out/csv/{year}'
  output_file_path = f'out/csv_combined/{year}.csv'
  directory = os.path.dirname(output_file_path)
  # exist_ok avoids the separate existence check before creating the folder.
  os.makedirs(directory, exist_ok=True)
  join_files_in_folder(folder_path, output_file_path)

In [34]:
folder_path = 'out/csv_combined'
output_file_path = 'out/full.csv'

# The per-year files in out/csv_combined have no header row (it was removed
# when they were built), so they are concatenated verbatim here. Calling
# join_files_in_folder(folder_path, output_file_path) instead would skip line 1
# of every input and silently drop the first record of each year.
# Files are copied as bytes, in sorted path order, one line at a time.
with open(output_file_path, 'wb') as full_csv:
  for year_csv_path in sorted(glob.glob(os.path.join(folder_path, '**/*.csv'), recursive=True)):
    with open(year_csv_path, 'rb') as year_csv:
      full_csv.writelines(year_csv)