### 1. Mount Google Drive
Mounts Google Drive to access the raw dataset collected in the previous step.

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read and write files
# under /content/drive/MyDrive.
# NOTE(review): force_remount=True presumably re-mounts even if Drive is
# already mounted in this session — confirm against the Colab docs.
drive.mount("/content/drive", force_remount=True)


Mounted at /content/drive


### 2. Inspect Raw Data
This cell defines a helper function `quick_inspect_jsonl` to read the raw JSONL file. It counts the non-empty lines (one paper per line) and returns the first paper object, which is then printed to verify the data structure before processing.

In [None]:
import json
from itertools import islice

# Location of the raw JSONL file on the mounted Drive (one JSON object per line).
FILE_PATH = "/content/drive/MyDrive/OpenAlex_CS_2025/openalex_cs_2025.jsonl"

def quick_inspect_jsonl(path):
    """Count the non-blank lines of a JSONL file and parse the first one.

    Args:
        path: Path to a UTF-8 encoded JSONL file.

    Returns:
        A ``(count, first)`` tuple: ``count`` is the number of lines that
        contain something other than whitespace, and ``first`` is the parsed
        JSON value of the first such line (``None`` if there is none).

    Raises:
        json.JSONDecodeError: If the line chosen as the sample is not valid JSON.
    """
    sample_record = None
    n_records = 0

    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            # Whitespace-only lines are neither counted nor parsed.
            if not raw_line.strip():
                continue
            n_records += 1
            # Only the sample line is parsed; every other line is just counted.
            if sample_record is None:
                sample_record = json.loads(raw_line)

    return n_records, sample_record

# Scan the raw file once: `total` is the number of non-blank lines and
# `sample` is the first parsed record (None if the file has no records).
total, sample = quick_inspect_jsonl(FILE_PATH)

print("Total papers:", total)
print("\nFirst paper sample:\n")
# Pretty-print the sample so its field layout can be checked by eye.
print(json.dumps(sample, indent=2))


Total papers: 331600

First paper sample:

{
  "openalex_id": "https://openalex.org/W1803273808",
  "doi": "https://doi.org/10.4135/9781036235611",
  "title": "The Coding Manual for Qualitative Researchers",
  "abstract": null,
  "publication_year": 2025,
  "publication_date": "2025-01-01",
  "authors": [
    {
      "author_id": "https://openalex.org/A5058231379",
      "name": "Johnny Salda\u00f1a"
    }
  ],
  "concepts": [
    {
      "id": "https://openalex.org/C179518139",
      "name": "Coding (social sciences)"
    },
    {
      "id": "https://openalex.org/C2780031656",
      "name": "Glossary"
    },
    {
      "id": "https://openalex.org/C73231260",
      "name": "Tunstall coding"
    },
    {
      "id": "https://openalex.org/C130811719",
      "name": "Shannon\u2013Fano coding"
    },
    {
      "id": "https://openalex.org/C60603091",
      "name": "Variable-length code"
    },
    {
      "id": "https://openalex.org/C41008148",
      "name": "Computer science"
    },
  

### 3. Setup Paths
Defines the input path (raw data) and the output directory where the processed, domain-partitioned files will be stored. It ensures the output directory exists.

In [None]:
import json
import os
from tqdm import tqdm

# Raw JSONL input (same path as FILE_PATH used by the inspection cell).
INPUT_FILE = "/content/drive/MyDrive/OpenAlex_CS_2025/openalex_cs_2025.jsonl"
# Directory that receives one <domain>.jsonl file per domain.
# NOTE(review): this lives under "OpenAlex_CS_2025_Data", a different Drive
# folder than the input's "OpenAlex_CS_2025" — confirm that is intentional.
OUTPUT_DIR = "/content/drive/MyDrive/OpenAlex_CS_2025_Data/processed"

# Create the output directory (and any missing parents); no error if it exists.
os.makedirs(OUTPUT_DIR, exist_ok=True)


### 4. Define Domain Keywords
This is a crucial configuration step. We define a dictionary `DOMAIN_KEYWORDS` that maps each domain (AI, ML, DL, NLP, CV, RL) to a set of keywords (e.g., 'neural network', 'text mining') that identify it. This dictionary drives the domain detection logic.

In [None]:
# Domain code -> set of keyword phrases. `detect_domains` lowercases each
# concept name and tests whether any phrase occurs in it as a substring, so
# phrases must be written in lowercase. The keys are also used as the output
# file stems (<domain>.jsonl) by the partitioning cell.
DOMAIN_KEYWORDS = {
    "ai": {"artificial intelligence"},
    "ml": {"machine learning"},
    "dl": {"deep learning", "neural network"},
    "nlp": {"natural language processing", "speech recognition", "text mining"},
    "cv": {"computer vision", "image processing", "object detection"},
    "rl": {"reinforcement learning"},
}


### 5. Define Cleaning & Detection Functions
We define two helper functions:
*   `clean_nulls`: Recursively removes empty fields (None, empty strings, empty lists, empty dicts) from the paper object to reduce storage size and noise.
*   `detect_domains`: Iterates through a paper's concepts and checks whether each concept name contains (case-insensitively) any keyword from `DOMAIN_KEYWORDS` to identify which domain(s) the paper belongs to. A paper can belong to multiple domains.

In [None]:
def clean_nulls(obj):
    """Remove null / empty fields recursively.

    Children are cleaned *before* they are tested for emptiness, so a
    container that only becomes empty once its own null fields are stripped
    (e.g. ``{"a": {"b": None}}``) is removed as well instead of being left
    behind as ``{}`` or ``[]``.

    Args:
        obj: Any JSON-like value (dict, list, or scalar).

    Returns:
        A new dict/list without entries equal to ``None``, ``""``, ``[]`` or
        ``{}``; scalars are returned unchanged. Falsy-but-meaningful values
        such as ``0`` and ``False`` are kept. The top-level container itself
        is always returned, even when it ends up empty.
    """
    empties = (None, "", [], {})

    if isinstance(obj, dict):
        cleaned = {}
        for key, value in obj.items():
            value = clean_nulls(value)
            # Test the cleaned value so nested containers emptied by the
            # recursion are dropped too.
            if value not in empties:
                cleaned[key] = value
        return cleaned

    if isinstance(obj, list):
        cleaned_items = (clean_nulls(item) for item in obj)
        return [item for item in cleaned_items if item not in empties]

    return obj


def detect_domains(concepts, domain_keywords=None):
    """Detect CS subdomains from concept names.

    A domain matches when any of its keywords occurs as a substring of a
    lowercased concept name.

    Args:
        concepts: Iterable of concept dicts, each optionally carrying a
            ``"name"`` key. ``None`` is treated as "no concepts".
        domain_keywords: Optional mapping of domain -> iterable of lowercase
            keyword phrases. Defaults to the module-level ``DOMAIN_KEYWORDS``.

    Returns:
        The set of domain keys whose keywords matched at least one concept
        name (empty if nothing matched).
    """
    if domain_keywords is None:
        domain_keywords = DOMAIN_KEYWORDS

    found = set()
    for concept in concepts or ():
        # `or ""` covers both a missing key and an explicit null name, which
        # would otherwise fail on `.lower()`.
        name = (concept.get("name") or "").lower()
        for domain, keywords in domain_keywords.items():
            # Skip domains already detected; the result is a set anyway.
            if domain not in found and any(k in name for k in keywords):
                found.add(domain)
    return found


### 6. Process and Partition Data
This is the main processing loop:
1.  Opens output files for each domain (plus 'other_cs').
2.  Iterates through the raw dataset line by line.
3.  Cleans each paper object.
4.  Detects the domain(s) based on concepts.
5.  Writes the paper to the corresponding domain file(s). If no specific domain is found, it goes to 'other_cs'.
6.  Prints the final counts for each domain.

In [None]:
# One output file per domain plus a catch-all for papers matching no domain.
domain_names = [*DOMAIN_KEYWORDS, "other_cs"]
output_files = {}
counts = {name: 0 for name in domain_names}

try:
    # Open in "w" (truncate) rather than "a": the loop always re-reads the
    # whole input, so appending would duplicate every record each time this
    # cell is re-run.
    for name in domain_names:
        output_files[name] = open(
            os.path.join(OUTPUT_DIR, f"{name}.jsonl"), "w", encoding="utf-8"
        )

    with open(INPUT_FILE, "r", encoding="utf-8") as infile:
        for line in tqdm(infile, desc="Processing CS 2025 papers"):
            # Skip blank lines (as quick_inspect_jsonl does) instead of
            # letting json.loads fail on them.
            if not line.strip():
                continue

            # Parse, then strip null / empty fields.
            paper = clean_nulls(json.loads(line))

            concepts = paper.get("concepts", [])
            domains = detect_domains(concepts)

            # Serialize once; a paper may be written to several domain files.
            record = json.dumps(paper) + "\n"
            for d in domains or ("other_cs",):
                output_files[d].write(record)
                counts[d] += 1
finally:
    # Close every handle that was opened, even if processing failed part-way.
    for f in output_files.values():
        f.close()

print("✅ DONE")
print(json.dumps(counts, indent=2))


Processing CS 2025 papers: 331600it [01:05, 5049.71it/s]

✅ DONE
{
  "ai": 127537,
  "ml": 36967,
  "dl": 18751,
  "nlp": 10611,
  "cv": 19221,
  "rl": 2963,
  "other_cs": 186251
}



