### 1. Mount Google Drive
Mounts Google Drive to access the processed Natural Language Processing (NLP) dataset.

In [1]:
# Attach Google Drive inside the Colab VM so the dataset under
# /content/drive/MyDrive is readable by the cells below.
from google.colab import drive

drive.mount(mountpoint="/content/drive", force_remount=True)

Mounted at /content/drive


### 2. Initialize Qdrant Client
Creates a client for the Qdrant Cloud vector database, which hosts the `nlp_collection`. Sections 4 and 5 below construct their own clients, so this cell is not a prerequisite for them.

In [None]:
import os
from getpass import getpass

from qdrant_client import QdrantClient
from qdrant_client.http import models as rest

# Take the Qdrant Cloud endpoint and API key from the environment
# (QDRANT_URL / QDRANT_API_KEY) so real credentials are never saved in the
# notebook. When a variable is unset or empty, prompt once and keep the answer
# in this process's environment so it does not have to be typed again.
if not os.environ.get("QDRANT_URL"):
    os.environ["QDRANT_URL"] = input("Qdrant URL: ").strip()
if not os.environ.get("QDRANT_API_KEY"):
    os.environ["QDRANT_API_KEY"] = getpass("Qdrant API key: ").strip()

# NOTE: this variable shares its name with the `qdrant_client` package. The
# name is kept for compatibility with any cell that refers to it; the
# `from qdrant_client import ...` statements elsewhere are unaffected.
qdrant_client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)


### 3. Inspect Processed Data
Reads and prints the first line of the processed NLP dataset (`nlp.jsonl`) to verify the data format before indexing.

In [5]:
import json

path = "/content/drive/MyDrive/OpenAlex_CS_2025_Data/processed/nlp.jsonl"

# Only the first record is needed to check the schema, so read a single line
# instead of looping over the file and breaking out.
with open(path, "r", encoding="utf-8") as f:
    first_line = next(f, None)

if first_line is None:
    # An empty file would otherwise produce no output at all.
    print(f"{path} is empty - nothing to inspect")
else:
    print(json.loads(first_line))


{'openalex_id': 'https://openalex.org/W2626778328', 'doi': 'https://doi.org/10.65215/ne77pf66', 'title': 'Attention Is All You Need', 'publication_year': 2025, 'publication_date': '2025-08-23', 'authors': [{'author_id': 'https://openalex.org/A5103024730', 'name': 'Ashish Vaswani'}, {'author_id': 'https://openalex.org/A5021878400', 'name': 'Noam Shazeer'}, {'author_id': 'https://openalex.org/A5005777963', 'name': 'Niki Parmar'}, {'author_id': 'https://openalex.org/A5022416424', 'name': 'Jakob Uszkoreit'}, {'author_id': 'https://openalex.org/A5023448834', 'name': 'Llion Jones'}, {'author_id': 'https://openalex.org/A5079288315', 'name': 'Aidan N. Gomez'}, {'author_id': 'https://openalex.org/A5031789995', 'name': '≈Åukasz Kaiser'}, {'author_id': 'https://openalex.org/A5045719436', 'name': 'Illia Polosukhin'}], 'concepts': [{'id': 'https://openalex.org/C41008148', 'name': 'Computer science'}, {'id': 'https://openalex.org/C203005215', 'name': 'Machine translation'}, {'id': 'https://openalex.

### 4. Full Indexing Pipeline (NLP Domain)
This is the main ingestion script for the NLP domain:
1.  **Config**: Sets the collection name (`nlp_collection`), batch size, and vector size.
2.  **Setup**: Creates the Qdrant collection if it doesn't exist.
3.  **Ingest**:
    *   Reads the `nlp.jsonl` file.
    *   Embeds each paper's title and abstract with `all-MiniLM-L6-v2`, skipping records with fewer than 30 characters of text.
    *   Constructs the payload with metadata.
    *   Uploads points in batches to Qdrant.

In [None]:
import json
import os
from getpass import getpass

from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct, VectorParams, Distance
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

# ---------------- CONFIG ----------------
COLLECTION = "nlp_collection"   # target Qdrant collection for the NLP domain
BATCH_SIZE = 64                 # records embedded and upserted per request
VECTOR_SIZE = 384               # vector size the collection is created with
MIN_TEXT_CHARS = 30             # records with less title+abstract text are skipped
DATA_PATH = "/content/drive/MyDrive/OpenAlex_CS_2025_Data/processed/nlp.jsonl"

# ---------------- MODEL -----------------
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# ---------------- QDRANT ----------------
# Take the endpoint and API key from the environment (QDRANT_URL /
# QDRANT_API_KEY) so real credentials are never saved in the notebook. When a
# variable is unset or empty, prompt once and keep the answer in this
# process's environment.
if not os.environ.get("QDRANT_URL"):
    os.environ["QDRANT_URL"] = input("Qdrant URL: ").strip()
if not os.environ.get("QDRANT_API_KEY"):
    os.environ["QDRANT_API_KEY"] = getpass("Qdrant API key: ").strip()

client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
    timeout=60,  # request timeout passed to the client for the bulk upload
)

# ---------------- COLLECTION SETUP ----------------
# Create the collection only on the first run; an existing one is reused.
if not client.collection_exists(COLLECTION):
    client.create_collection(
        collection_name=COLLECTION,
        vectors_config=VectorParams(
            size=VECTOR_SIZE,
            distance=Distance.COSINE,
        ),
    )


# ---------------- HELPERS ----------------
def _embedding_text(paper):
    """Return the text to embed: the paper's title and abstract joined by a space.

    Missing *and* null fields count as empty, so a record whose abstract is
    null is embedded from its title alone rather than with the literal text
    "None" appended.
    """
    title = paper.get("title") or ""
    abstract = paper.get("abstract") or ""
    return f"{title} {abstract}".strip()


def _build_payload(paper):
    """Return the metadata dict stored alongside the paper's vector.

    Scalar fields are copied as-is (None when absent). Authors and concepts
    are reduced to {"id", "name"} dicts; a missing or null list yields [].
    """
    return {
        "openalex_id": paper.get("openalex_id"),
        "doi": paper.get("doi"),
        "title": paper.get("title"),
        "abstract": paper.get("abstract"),
        "publication_year": paper.get("publication_year"),
        "publication_date": paper.get("publication_date"),
        "venue": paper.get("venue"),
        "citation_count": paper.get("citation_count"),
        "is_open_access": paper.get("is_open_access"),
        "oa_status": paper.get("oa_status"),
        "url": paper.get("url"),

        # Structured fields
        "authors": [
            {"id": a.get("author_id"), "name": a.get("name")}
            for a in paper.get("authors") or []
        ],
        "concepts": [
            {"id": c.get("id"), "name": c.get("name")}
            for c in paper.get("concepts") or []
        ],
    }


def _index_batch(texts, payloads, first_id):
    """Embed `texts` in one model call and upsert them with ids from `first_id`.

    Returns the number of points submitted. The upsert uses wait=False, so the
    call returns once the request is sent rather than once it is applied.
    """
    vectors = model.encode(texts, batch_size=BATCH_SIZE, show_progress_bar=False)
    points = [
        PointStruct(id=first_id + offset, vector=vector.tolist(), payload=payload)
        for offset, (vector, payload) in enumerate(zip(vectors, payloads))
    ]
    client.upsert(
        collection_name=COLLECTION,
        points=points,
        wait=False,
    )
    return len(points)


# ---------------- INGEST ----------------
pending_texts = []      # texts waiting to be embedded
pending_payloads = []   # payloads aligned index-for-index with pending_texts
idx = 0                 # id of the next point; ids are sequential over indexed papers

with open(DATA_PATH, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Indexing NLP Papers"):
        if not line.strip():
            continue  # tolerate blank lines instead of crashing in json.loads

        paper = json.loads(line)

        text = _embedding_text(paper)
        if len(text) < MIN_TEXT_CHARS:
            continue

        pending_texts.append(text)
        pending_payloads.append(_build_payload(paper))

        if len(pending_texts) == BATCH_SIZE:
            idx += _index_batch(pending_texts, pending_payloads, idx)
            pending_texts = []
            pending_payloads = []

# Final flush for the last, partially filled batch.
if pending_texts:
    idx += _index_batch(pending_texts, pending_payloads, idx)

print("‚úÖ FULL NLP COLLECTION INDEXED SUCCESSFULLY")

Indexing NLP Papers: 10611it [04:43, 37.37it/s]


‚úÖ FULL NLP COLLECTION INDEXED SUCCESSFULLY


### 5. Test Retrieval (NLP Domain)
Performs a sanity check on the indexed `nlp_collection`:
1.  Encodes a test query ("What are the open research gaps in nlp?").
2.  Searches the collection for the top 5 nearest neighbors.
3.  Prints the detailed results to verify retrieval quality.

In [None]:
import os
from getpass import getpass

from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Take the endpoint and API key from the environment (QDRANT_URL /
# QDRANT_API_KEY) so real credentials are never saved in the notebook. When a
# variable is unset or empty, prompt once and keep the answer in this
# process's environment.
if not os.environ.get("QDRANT_URL"):
    os.environ["QDRANT_URL"] = input("Qdrant URL: ").strip()
if not os.environ.get("QDRANT_API_KEY"):
    os.environ["QDRANT_API_KEY"] = getpass("Qdrant API key: ").strip()

client = QdrantClient(
    url=os.environ["QDRANT_URL"],
    api_key=os.environ["QDRANT_API_KEY"],
)

query = "What are the open research gaps in nlp?"
vector = model.encode(query).tolist()

results = client.query_points(
    collection_name="nlp_collection",
    query=vector,
    limit=5,
    with_payload=True,
)

print(f"Returned points: {len(results.points)}")


def _joined_names(entries, limit):
    """Join the "name" values of the first `limit` entries with ", ".

    Entries whose name is missing or None are skipped, so one incomplete
    author/concept record cannot make str.join raise. Returns "" when
    `entries` is empty/None or no entry has a name.
    """
    names = [entry.get("name") for entry in (entries or [])[:limit]]
    return ", ".join(name for name in names if name)


for i, hit in enumerate(results.points, start=1):
    p = hit.payload or {}  # guard against a hit that carries no payload

    print(f"\n--- Result {i} ---")
    print("Score:", round(hit.score, 4))
    print("Title:", p.get("title"))
    print("DOI:", p.get("doi"))
    print("Year:", p.get("publication_year"))
    print("Venue:", p.get("venue"))
    print("Citations:", p.get("citation_count"))
    print("Open Access:", p.get("is_open_access"), "| Status:", p.get("oa_status"))
    print("URL:", p.get("url"))

    # Abstract may be absent or None; show at most the first 600 characters.
    abstract = p.get("abstract")
    if abstract:
        print("Abstract:", abstract[:600])
    else:
        print("Abstract: ‚ùå Not available")

    # Up to five author names.
    authors = _joined_names(p.get("authors"), 5)
    if authors:
        print("Authors:", authors)
    else:
        print("Authors: ‚ùå Not available")

    # Up to six concept names.
    concepts = _joined_names(p.get("concepts"), 6)
    if concepts:
        print("Concepts:", concepts)
    else:
        print("Concepts: ‚ùå Not available")

Returned points: 5

--- Result 1 ---
Score: 0.5685
Title: The Technological Trajectory of Semantic Analysis: A Historical-Methodological Review of NLP in Social Sciences
DOI: https://doi.org/10.2139/ssrn.5022988
Year: 2025
Venue: SSRN Electronic Journal
Citations: 1
Open Access: True | Status: green
URL: https://openalex.org/W4406351227
Abstract: ‚ùå Not available
Authors: Rodrigo Kataishi
Concepts: Trajectory, Natural language processing, Semantic analysis (machine learning), Computer science, Artificial intelligence, Sentiment analysis

--- Result 2 ---
Score: 0.5509
Title: Bridging language gaps: The role of NLP and speech recognition in oral english instruction
DOI: https://doi.org/10.1016/j.mex.2025.103359
Year: 2025
Venue: MethodsX
Citations: 1
Open Access: True | Status: gold
URL: https://openalex.org/W4410176650
Abstract: ‚ùå Not available
Authors: Parul Dubey, Pushkar Dubey, Rohit Raja, Sapna Singh Kshatri
Concepts: Bridging (networking), Natural language processing, Computer 