In [1]:
# Install necessary libraries
!pip install transformers pandas scikit-learn nltk spacy beautifulsoup4 requests matplotlib seaborn torch
!python -m spacy download en_core_web_sm

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [5]:
import pandas as pd
import requests
import json
from urllib.parse import quote
from bs4 import BeautifulSoup

# Updated function to fetch data from OpenAlex API
def fetch_journal_data(query="computer science", limit=100):
    # Base URL for the OpenAlex API
    base_url = "https://api.openalex.org/works"

    # For searching venues specifically, we can use a filter
    # Let's try with a more basic query first
    encoded_query = quote(query)
    url = f"{base_url}?filter=default.search:{encoded_query}&per-page={limit}"

    print(f"Requesting URL: {url}")

    # Add a user-agent header as some APIs require this
    headers = {
        "User-Agent": "Mozilla/5.0 Academic Research Project"
    }

    response = requests.get(url, headers=headers)

    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error: {response.status_code} - {response.text}")

        # Let's try an alternative approach
        print("Trying alternative endpoint...")
        venues_url = f"https://api.openalex.org/venues?search={encoded_query}&per-page={limit}"
        print(f"Requesting URL: {venues_url}")
        alt_response = requests.get(venues_url, headers=headers)

        if alt_response.status_code == 200:
            return alt_response.json()
        else:
            print(f"Alternative request failed: {alt_response.status_code} - {alt_response.text}")
            return None

# Get sample journal data
cs_journals = fetch_journal_data("computer science")

# Check if cs_journals is not None before proceeding
if cs_journals is not None:
    # Convert to DataFrame for easier manipulation
    journal_df = pd.json_normalize(cs_journals['results'])
    print(f"Successfully retrieved {len(journal_df)} journals/works")
    # Display first few records
    print(journal_df.head(3)[['id', 'display_name']].to_string())
else:
    print("Could not fetch data from OpenAlex API.")

    # Let's try a simple test request to confirm the API is working
    test_url = "https://api.openalex.org/works?filter=default.search:science"
    print(f"Testing basic API functionality with: {test_url}")
    test_response = requests.get(test_url)
    print(f"Test response code: {test_response.status_code}")
    if test_response.status_code == 200:
        print("API is working, but there may be an issue with the specific query or endpoint.")

Requesting URL: https://api.openalex.org/works?filter=default.search:computer%20science&per-page=100
Successfully retrieved 100 journals/works
                                 id                                                              display_name
0  https://openalex.org/W2561675875                                    Lecture Notes in Computer Science 1205
1  https://openalex.org/W2132750992                                  Handbook of theoretical computer science
2  https://openalex.org/W2011781303  Calibration of the Computer Science and Applications, Inc. accelerometer


In [6]:
import nltk
from nltk.corpus import stopwords
import spacy
import re

nltk.download('stopwords')
nltk.download('punkt')
nlp = spacy.load('en_core_web_sm')

def preprocess_text(text):
    """Basic preprocessing for academic text"""
    # Remove special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = text.split()
    text = ' '.join([word for word in words if word not in stop_words])
    return text

def extract_key_phrases(text):
    """Extract important noun phrases from text"""
    doc = nlp(text)
    key_phrases = []
    for chunk in doc.noun_chunks:
        key_phrases.append(chunk.text)
    return key_phrases

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
from transformers import AutoTokenizer, AutoModel
import torch

# Load SciBERT or other scientific text model
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def generate_embeddings(text):
    """Generate embeddings for text using pretrained model"""
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)

    # Use [CLS] token embedding as document representation
    embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

In [8]:
# Sample Matching Function
from sklearn.metrics.pairwise import cosine_similarity

def match_paper_to_journals(paper_embedding, journal_embeddings, journal_df, top_n=5):
    """Match paper embedding to journal embeddings and return top matches"""
    similarities = cosine_similarity(paper_embedding, journal_embeddings)
    top_indices = similarities[0].argsort()[-top_n:][::-1]

    results = []
    for idx in top_indices:
        journal_info = journal_df.iloc[idx]
        results.append({
            'journal_name': journal_info['display_name'],
            'similarity_score': similarities[0][idx],
            'publisher': journal_info.get('publisher', 'Unknown'),
            'topics': journal_info.get('x_concepts', [])
        })

    return results

In [11]:
# Sample paper abstract
sample_abstract = """
This paper presents a novel approach to natural language processing using transformer
architectures. We demonstrate improvements in sentiment analysis and named entity recognition
tasks across multiple benchmarks. Our method reduces computational requirements while
maintaining state-of-the-art accuracy.
"""

# Preprocess and generate embedding
processed_abstract = preprocess_text(sample_abstract)
paper_embedding = generate_embeddings(processed_abstract)

# For demo purposes, generate embeddings for journal descriptions
# In practice, you'd have these pre-computed
journal_descriptions = journal_df['display_name'].tolist()
journal_embeddings = []

# Generate embeddings for demo journals
for journal in journal_descriptions[:20]:  # Limit for demo
    journal_embedding = generate_embeddings(journal)
    journal_embeddings.append(journal_embedding[0])

# Import numpy and use vstack
import numpy as np  # Import numpy and assign it to the alias 'np'
journal_embeddings = np.vstack(journal_embeddings)

# Match paper to journals
recommendations = match_paper_to_journals(paper_embedding, journal_embeddings, journal_df[:20], top_n=3)
print("Top journal recommendations:")
for i, rec in enumerate(recommendations):
    print(f"{i+1}. {rec['journal_name']} (Similarity: {rec['similarity_score']:.4f})")

Top journal recommendations:
1. Quantum Computer Science (Similarity: 0.7235)
2. Calibration of the Computer Science and Applications, Inc. accelerometer (Similarity: 0.7133)
3. Some computer science issues in ubiquitous computing (Similarity: 0.6902)


In [13]:
import os
from google.colab import drive

drive.mount('/content/drive')

# Define the directory path
dir_path = '/content/drive/My Drive/NLP_Research_Forge'

# Create the directory if it doesn't exist
if not os.path.exists(dir_path):
    os.makedirs(dir_path)
    print(f"Directory '{dir_path}' created successfully.")
else:
    print(f"Directory '{dir_path}' already exists.")

# Now you can save your dataframe
journal_df.to_csv('/content/drive/My Drive/NLP_Research_Forge/journal_data.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Directory '/content/drive/My Drive/NLP_Research_Forge' created successfully.
