In [None]:
import random
from typing import List
import requests
import json
import pandas as pd
import pymongo
from dotenv import load_dotenv
import os
from data_pipeline.db import get_all_data



In [2]:
load_dotenv()
MONGO_URL = os.getenv("MONGO_URL")

In [96]:
op_list, dt_list = get_all_data(limit=0)

In [97]:
df3 = pd.DataFrame(dt_list)
df3.shape

(21714, 39)

In [5]:
client = pymongo.MongoClient(host=MONGO_URL)

In [None]:
openAlex_data_collection = client['dsde'].get_collection('openAlex_data')
data_collection = client.get_database('dsde').get_collection('data')

In [21]:
test = data_collection.find(filter = {"srctype": "j"}, limit = 10)
for i in test:
    print(f"issn: {i['prism:issn']}")
    print(f"isbn :{i['prism:isbn']}")

issn: 16879597 16879589
isbn :
issn: 19387879 19386362
isbn :
issn: 19391307 10768998
isbn :
issn: 16118227 14384957
isbn :
issn: 1756185X 17561841
isbn :
issn: 10982825 08878013
isbn :
issn: 19763808 12267988
isbn :
issn: 01700839
isbn :
issn: 16181255 16181247
isbn :
issn: 18831958
isbn :


In [None]:
class OpenAlexScraper:
    def __init__(self, mongo_uri: str, base_url="https://api.openalex.org/works"):
        self.base_url = base_url
        self.scraped_issns = self.load_scraped_issns()
        self.mongo_uri = mongo_uri
        self.client = pymongo.MongoClient(self.mongo_uri)

    def load_scraped_issns(self, file_path="issns.json"):
        try:
            # Fetch distinct ISSNs from MongoDB
            issns = set(self.client['dsde']['openAlex_data'].distinct('primary_location.source.issn'))
            data_issns = set(self.client['dsde']['data'].distinct('prism:isbn'))
            issns.update(data_issns)
            # Optionally load from file if it exists
            if os.path.exists(file_path):
                with open(file_path, "r") as f:
                    file_issns = set(json.load(f))
                    issns.update(file_issns)
                    print(f"Loaded {len(file_issns)} ISSNs from {file_path}")

            print(f"Loaded {len(issns)} total ISSNs (MongoDB + file)")
            return issns

        except Exception as e:
            print(f"Error loading ISSNs: {e}")
            return set()

    def fetch_papers(self, filter_string: str, sample_size: int, per_page: int):
        params = {
            "filter": filter_string,
            "sample": sample_size,
            "per-page": per_page,
        }

        try:
            response = requests.get(self.base_url, params=params)
            if response.status_code == 200:
                return response.json()
            else:
                print(f"Error: {response.text}")
                return {"error": f"Failed to retrieve data, status code: {response.status_code}"}
        except Exception as e:
            return {"error": f"An error occurred: {e}"}
        
    def clean_data(self, data: List[dict]) -> List[dict]:
        df = pd.DataFrame(data)
        columns_to_keep = [
            'title', 'fwci', 'cited_by_count', 'type', 'type_crossref', 'topics', 
            'locations', 'locations_count', 'primary_topic', 'concepts', 
            'relevance_score', 'publication_date', 'authorships', 
            'publication_year', 'language', 'abstract_inverted_index', 
            'referenced_works', 'apc_list', 'apc_paid'
        ]

        df = df[columns_to_keep]
        return df.to_dict(orient="records")
    
    def save_file(self, file_path: str, data):
        try:
            with open(file_path, "w") as f:
                json.dump(data, f, indent=4)
            print(f"Scraped data saved to {file_path}")
        except Exception as e:
            print(f"Error saving data to {file_path}: {e}")

    def save_to_mongo(self, data: List[dict]):
        try:
            cleaned_data = self.clean_data(data)
            self.openAlex_data_collection.insert_many(cleaned_data)
            print(f"Scraped data saved to MongoDB")
        except Exception as e:
            print(f"Error saving data to MongoDB: {e}")

    def scrape_papers(self, keyword_ids: List[str], per_page=50,
                      ignore_issns=False, target_count=None, save_path=None, 
                      save_to_file=None, save_to_mongo=True
                      ):
        all_filtered_papers = []
        total_collected = 0

        # Build filter string
        keyword_filters = [f"keywords/{kw}" for kw in keyword_ids]
        keyword_filter_string = "|".join(keyword_filters) if keyword_filters else ""
        filter_string = "open_access.is_oa:true,language:en"
        if keyword_filter_string:
            filter_string += f",keywords.id:{keyword_filter_string}"

        print(f"Filter string: {filter_string}")

        while total_collected < target_count:
            papers_data = self.fetch_papers(
                filter_string=filter_string, sample_size=per_page, per_page=per_page
            )

            if "results" not in papers_data:
                print(f"Error fetching papers: {papers_data.get('error', 'Unknown error')}")
                continue

            # Filter papers based on ISSNs if needed
            filtered_papers = []
            if ignore_issns:
                filtered_papers = papers_data["results"]
            else:
                for paper in papers_data["results"]:
                    try:
                        issns = paper["primary_location"]["source"].get("issn", [])
                        if not any(issn in self.scraped_issns for issn in issns):
                            filtered_papers.append(paper)
                    except (KeyError, AttributeError, TypeError):
                        continue

            all_filtered_papers.extend(filtered_papers)
            total_collected += len(filtered_papers)

            if total_collected >= target_count:
                print(f"Target of {target_count} papers reached. Stopping scrape.")
                break

            print(f"Collected {total_collected} papers so far.")

        all_filtered_papers = all_filtered_papers[:target_count]
        if save_to_mongo:
            self.save_to_mongo(all_filtered_papers)
        if save_path and save_to_file:
            self.save_file(save_path, all_filtered_papers)

        return all_filtered_papers


In [79]:
async def main():
    openAlexScraper = OpenAlexScraper(mongo_uri=MONGO_URL)
    keywords = []
    save_path = "scraped_papers_random_2.json"
    target_count = 200
    # Scrape papers in a random manner
    filtered_papers = openAlexScraper.scrape_papers(
        keyword_ids=keywords, ignore_issns=True, target_count=target_count, save_to_file=False, save_to_mongo=True, per_page=100,
    )
    print(f"Found {len(filtered_papers)} papers with new ISSNs")
    return filtered_papers

In [80]:
filtered_papers = await main()

Error loading ISSNs: 'OpenAlexScraper' object has no attribute 'client'
Filter string: open_access.is_oa:true,language:en
Collected 100 papers so far.
Target of 200 papers reached. Stopping scrape.
Scraped data saved to MongoDB
Found 200 papers with new ISSNs


In [47]:
df = pd.DataFrame(filtered_papers)

In [55]:
df.columns

Index(['id', 'doi', 'title', 'display_name', 'relevance_score',
       'publication_year', 'publication_date', 'ids', 'language',
       'primary_location', 'type', 'type_crossref', 'indexed_in',
       'open_access', 'authorships', 'institution_assertions',
       'countries_distinct_count', 'institutions_distinct_count',
       'corresponding_author_ids', 'corresponding_institution_ids', 'apc_list',
       'apc_paid', 'fwci', 'has_fulltext', 'cited_by_count',
       'citation_normalized_percentile', 'cited_by_percentile_year', 'biblio',
       'is_retracted', 'is_paratext', 'primary_topic', 'topics', 'keywords',
       'concepts', 'mesh', 'locations_count', 'locations', 'best_oa_location',
       'sustainable_development_goals', 'grants', 'datasets', 'versions',
       'referenced_works_count', 'referenced_works', 'related_works',
       'abstract_inverted_index', 'cited_by_api_url', 'counts_by_year',
       'updated_date', 'created_date', 'fulltext_origin'],
      dtype='object')

In [35]:
op_df = pd.read_json("/Users/caesar/Desktop/cedt_2_1/intro2ds/project_dsde/DataSci_project/data_pipeline/scraped_papers_random_2.json")


In [None]:
columns_to_keep = [
    'title', 'fwci', 'cited_by_count', 'type', 'type_crossref', 'topics', 
    'locations', 'locations_count', 'primary_topic', 'concepts', 
    'relevance_score', 'publication_date', 'authorships', 
    'publication_year', 'language', 'abstract_inverted_index', 
    'referenced_works', 'apc_list', 'apc_paid'
]

op_df_filtered = op_df[columns_to_keep]




Unnamed: 0,title,fwci,cited_by_count,type,type_crossref,topics,locations,locations_count,primary_topic,concepts,relevance_score,publication_date,authorships,publication_year,language,abstract_inverted_index,referenced_works,apc_list,apc_paid
0,基于Zernike矩特征提取的改进FCM手写体数字识别&lt;br&gt;Improved ...,0.0,0,article,journal-article,"[{'id': 'https://openalex.org/T10601', 'displa...","[{'is_oa': True, 'landing_page_url': 'https://...",1,"{'id': 'https://openalex.org/T10601', 'display...","[{'id': 'https://openalex.org/C153180895', 'wi...",1.0,2013-01-01,"[{'author_position': 'first', 'author': {'id':...",2013,en,,[],,
1,Anzeige eines mit der Camera lucida verbundnen...,0.0,0,article,journal-article,"[{'id': 'https://openalex.org/T10857', 'displa...","[{'is_oa': False, 'landing_page_url': 'https:/...",2,"{'id': 'https://openalex.org/T10857', 'display...","[{'id': 'https://openalex.org/C2778805511', 'w...",1.0,1812-01-01,"[{'author_position': 'first', 'author': {'id':...",1812,en,"{'Annalen': [0], 'der': [1, 12], 'PhysikVolume...",[],"{'value': 4020, 'currency': 'USD', 'value_usd'...",
2,Contribution to Specifying and Verifying Compo...,,0,preprint,dissertation,"[{'id': 'https://openalex.org/T10703', 'displa...","[{'is_oa': True, 'landing_page_url': 'https://...",1,"{'id': 'https://openalex.org/T10703', 'display...","[{'id': 'https://openalex.org/C199360897', 'wi...",1.0,2011-07-04,"[{'author_position': 'first', 'author': {'id':...",2011,en,,[],,
3,Sociomaterial analysis of Music Notation Lesso...,0.0,0,article,journal-article,"[{'id': 'https://openalex.org/T10803', 'displa...","[{'is_oa': True, 'landing_page_url': 'https://...",2,"{'id': 'https://openalex.org/T10803', 'display...","[{'id': 'https://openalex.org/C45357846', 'wik...",1.0,2013-01-01,"[{'author_position': 'first', 'author': {'id':...",2013,en,"{'The': [0], 'present': [1], 'research': [2], ...","[https://openalex.org/W1497388952, https://ope...",,
4,درجة ممارسة قادة مدراس التعليم العام بمحافظة ا...,0.0,0,article,journal-article,"[{'id': 'https://openalex.org/T12998', 'displa...","[{'is_oa': True, 'landing_page_url': 'https://...",1,"{'id': 'https://openalex.org/T12998', 'display...","[{'id': 'https://openalex.org/C15744967', 'wik...",1.0,2021-03-01,"[{'author_position': 'first', 'author': {'id':...",2021,en,"{'Study': [0], 'Objectives': [1], 'Finding': [...",[],,


In [38]:
data_dict = op_df_filtered.to_dict("records")

openAlex_data_collection.insert_many(data_dict)

InsertManyResult([ObjectId('6757e3331cb3988480f787cc'), ObjectId('6757e3331cb3988480f787cd'), ObjectId('6757e3331cb3988480f787ce'), ObjectId('6757e3331cb3988480f787cf'), ObjectId('6757e3331cb3988480f787d0'), ObjectId('6757e3331cb3988480f787d1'), ObjectId('6757e3331cb3988480f787d2'), ObjectId('6757e3331cb3988480f787d3'), ObjectId('6757e3331cb3988480f787d4'), ObjectId('6757e3331cb3988480f787d5'), ObjectId('6757e3331cb3988480f787d6'), ObjectId('6757e3331cb3988480f787d7'), ObjectId('6757e3331cb3988480f787d8'), ObjectId('6757e3331cb3988480f787d9'), ObjectId('6757e3331cb3988480f787da'), ObjectId('6757e3331cb3988480f787db'), ObjectId('6757e3331cb3988480f787dc'), ObjectId('6757e3331cb3988480f787dd'), ObjectId('6757e3331cb3988480f787de'), ObjectId('6757e3331cb3988480f787df')], acknowledged=True)