# Packages

In [1]:
import requests
import pandas as pd
import re
from datetime import datetime
import matplotlib.pyplot as plt
from requests.auth import HTTPBasicAuth
import time
from datetime import datetime

# Main functions

In [3]:
def parse_rate_limit_headers(headers):
    """Report the rate-limit state found in a set of HTTP response headers.

    Args:
        headers: Mapping of response headers (anything offering ``get``).
            ``RateLimit-Limit``, ``RateLimit-Remaining`` and
            ``RateLimit-Reset`` are read; a missing entry counts as 0.

    Returns:
        Tuple ``(limit, remaining, reset_in_seconds)``. The last item is the
        ``RateLimit-Reset`` value minus the current Unix time, so it is
        negative when that moment has already passed or the header is absent.
    """
    def _header_as_int(name):
        # Header values arrive as text; absent headers fall back to 0.
        return int(headers.get(name, 0))

    limit = _header_as_int('RateLimit-Limit')
    remaining = _header_as_int('RateLimit-Remaining')
    reset_in_seconds = _header_as_int('RateLimit-Reset') - int(time.time())

    # Same three status lines the notebook has always shown.
    for line in (
        f"Límite máximo: {limit}",
        f"Solicitudes restantes: {remaining}",
        f"Tiempo hasta reinicio: {reset_in_seconds} segundos ({reset_in_seconds / 60:.2f} minutos)",
    ):
        print(line)

    return limit, remaining, reset_in_seconds

In [5]:
def get_altmetric_data(doi, timeout=10):
    """Return how many X/Twitter accounts mentioned a DOI according to Altmetric.

    Args:
        doi: Bare DOI (without the ``https://doi.org/`` prefix); it is appended
            to the Altmetric v1 ``doi/`` endpoint.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The ``cited_by_tweeters_count`` field of the Altmetric record (0 when
        the field is absent), 0 when the API answers 404, or ``None`` when the
        count could not be determined: non-string DOI, network failure,
        non-JSON body, or any other HTTP status (e.g. 429 or 5xx).
    """
    base_url = "https://api.altmetric.com/v1/doi/"

    # A missing DOI (None/NaN) cannot be looked up; report "unknown".
    if not isinstance(doi, str):
        return None

    try:
        response = requests.get(base_url + doi, timeout=timeout)
        if response.status_code == 200:
            return response.json().get('cited_by_tweeters_count', 0)
    except (requests.RequestException, ValueError):
        # Connection problems, timeouts, or a body that is not valid JSON.
        return None

    # Only 404 is read as "no mentions" (assumes Altmetric uses 404 for DOIs
    # it has no record of - see the API docs). Throttling and server errors
    # stay "unknown" so they are not recorded as a genuine zero.
    if response.status_code == 404:
        return 0
    return None

In [7]:
def doi_to_url(doi):
    """Follow a DOI link and return the address it finally lands on.

    Args:
        doi: Full DOI URL to request (it is passed to ``requests.head`` as is).

    Returns:
        The final URL after redirects when the last response has status 200;
        ``None`` for any other status or when the request fails for any reason.
    """
    try:
        reply = requests.head(doi, allow_redirects=True, timeout=10)
    except Exception:
        # Best-effort lookup: any failure simply means "not resolved".
        return None

    if reply.status_code != 200:
        return None
    return reply.url

In [9]:
def generar_cadena_or_completa(row):
    """Build the ``|``-separated list of search terms for one article.

    The terms are, in order: the bare DOI, the resolved URL, the title wrapped
    in double quotes, and every landing-page and PDF URL of the record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with ``'doi'``,
            ``'url_final'`` and ``'title'`` entries. The optional
            ``'locations.landing_page_url'`` and ``'locations.pdf_url'``
            entries hold ``|``-separated URLs.

    Returns:
        The distinct terms joined with ``|`` in first-seen order, or an empty
        string when the row provides none.
    """
    partes = []

    doi = row['doi']
    if pd.notna(doi):
        partes.append(re.sub(r'^https://doi\.org/', '', doi))

    resolved_url = row['url_final']
    if pd.notna(resolved_url):
        partes.append(resolved_url)

    titulo = row['title']
    if pd.notna(titulo):
        # '|' is the term separator of the result, so a literal '|' inside a
        # title would split the quoted phrase into broken fragments.
        titulo = str(titulo).replace('|', ' ')
        partes.append(f'"{titulo}"')

    for columna in ('locations.landing_page_url', 'locations.pdf_url'):
        if columna in row and pd.notna(row[columna]):
            partes.extend(row[columna].split('|'))

    # Drop 'None' placeholders and empty fragments (e.g. from 'a||b').
    partes = [parte for parte in partes if parte and parte != 'None']

    # dict.fromkeys removes duplicates while keeping a stable order, so the
    # same row always yields the same string (a set would not guarantee that).
    return "|".join(dict.fromkeys(partes))

# Bluesky login

In [100]:
# Bluesky credentials; fill these in before running (left blank on purpose).
username = ''
app_password = ''

auth_url = 'https://bsky.social/xrpc/com.atproto.server.createSession'

auth_data = {
    'identifier': username,
    'password': app_password
}

# Open a session; the JSON answer carries the 'accessJwt' used by the searches.
auth_response = requests.post(auth_url, json=auth_data, timeout=30)

# Stop here on a failed login instead of carrying a missing token into the
# search cells, where it would only surface as 401 errors.
auth_response.raise_for_status()

In [None]:
auth_response

# 1. Scientometrics

In [None]:
df = pd.read_csv('data/scientometrics.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df.type.value_counts()

## Resolve URL

In [25]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = doi_to_url(df.loc[i, 'doi'])

## Query string

In [29]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [36]:
df['bsky_mentions'] = None

In [38]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
access_token = auth_response.json().get('accessJwt')
headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
df.loc[0,'query_string'].split('|')

In [48]:
df_bsky = pd.DataFrame()

In [50]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
for query in range(df.shape[0]):
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so the term is not silently lost.
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Raise rather than exit(): exit() ends the whole session
                    # and with it the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            # Keep the posts even if this request used up the last of the quota.
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0

In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
df_bsky[df_bsky.doi=='https://doi.org/10.1007/s11192-023-04894-0']

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [56]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [109]:
df.to_csv('results/scientometrics.tsv', sep='\t', index=False)

In [113]:
df_bsky.to_csv('results/scientometrics_bsky.tsv', sep='\t', index=False)

# 2. JASIST

In [None]:
df = pd.read_csv('data/jasist.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df.type.value_counts()

In [None]:
df = df[df.type!='paratext']
df = df.reset_index(drop=True)
df

## Resolve URL

In [142]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = 'https://asistdl.onlinelibrary.wiley.com/doi/'+re.sub(r'^https://doi.org/', '', df.loc[i, 'doi'])

## Query string

In [147]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [154]:
df['bsky_mentions'] = None

In [156]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
access_token = auth_response.json().get('accessJwt')
headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
df.loc[0,'query_string'].split('|')

In [160]:
df_bsky = pd.DataFrame()

In [162]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
for query in range(df.shape[0]):
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so a failed request is not mistaken for "no posts".
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Stop the cell but keep the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            # Error bodies carry no 'posts'; report them instead of counting 0.
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0
In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [170]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [175]:
df.to_csv('results/jasist.tsv', sep='\t', index=False)

In [176]:
df_bsky.to_csv('results/jasist_bsky.tsv', sep='\t', index=False)

# 3. Journal of Informetrics

In [None]:
df = pd.read_csv('data/joi.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df.type.value_counts()

In [None]:
df = df[df.type!='paratext']
df = df.reset_index(drop=True)
df

## Resolve URL

In [84]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = doi_to_url(df.loc[i, 'doi'])

## Query string

In [90]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [97]:
df['bsky_mentions'] = None

In [99]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
access_token = auth_response.json().get('accessJwt')
headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
df.loc[0,'query_string'].split('|')

In [103]:
df_bsky = pd.DataFrame()

In [105]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
for query in range(df.shape[0]):
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so a failed request is not mistaken for "no posts".
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Stop the cell but keep the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            # Error bodies carry no 'posts'; report them instead of counting 0.
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0
In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [118]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [130]:
df.to_csv('results/joi.tsv', sep='\t', index=False)

In [132]:
df_bsky.to_csv('results/joi_bsky.tsv', sep='\t', index=False)

In [None]:
# Articles ordered by their Bluesky mention count. `df` is the frame that holds
# 'bsky_mentions'; no variable named `a` is defined in this notebook.
df.sort_values('bsky_mentions')

# 4. QSS

In [None]:
df = pd.read_csv('data/qss.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df.type.value_counts()

In [None]:
requests.head('https://doi.org/10.1162/qss_a_00337', allow_redirects=True, timeout=15)

## Resolve URL

In [204]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = doi_to_url(df.loc[i, 'doi'])

## Query string

In [218]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [286]:
df['bsky_mentions'] = None

In [280]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
access_token = auth_response.json().get('accessJwt')
headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
df.loc[0,'query_string'].split('|')

In [284]:
df_bsky = pd.DataFrame()

In [246]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
for query in range(df.shape[0]):
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so a failed request is not mistaken for "no posts".
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Stop the cell but keep the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            # Error bodies carry no 'posts'; report them instead of counting 0.
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0
In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [256]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [304]:
df.to_csv('results/qss.tsv', sep='\t', index=False)

In [306]:
df_bsky.to_csv('results/qss_bsky.tsv', sep='\t', index=False)

# 5. Nature

In [None]:
df = pd.read_csv('data/top.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df['primary_location.source.display_name'].value_counts()

In [21]:
df = df[df['primary_location.source.display_name']=='Nature'].copy()
df.reset_index(drop=True, inplace=True)

In [None]:
df.type.value_counts()

In [None]:
df

## Resolve URL

In [28]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = 'https://www.nature.com/articles/'+re.sub('https://doi.org/10.1038/', '', df.loc[i, 'doi'])

## Query string

In [33]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [40]:
df['bsky_mentions'] = None

In [88]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'
access_token = auth_response.json().get('accessJwt')
headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
df.loc[0,'query_string'].split('|')

In [46]:
df_bsky = pd.DataFrame()

In [48]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
df_bsky

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
# Only rows without a count yet are processed, so an interrupted run resumes
# where it stopped and a fresh run starts at the first row (no fixed offset).
for query in df.index[df['bsky_mentions'].isna()]:
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so the term is not silently lost.
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Raise rather than exit(): exit() ends the whole session
                    # and with it the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            # Keep the posts even if this request used up the last of the quota.
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0

In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [88]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [96]:
df.to_csv('results/nature.tsv', sep='\t', index=False)

In [98]:
df_bsky.to_csv('results/nature_bsky.tsv', sep='\t', index=False)

# 6. Science

In [None]:
df = pd.read_csv('data/top.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df['primary_location.source.display_name'].value_counts()

In [19]:
df = df[df['primary_location.source.display_name']=='Science'].copy()
df.reset_index(drop=True, inplace=True)

In [None]:
df.type.value_counts()

In [None]:
df

## Resolve URL

In [26]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = 'https://www.science.org/doi/'+re.sub('https://doi.org/', '', df.loc[i, 'doi'])

## Query string

In [31]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [42]:
df['bsky_mentions'] = None

In [None]:
df.loc[0,'query_string'].split('|')

In [169]:
df_bsky = pd.DataFrame()

In [171]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
auth_response = requests.post(auth_url, json=auth_data)
auth_response

In [210]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'

access_token = auth_response.json().get('accessJwt')

headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
for query in range(df.shape[0]):
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so the term is not silently lost.
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Raise rather than exit(): exit() ends the whole session
                    # and with it the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            # Keep the posts even if this request used up the last of the quota.
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0

In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [183]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [228]:
df.to_csv('results/science.tsv', sep='\t', index=False)

In [230]:
df_bsky.to_csv('results/science_bsky.tsv', sep='\t', index=False)

# 7. PNAS

In [None]:
df = pd.read_csv('data/top.csv')
df['title'] = df['title'].str.replace('&amp;', '&', regex=False)
df

In [None]:
df['primary_location.source.display_name'].value_counts()

In [19]:
df = df[df['primary_location.source.display_name']=='Proceedings of the National Academy of Sciences'].copy()
df.reset_index(drop=True, inplace=True)

In [None]:
df.type.value_counts()

In [None]:
df

## Resolve URL

In [27]:
df['url_final'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'url_final'] = 'https://www.pnas.org/doi/'+re.sub('https://doi.org/', '', df.loc[i, 'doi'])

## Query string

In [32]:
df['query_string'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'query_string'] = generar_cadena_or_completa(df.iloc[i,])

In [None]:
df.loc[0,'query_string']

## Bluesky

In [38]:
df['bsky_mentions'] = None

In [None]:
df.loc[0,'query_string'].split('|')

In [43]:
df_bsky = pd.DataFrame()

In [45]:
# Columns kept from each flattened Bluesky post (nested keys joined with '_').
# Each name appears once so the resulting table has no repeated column.
bsky_columns = ['uri', 'cid', 'indexedAt', 'record_text', 'author_did', 'author_handle', 'replyCount', 'repostCount', 'likeCount', 'quoteCount']

In [None]:
auth_response = requests.post(auth_url, json=auth_data)
auth_response

In [106]:
search_url = 'https://bsky.social/xrpc/app.bsky.feed.searchPosts'

access_token = auth_response.json().get('accessJwt')

headers = {
            'Authorization': f'Bearer {access_token}'
    }

In [None]:
# Search Bluesky for every term of every article: matching posts are appended
# to df_bsky and the number of distinct posts is stored in df['bsky_mentions'].
# Only rows without a count yet are processed, so an interrupted run resumes
# where it stopped and a fresh run starts at the first row (no fixed offset).
for query in df.index[df['bsky_mentions'].isna()]:
    print(query, end='\r')
    posts_bsk = []

    for subquery in df.loc[query, 'query_string'].split('|'):
        params = {
            'q': subquery,
            'limit': 100
        }

        # One attempt plus retries after re-authenticating or waiting out a
        # rate limit, so the term is not silently lost.
        for attempt in range(3):
            try:
                search_response = requests.get(search_url, headers=headers, params=params, timeout=30)
            except requests.RequestException as error:
                # A network failure costs this term only, not the whole run.
                print(f"Error en la búsqueda: {query} ({error})")
                search_response = None
                break

            if search_response.status_code == 401:
                print("Token expirado. Reautenticando...")
                auth_response = requests.post(auth_url, json=auth_data, timeout=30)
                if auth_response.status_code != 200:
                    print(f"Error al reautenticar: {auth_response.status_code}")
                    # Raise rather than exit(): exit() ends the whole session
                    # and with it the results gathered so far.
                    raise RuntimeError('Bluesky re-authentication failed')
                access_token = auth_response.json().get('accessJwt')
                headers['Authorization'] = f'Bearer {access_token}'
                print("Reautenticación exitosa. Reintentando solicitud...")
                continue

            # The quota counts as exhausted only when the server says so; a
            # missing header is not a limit.
            rate_limit_remaining = search_response.headers.get('RateLimit-Remaining')
            if search_response.status_code == 429 or rate_limit_remaining == '0':
                rate_limit_reset = int(search_response.headers.get('RateLimit-Reset', 0))
                # Clamp at zero (time.sleep rejects negatives) plus 1 s slack.
                wait_time = max(rate_limit_reset - int(time.time()), 0) + 1
                print(f"Se alcanzó el límite de solicitudes. Esperando {wait_time} segundos...")
                time.sleep(wait_time)

            # A throttled request is retried; anything else is final.
            if search_response.status_code != 429:
                break

        if search_response is None:
            continue

        if search_response.status_code == 200:
            # Keep the posts even if this request used up the last of the quota.
            posts_bsk.extend(search_response.json().get('posts', []))
        else:
            print(f"Error en la búsqueda: {query}")

    if posts_bsk:
        # The same post can match several terms; count each URI once.
        df.loc[query, 'bsky_mentions'] = pd.DataFrame(posts_bsk).uri.drop_duplicates().shape[0]
        df_bsky_aux = pd.json_normalize(posts_bsk, sep='_')
        df_bsky_aux = df_bsky_aux[bsky_columns]
        df_bsky_aux['doi'] = df.loc[query, 'doi']
        df_bsky = pd.concat([df_bsky, df_bsky_aux], axis=0)
    else:
        df.loc[query, 'bsky_mentions'] = 0

In [None]:
df_bsky = df_bsky.reset_index(drop=True).drop_duplicates()
df_bsky

In [None]:
sum(df['bsky_mentions'])

In [None]:
sum(df['bsky_mentions']>0)

In [None]:
df_bsky.groupby('doi')['author_did'].nunique().reset_index()['author_did'].mean()

## X (via Altmetric.com)

In [119]:
df['x_mentions'] = None

In [None]:
for i in range(df.shape[0]):
    print(i, end='\r')
    df.loc[i, 'x_mentions'] = get_altmetric_data(re.sub(r'^https://doi.org/', '', df.loc[i,'doi']))

In [None]:
sum(df['x_mentions']>0)

In [None]:
df[df['x_mentions']>0]['x_mentions'].mean()

## Export

In [132]:
df.to_csv('results/pnas.tsv', sep='\t', index=False)

In [133]:
df_bsky.to_csv('results/pnas_bsky.tsv', sep='\t', index=False)