In [None]:
import pandas as pd
import io
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor

In [None]:
# --- 1. Define three toy CSV samples as multi-line strings --------------------
# NOTE(review): this cell only assigns raw CSV text to string variables. No cell
# visible in this notebook parses these strings into DataFrames; the next cell
# reads `articles`, `authors` and `links` from CSV files on disk instead.
# Each literal starts with a newline right after the opening quotes, so the
# text begins with an empty line before the header row.

# Sample rows of the articles table; column names are given by the header row.
articles_csv = """
identifier,title,publication_date,doi,abstract,author_count,affiliation_count,corpus
85133492759,"Metamodeling and Audio Signals Design Process, for the Encounter Between Sound and Changing Forms",2023-01-01,10.1007/978-3-031-09659-4_42,"...",3,2,True
85133293730,"Facility Layout Design in Textile MSMEs. Literature Review of Resilient Indicators",2023-01-01,10.1007/978-3-031-09360-9_23,"...",5,2,True
85132518705,"Optimal finite sample post-selection confidence distributions in generalized linear models",2023-01-01,10.1016/j.jspi.2022.06.001,"...",2,2,True
85112575431,"Corporate social responsibility, green innovation and competitiveness – causality in manufacturing",2022-12-19,10.1108/CR-12-2020-0160,"...",2,2,True
85109263966,"Analysing competing logics towards sustainable supplier management",2022-12-19,10.1108/SCM-07-2020-0354,"...",3,3,True
85123598971,"Lumbocostovertebral syndrome. A case report",2022-12-17,10.30944/20117582.907,"...",4,2,True
85125083244,"Identification and morphological characterization of marine actinomycetes as biocontrol agents of Fusarium solani in tomato",2022-12-16,10.47280/RevFacAgron(LUZ).v39.n1.15,"...",6,3,True
85133455001,"Meeting 24-h movement guidelines and markers of adiposity in adults from eight Latin America countries: the ELANS study",2022-12-01,10.1038/s41598-022-15504-z,"...",18,22,True
85133239609,"The close interaction between hypoxia-related proteins and metastasis in pancarcinomas",2022-12-01,10.1038/s41598-022-15246-y,"...",18,12,True
"""

# Sample rows of the authors table; column names are given by the header row.
authors_csv = """
identifier,first_name,last_name,auth_name,initials
57219054382,Jorge Edwin,Ormaza Andrade,Ormaza Andrade J.E.,J.E.
57192930404,Mario,Hurtado,Hurtado M.,M.
57192803433,Ruth Elizabeth,Minga-Vallejo,Minga-Vallejo R.E.,R.E.
57220465983,Carlos,Tapia,Tapia C.,C.
57215549671,Pelayo,Salinas-DeLeón,Salinas-DeLeón P.,P.
57209420893,Lorena,Chalco,Chalco L.,L.
6701484398,Emmanuelle,Quentin,Quentin E.,E.
7201831013,J. F.,Dumont,Dumont J.F.,J.F.
57223997475,Andrés Alejandro,Vaca,Vaca A.A.,A.A.
"""

# Sample rows of the article-author link table: one (article_id, author_id)
# pair per line.
links_csv = """
article_id,author_id
85133492759,57219054382
85133293730,57192930404
85133293730,57192803433
85133293730,57220465983
85133293730,57215549671
85133293730,57209420893
85132518705,6701484398
85112575431,7201831013
85109263966,57223997475
85123598971,57219054382
85123598971,57192930404
"""


In [None]:
# Directory that holds the three raw CSV exports. Kept in a single constant so
# the notebook can be pointed at another location by editing one line.
# NOTE(review): the path looks like a Google Drive mount inside Colab, so the
# drive presumably has to be mounted before this cell runs -- confirm.
DATA_DIR = "/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data"

# articles: one table of article metadata; authors: one table of author
# metadata; links: the article_id/author_id pairs joining the two.
articles = pd.read_csv(f"{DATA_DIR}/articles.csv")
authors = pd.read_csv(f"{DATA_DIR}/authors.csv")
links = pd.read_csv(f"{DATA_DIR}/articles_authors.csv")

In [None]:
# --- 2. Build an author-year panel with yearly publication counts -------------
# Attach article metadata to every (article, author) link. The left join keeps
# links whose article_id has no match in `articles`.
pairs = pd.merge(
    links,
    articles,
    how="left",
    left_on="article_id",
    right_on="identifier",
)

# Calendar year of each publication, parsed from the date column.
publication_dates = pd.to_datetime(pairs["publication_date"])
pairs["year"] = publication_dates.dt.year

# Number of link rows per (author, year), returned as a flat frame with the
# count stored in `pubs_in_year`.
author_year_counts = (
    pairs.groupby(["author_id", "year"])
    .size()
    .reset_index(name="pubs_in_year")
)

In [None]:
# --- 3. Expand to include prior-year cumulative totals ------------------------
# Find career start for each author (earliest year).
# NOTE(review): `min_years` is not read anywhere in this cell; it is kept only
# so that any other cell relying on the name keeps working.
min_years = author_year_counts.groupby("author_id")["year"].min().to_dict()

# Put each author's rows in chronological order so a running sum walks the
# years in sequence. This assumes one row per (author_id, year), which is what
# the groupby/size step that builds `author_year_counts` produces.
ordered_counts = author_year_counts.sort_values(["author_id", "year"]).reset_index(drop=True)

# Running total per author *including* the current year; subtracting the
# current year's count below leaves the total over strictly earlier years.
running_total = ordered_counts.groupby("author_id")["pubs_in_year"].cumsum()

# One row per (author, year):
#   prev_total_pubs -- publications in all earlier years for that author
#   pubs_this_year  -- publications in the year itself
# Built column-wise (no Python loop), so an empty input still yields a frame
# with all four columns.
# NOTE(review): only years in which an author has at least one publication
# appear here, so `pubs_this_year` is never 0 in this frame.
panel = ordered_counts[["author_id", "year"]].assign(
    prev_total_pubs=running_total - ordered_counts["pubs_in_year"],
    pubs_this_year=ordered_counts["pubs_in_year"],
)

In [None]:
# --- 4. Split into train (<=2022) and test (2023) -----------------------------
TEST_YEAR = 2023  # held-out year; every earlier year is training data

# .copy() so that later column assignments never write into a view of `panel`.
train = panel[panel["year"] <= TEST_YEAR - 1].copy()
observed_test = panel[panel["year"] == TEST_YEAR]

# Authors without a row in the test year get a synthetic one: zero publications
# and prev_total_pubs equal to everything they published in the training years.
# drop_duplicates keeps authors in order of first appearance in `panel`.
author_ids = panel["author_id"].drop_duplicates()
missing_ids = author_ids[~author_ids.isin(observed_test["author_id"])].to_numpy()

# Training-window publication total per author, computed once.
train_totals = train.groupby("author_id")["pubs_this_year"].sum()

# All synthetic rows are built in one frame and concatenated once, instead of
# growing `test` row by row.
filler_rows = pd.DataFrame({"author_id": missing_ids}).assign(
    year=TEST_YEAR,
    # Authors with no training-window rows have no history: total is 0.
    prev_total_pubs=train_totals.reindex(missing_ids).fillna(0).astype("int64").to_numpy(),
    pubs_this_year=0,
)

# Observed test-year rows first, synthetic rows after; the index is always
# renumbered from 0.
test = pd.concat([observed_test, filler_rows], ignore_index=True)

In [None]:
# --- 5. Fit a simple Poisson regression using scikit-learn --------------------
# Single predictor: the author's cumulative publication count before the year.
X_train = train.loc[:, ["prev_total_pubs"]]
# Target: number of publications in the year itself.
y_train = train.loc[:, "pubs_this_year"]

# alpha=0.0 switches the penalty term off; max_iter caps the solver iterations.
model = PoissonRegressor(max_iter=1000, alpha=0.0)
model.fit(X_train, y_train)

In [None]:
# --- 6. Predict for 2023 ------------------------------------------------------
# .assign returns a new frame instead of writing a column into `test` in place.
# When no synthetic rows were appended in the split step, `test` is a slice of
# `panel`, and in-place column assignment on a slice triggers pandas'
# SettingWithCopyWarning.
test = test.assign(
    predicted_pubs_2023=model.predict(test[["prev_total_pubs"]])
)

In [None]:
# --- 7. Show a compact result table ------------------------------------------
# Columns kept for display: author name, history, actual and predicted counts.
display_columns = [
    "first_name",
    "last_name",
    "prev_total_pubs",
    "pubs_this_year",
    "predicted_pubs_2023",
]

# Left join so every test row is kept even when the author has no metadata row.
result = pd.merge(
    test,
    authors,
    how="left",
    left_on="author_id",
    right_on="identifier",
)
result_table = result[display_columns]

In [None]:
# Display the table ordered by actual test-year publication count, highest first.
result_table.sort_values("pubs_this_year", ascending=False)

Unnamed: 0,first_name,last_name,prev_total_pubs,pubs_this_year,predicted_pubs_2023
0,Lorena,Siguenza-Guzman,40,1,2.178923
1,Franklin,Tigre-Ortega,9,1,1.654541
2,Pablo,Flores-Siguenza,2,1,1.554816
3,Andrea C.,Garcia-Angulo,1,1,1.541069
4,Freddy,Lema,2,1,1.554816
...,...,...,...,...,...
13081,Javier Gavilanes,Carrión,4,0,1.582679
13082,Joffre,Cartuche,1,0,1.541069
13083,Edison R.,Valencia-Nunez,5,0,1.596797
13084,Johnny,Novillo-Vicuña,2,0,1.554816
