In [12]:
!pip install prophet



In [13]:
import pandas as pd
from io import StringIO
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from prophet import Prophet

In [14]:
# Toy CSV data
authors_csv = """identifier,first_name,last_name,auth_name,initials
57219054382,Jorge Edwin,Ormaza Andrade,Ormaza Andrade J.E.,J.E.
57192930404,Mario,Hurtado,Hurtado M.,M.
57192803433,Ruth Elizabeth,Minga-Vallejo,Minga-Vallejo R.E.,R.E.
57220465983,Carlos,Tapia,Tapia C.,C.
57215549671,Pelayo,Salinas-DeLeón,Salinas-DeLeón P.,P.
57209420893,Lorena,Chalco,Chalco L.,L.
6701484398,Emmanuelle,Quentin,Quentin E.,E.
7201831013,J. F.,Dumont,Dumont J.F.,J.F.
57223997475,Andrés Alejandro,Vaca,Vaca A.A.,A.A.
"""

institutions_csv = """identifier,institution_name,city,country
57219054382,University A,Quito,Ecuador
57192930404,University A,Quito,Ecuador
57192803433,University A,Quito,Ecuador
57220465983,University A,Quito,Ecuador
57215549671,University B,Cuenca,Ecuador
57209420893,University B,Cuenca,Ecuador
6701484398,University C,Guayaquil,Ecuador
7201831013,University D,Paris,France
57223997475,University B,Cuenca,Ecuador
"""

articles_csv = """identifier,title,publication_date,doi,abstract,author_count,affiliation_count,corpus
85133492759,"Metamodeling and Audio Signals Design Process",2023-01-01,10.1007/978-3-031-09659-4_42,"...",3,2,True
85133293730,"Facility Layout Design",2023-01-01,10.1007/978-3-031-09360-9_23,"...",5,2,True
85132518705,"Optimal Confidence Distributions",2023-01-01,10.1016/j.jspi.2022.06.001,"...",2,2,True
85112575431,"Corporate Social Responsibility",2022-12-19,10.1108/CR-12-2020-0160,"...",2,2,True
85109263966,"Sustainable Supplier Management",2022-12-19,10.1108/SCM-07-2020-0354,"...",3,3,True
85123598971,"Lumbocostovertebral Syndrome",2022-12-17,10.30944/20117582.907,"...",4,2,True
"""

links_csv = """article_id,author_id
85133492759,57219054382
85133293730,57192930404
85133293730,57192803433
85133293730,57220465983
85133293730,57215549671
85133293730,57209420893
85132518705,6701484398
85112575431,7201831013
85109263966,57223997475
85123598971,57219054382
85123598971,57192930404
"""

# Load into DataFrames
# authors_df = pd.read_csv(StringIO(authors_csv))
# institutions_df = pd.read_csv(StringIO(institutions_csv))
# articles_df = pd.read_csv(StringIO(articles_csv), parse_dates=['publication_date'])
# links_df = pd.read_csv(StringIO(links_csv))

articles_df = pd.read_csv("/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data/articles.csv")
authors_df = pd.read_csv("/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data/authors.csv")
links_df = pd.read_csv("/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data/articles_authors.csv")
institutions_df = pd.read_csv("/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data/affiliations.csv")


In [17]:
# Convert 'publication_date' to datetime after loading the CSV
articles_df['publication_date'] = pd.to_datetime(articles_df['publication_date'])

# Join authors with their institutions
authors_inst_df = pd.merge(authors_df, institutions_df, on="identifier")

# Join links with authors + institutions
links_auth_inst = pd.merge(links_df, authors_inst_df, left_on="author_id", right_on="identifier")

# Join with articles
full_df = pd.merge(links_auth_inst, articles_df, left_on="article_id", right_on="identifier", suffixes=('_author', '_article'))

# Add year
full_df['pub_year'] = full_df['publication_date'].dt.year

# Group publications per year per institution
inst_yearly_counts = full_df.groupby(['affiliation_name', 'pub_year'])['article_id'].nunique().reset_index(name='num_publications')
display(inst_yearly_counts)

Unnamed: 0,affiliation_name,pub_year,num_publications


In [20]:
# Use University A for example
# Check if the institution name exists in the data
institution_name_to_forecast = 'Escuela Polotecnica Nacional'

if institution_name_to_forecast not in inst_yearly_counts['affiliation_name'].unique():
    print(f"Institution '{institution_name_to_forecast}' not found in the data.")
    print("Available institutions:")
    for inst in inst_yearly_counts['affiliation_name'].unique():
        print(f"- {inst}")
    # Exit or handle the case where the institution is not found
else:
    ua_data = inst_yearly_counts[inst_yearly_counts['affiliation_name'] == institution_name_to_forecast]
    ua_data = ua_data.set_index('pub_year').sort_index()

    # Check if there is enough data for ARIMA
    if len(ua_data) < 2: # ARIMA generally requires at least 2 points for differencing
        print(f"Not enough data for institution '{institution_name_to_forecast}' to fit ARIMA model.")
        print(f"Number of data points: {len(ua_data)}")
    else:
        # Fit ARIMA model
        # Changed the ARIMA order to (0, 1, 0) which might work with only two data points
        arima_model = ARIMA(ua_data['num_publications'], order=(0,1,0))
        arima_fit = arima_model.fit()
        forecast_arima = arima_fit.forecast(steps=2)

        # Plot
        plt.plot(ua_data.index, ua_data['num_publications'], marker='o', label='Actual')
        plt.plot(range(ua_data.index.max()+1, ua_data.index.max()+3), forecast_arima, marker='x', linestyle='--', label='Forecast')
        plt.title(f"ARIMA Forecast - {institution_name_to_forecast}")
        plt.xlabel("Year")
        plt.ylabel("Publications")
        plt.legend()
        plt.grid(True)
        plt.show()

    # Prepare data for Prophet
    # Prophet might be able to handle limited data, but it's still good to check
    if len(ua_data) == 0:
         print(f"No data available for institution '{institution_name_to_forecast}' to fit Prophet model.")
    else:
        df_prophet = ua_data.reset_index().rename(columns={'pub_year': 'ds', 'num_publications': 'y'})
        # Convert to datetime, ensuring a valid format
        df_prophet['ds'] = pd.to_datetime(df_prophet['ds'], format='%Y')


        # Fit and predict
        model = Prophet(yearly_seasonality=False)
        model.fit(df_prophet)
        # Ensure enough data points for forecasting with Prophet
        if len(df_prophet) > 0:
             future = model.make_future_dataframe(periods=2, freq='Y')
             forecast = model.predict(future)

             # Plot forecast
             fig = model.plot(forecast)
             plt.title(f"Prophet Forecast - {institution_name_to_forecast}")
             plt.xlabel("Year")
             plt.ylabel("Publications")
             plt.grid(True)
             plt.show()
        else:
             print(f"Not enough data points for institution '{institution_name_to_forecast}' to make future predictions with Prophet.")

Institution 'Escuela Polotecnica Nacional' not found in the data.
Available institutions:


In [21]:
# Prepare data for Prophet
df_prophet = ua_data.reset_index().rename(columns={'pub_year': 'ds', 'num_publications': 'y'})
df_prophet['ds'] = pd.to_datetime(df_prophet['ds'], format='%Y')

# Fit and predict
model = Prophet(yearly_seasonality=False)
model.fit(df_prophet)
future = model.make_future_dataframe(periods=2, freq='Y')
forecast = model.predict(future)

# Plot forecast
fig = model.plot(forecast)
plt.title("Prophet Forecast - University A")
plt.xlabel("Year")
plt.ylabel("Publications")
plt.grid(True)
plt.show()


ValueError: Dataframe has less than 2 non-NaN rows.