In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import pandas as pd
from io import StringIO

In [None]:
# Directory that holds the raw CSV files. Defined once so that relocating the
# data means editing a single line instead of four.
DATA_DIR = "/content/drive/MyDrive/Universidad/Tesis/DataScopus/AnalisisGrafos/raw_data"

# Load the four raw tables used by the rest of the notebook.
articles_df = pd.read_csv(f"{DATA_DIR}/articles.csv")
authors_df = pd.read_csv(f"{DATA_DIR}/authors.csv")
links_df = pd.read_csv(f"{DATA_DIR}/articles_authors.csv")  # joined later via its author_id / article_id columns
institutions_df = pd.read_csv(f"{DATA_DIR}/affiliations.csv")


In [None]:
# Parse 'publication_date' into datetimes so the year can be extracted further down.
articles_df['publication_date'] = pd.to_datetime(articles_df['publication_date'])

# Join authors with their institutions.
# NOTE(review): both frames are matched on a column that is simply called
# "identifier". If that column holds the author id in authors_df and the
# affiliation id in institutions_df, this inner join matches nothing and every
# later step is empty -- confirm the intended key against the CSV schemas.
authors_inst_df = pd.merge(authors_df, institutions_df, on="identifier")

# Attach author + institution attributes to every article/author link row.
links_auth_inst = pd.merge(links_df, authors_inst_df, left_on="author_id", right_on="identifier")

# Attach article attributes; columns present on both sides get the
# '_author' / '_article' suffixes.
full_df = pd.merge(links_auth_inst, articles_df, left_on="article_id", right_on="identifier", suffixes=('_author', '_article'))

# pd.merge performs an inner join by default, so unmatched rows vanish silently.
# Report the first stage that ended up empty so a bad join key is visible here
# rather than surfacing later as an unexplained "no data" result.
join_stages = (
    ("authors + institutions", authors_inst_df),
    ("links + authors/institutions", links_auth_inst),
    ("links/authors/institutions + articles", full_df),
)
for stage_label, stage_df in join_stages:
    if stage_df.empty:
        print(f"Warning: the '{stage_label}' join produced 0 rows; check its join keys.")
        break

# Publication year of each article.
full_df['pub_year'] = full_df['publication_date'].dt.year

# Number of distinct articles per institution and year.
inst_yearly_counts = (
    full_df
    .groupby(['affiliation_name', 'pub_year'])['article_id']
    .nunique()
    .reset_index(name='num_publications')
)
display(inst_yearly_counts)

Unnamed: 0,affiliation_name,pub_year,num_publications


In [None]:
# Institution whose yearly publication counts are modelled. The value must match
# an entry of 'affiliation_name' exactly; the available names are printed below
# to help pick one. The same constant drives the plot title, so the figure can
# never be labelled with a different institution than the one that was filtered.
INSTITUTION_NAME = 'Universidad Estatal de Bolivar'

# Years for which a forecast is produced.
FORECAST_YEARS = [2024, 2025]

print(inst_yearly_counts['affiliation_name'].unique())

# Yearly counts of the selected institution, in chronological order.
ua_data = inst_yearly_counts[inst_yearly_counts['affiliation_name'] == INSTITUTION_NAME]
ua_data = ua_data.sort_values('pub_year')

# scikit-learn expects a 2-D feature matrix of shape (n_samples, n_features).
X = ua_data['pub_year'].values.reshape(-1, 1)
y = ua_data['num_publications'].values

if len(X) == 0:
    print("No data found for the specified institution.")
elif len(X) < 2:
    # A single (year, count) point does not determine a slope, so any
    # "forecast" fitted to it would be meaningless.
    print(f"Only one yearly data point found for {INSTITUTION_NAME}; at least two are needed to fit a trend.")
else:
    # Fit publications as a linear function of the year.
    lr_model = LinearRegression()
    lr_model.fit(X, y)

    # Predict for future years
    future_years = np.array(FORECAST_YEARS).reshape(-1, 1)
    future_preds = lr_model.predict(future_years)

    # Plot the observed counts together with the forecast.
    plt.plot(ua_data['pub_year'], y, marker='o', label='Actual')
    plt.plot(future_years.flatten(), future_preds, marker='x', linestyle='--', label='Forecast')
    plt.title(f"Linear Regression Forecast - {INSTITUTION_NAME}")
    plt.xlabel("Year")
    plt.ylabel("Number of Publications")
    plt.grid(True)
    plt.legend()
    plt.show()

    # Print the predicted values
    for year, pred in zip(future_years.flatten(), future_preds):
        print(f"Predicted publications in {year}: {pred:.2f}")

[]
No data found for the specified institution.
