In [262]:
import os
from google.cloud import bigquery
from google.cloud import storage
from prophet import Prophet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "ethereal-accord-397414-944cb605214b.json"

In [263]:
# Definir nombre del Dataset
dataset_id = "Migraciones"
project_id = "ethereal-accord-397414"

# Obtencion de datos de Big Query

In [264]:
def leer_tabla_bq(dataset_id, table_id, project_id):
    """
    Leer tabla desde Big Query
    Parametros nombre del proyecto, dataset y de la tabla.
    Retorna un DataFrame con todos los datos de la tabla.
    """
    client = bigquery.Client()

    # Define el ID completo de la tabla en BigQuery
    table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

    # Load the table
    table = client.get_table(table_ref)

    # Get the data as a pandas DataFrame
    df = client.list_rows(table).to_dataframe()

    return df

Obtengo todos los datos de las tablas en distintos DataFrames.

In [265]:
df_migracion = leer_tabla_bq(dataset_id, 'migracion', project_id)
df_pais = leer_tabla_bq(dataset_id, 'pais', project_id)

# Creacion de DataFrame para trabajar
### Merge

In [266]:
df_tot = df_migracion.merge(right=df_pais, how='left', on='id_pais')
df_tot.head()

Unnamed: 0,id_migracion,id_inmigracion,id_fac_soc,id_fac_eco,id_pais,migracion_neta,migracion_neta_pred,anio,nombre
0,20160002,,20160002,20160002,2,16,32,2016-12-31,Antigua y Barbuda
1,20160001,,20160001,20160001,1,5215,4859,2016-12-31,Argentina
2,20160004,,20160004,20160004,4,1283,1146,2016-12-31,Belice
3,20160005,,20160005,20160005,5,-10071,-9632,2016-12-31,Bolivia
4,20160006,,20160006,20160006,6,82589,46414,2016-12-31,Brasil


In [267]:
# Dropear columnas de ID que no precisamos ya que no brindan ninguna informacion.
df_tot.drop(columns=['id_inmigracion', 'id_fac_soc', 'id_fac_eco', 'migracion_neta_pred'], inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,nombre
0,20160002,2,16,2016-12-31,Antigua y Barbuda
1,20160001,1,5215,2016-12-31,Argentina
2,20160004,4,1283,2016-12-31,Belice
3,20160005,5,-10071,2016-12-31,Bolivia
4,20160006,6,82589,2016-12-31,Brasil


In [268]:
# Renombrar columnas para facilidad
df_tot.rename(columns={'nombre': 'pais'}, inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
0,20160002,2,16,2016-12-31,Antigua y Barbuda
1,20160001,1,5215,2016-12-31,Argentina
2,20160004,4,1283,2016-12-31,Belice
3,20160005,5,-10071,2016-12-31,Bolivia
4,20160006,6,82589,2016-12-31,Brasil


In [269]:
# Transfomrar Anio en Año real.
df_tot['anio'] = pd.to_datetime(df_tot['anio'], format="%Y-%m-%d")
df_tot['anio'].head()

0   2016-12-31
1   2016-12-31
2   2016-12-31
3   2016-12-31
4   2016-12-31
Name: anio, dtype: datetime64[ns]

In [270]:
df_tot.sort_values(by='anio', inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
909,19900017,17,-15350,1990-12-31,Honduras
891,19900002,2,-1145,1990-12-31,Antigua y Barbuda
892,19900001,1,6021,1990-12-31,Argentina
893,19900003,3,241,1990-12-31,Bahamas
911,19900021,21,-286584,1990-12-31,Mexico


In [271]:
df_tot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 990 entries, 909 to 934
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id_migracion    990 non-null    Int64         
 1   id_pais         990 non-null    Int64         
 2   migracion_neta  960 non-null    Int64         
 3   anio            990 non-null    datetime64[ns]
 4   pais            990 non-null    object        
dtypes: Int64(3), datetime64[ns](1), object(1)
memory usage: 49.3+ KB


**Estudiar NaNs que quedan**

In [272]:
df_tot.isna().sum()

id_migracion       0
id_pais            0
migracion_neta    30
anio               0
pais               0
dtype: int64

In [273]:
df_tot.tail()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
942,20220022,22,,2022-12-31,Nicaragua
943,20220023,23,,2022-12-31,Panama
944,20220025,25,,2022-12-31,Paraguay
946,20220012,12,,2022-12-31,Santa Lucia
934,20220026,26,,2022-12-31,El Salvador


In [274]:
# Quedan muchos NaN en todas las variables en 2022 asi que lo sacamos
df_tot = df_tot[df_tot['anio'] < ('2022-01-01')]

In [275]:
df_tot.isna().sum()

id_migracion      0
id_pais           0
migracion_neta    0
anio              0
pais              0
dtype: int64

In [276]:
df_tot.tail()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
116,20210001,1,2344,2021-12-31,Argentina
117,20210003,3,485,2021-12-31,Bahamas
118,20210004,4,577,2021-12-31,Belice
953,20210012,12,0,2021-12-31,Santa Lucia
139,20210024,24,68012,2021-12-31,Peru


In [277]:
df_tot[df_tot['pais'] == 'Argentina'].head(35)

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
892,19900001,1,6021,1990-12-31,Argentina
376,19910001,1,-8569,1991-12-31,Argentina
747,19920001,1,-13201,1992-12-31,Argentina
231,19930001,1,-16383,1993-12-31,Argentina
607,19940001,1,-18725,1994-12-31,Argentina
87,19950001,1,-19736,1995-12-31,Argentina
491,19960001,1,-20396,1996-12-31,Argentina
862,19970001,1,-20588,1997-12-31,Argentina
346,19980001,1,-20331,1998-12-31,Argentina
717,19990001,1,-20220,1999-12-31,Argentina


# Prophet

Preparar DataFrame para Prophet.

Dataframe must have columns "ds" and "y" with the dates and values respectively.

In [278]:
paises = df_tot['id_pais'].unique()
paises

<IntegerArray>
[17,  2,  1,  3, 21, 22, 23, 25, 24, 12, 27, 29, 28, 30,  4, 19,  6,  5, 18,
 16, 15, 26, 13, 14, 11, 10,  9,  8,  7, 20]
Length: 30, dtype: Int64

In [279]:
def predecir(data, columna, periodo, scale, idpais):
    
    # Filtro por pais.
    df_pais = data[data['id_pais'] == idpais]

    # Renombrar las columnas para que sean compatibles con Prophet.
    df_prophet = pd.DataFrame()
    df_prophet['ds'] = df_pais['anio']
    df_prophet['y'] = df_pais[columna]

    # Crear un modelo Prophet
    model = Prophet(changepoint_prior_scale=scale)

    # Ajustar el modelo a los datos
    model.fit(df_prophet)

    # Crear un DataFrame con las fechas futuras que deseas predecir
    future = model.make_future_dataframe(periods=periodo, freq='Y')

    # Realizar las predicciones
    forecast = model.predict(future)

    return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [280]:
df_pred = pd.DataFrame(columns=['ds', 'yhat', 'id_pais'])

for p in paises:
    df_forcast = predecir(df_tot, 'migracion_neta', 5, 2, p)
    df_forcast = df_forcast[['ds', 'yhat']]
    df_forcast['id_pais'] = p
    df_pred = df_pred.merge(df_forcast, how='outer')

15:14:37 - cmdstanpy - INFO - Chain [1] start processing
15:14:38 - cmdstanpy - INFO - Chain [1] done processing
15:14:38 - cmdstanpy - INFO - Chain [1] start processing
15:14:38 - cmdstanpy - INFO - Chain [1] done processing
15:14:38 - cmdstanpy - INFO - Chain [1] start processing
15:14:39 - cmdstanpy - INFO - Chain [1] done processing
15:14:39 - cmdstanpy - INFO - Chain [1] start processing
15:14:39 - cmdstanpy - INFO - Chain [1] done processing
15:14:40 - cmdstanpy - INFO - Chain [1] start processing
15:14:40 - cmdstanpy - INFO - Chain [1] done processing
15:14:40 - cmdstanpy - INFO - Chain [1] start processing
15:14:41 - cmdstanpy - INFO - Chain [1] done processing
15:14:41 - cmdstanpy - INFO - Chain [1] start processing
15:14:41 - cmdstanpy - INFO - Chain [1] done processing
15:14:42 - cmdstanpy - INFO - Chain [1] start processing
15:14:42 - cmdstanpy - INFO - Chain [1] done processing
15:14:42 - cmdstanpy - INFO - Chain [1] start processing
15:14:43 - cmdstanpy - INFO - Chain [1]

In [281]:
df_pred.rename(columns={'ds': 'anio', 'yhat': 'migracion_neta_pred'}, inplace=True)
df_pred.head()

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352.675042,17
1,1991-12-31,-15521.211957,17
2,1992-12-31,-15568.323794,17
3,1993-12-31,-15559.075368,17
4,1994-12-31,-15246.074408,17


In [282]:
df_pred.head(50)

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352.675042,17
1,1991-12-31,-15521.211957,17
2,1992-12-31,-15568.323794,17
3,1993-12-31,-15559.075368,17
4,1994-12-31,-15246.074408,17
5,1995-12-31,-14711.073509,17
6,1996-12-31,-14049.076315,17
7,1997-12-31,-13343.075413,17
8,1998-12-31,-12626.247103,17
9,1999-12-31,-11756.858749,17


In [283]:
# Pasar prediccion a valores enteros
df_pred['migracion_neta_pred'] =  df_pred['migracion_neta_pred'].astype(int)

In [284]:
df_pred.head(50)

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352,17
1,1991-12-31,-15521,17
2,1992-12-31,-15568,17
3,1993-12-31,-15559,17
4,1994-12-31,-15246,17
5,1995-12-31,-14711,17
6,1996-12-31,-14049,17
7,1997-12-31,-13343,17
8,1998-12-31,-12626,17
9,1999-12-31,-11756,17


In [242]:
# Crear codigo migracion para poder hacer el merge
# df_pred['id_migracion'] =  10000 * df_pred['anio'].dt.year + df_pred['id_pais']

In [286]:
df_pred.head()

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352,17
1,1991-12-31,-15521,17
2,1992-12-31,-15568,17
3,1993-12-31,-15559,17
4,1994-12-31,-15246,17


In [285]:
# df_pred = df_pred[['id_migracion', 'migracion_neta_pred']]
df_pred.tail()

Unnamed: 0,anio,migracion_neta_pred,id_pais
1105,2022-12-31,-17754,20
1106,2023-12-31,-16369,20
1107,2024-12-31,-12946,20
1108,2025-12-31,-11551,20
1109,2026-12-31,-10161,20


In [287]:
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
909,19900017,17,-15350,1990-12-31,Honduras
891,19900002,2,-1145,1990-12-31,Antigua y Barbuda
892,19900001,1,6021,1990-12-31,Argentina
893,19900003,3,241,1990-12-31,Bahamas
911,19900021,21,-286584,1990-12-31,Mexico


In [296]:
# df_mig_new = df_tot.merge(df_pred, how='outer', on=['id_pais', 'anio'])
df_mig_new = df_pred.merge(df_tot, how='outer', on=['id_pais', 'anio'])
df_mig_new[df_mig_new['id_pais'] == 2]
# df_mig_new.head(40)

Unnamed: 0,anio,migracion_neta_pred,id_pais,id_migracion,migracion_neta,pais
37,1990-12-31,-1138,2,19900002.0,-1145.0,Antigua y Barbuda
38,1991-12-31,216,2,19910002.0,223.0,Antigua y Barbuda
39,1992-12-31,404,2,19920002.0,405.0,Antigua y Barbuda
40,1993-12-31,483,2,19930002.0,484.0,Antigua y Barbuda
41,1994-12-31,512,2,19940002.0,513.0,Antigua y Barbuda
42,1995-12-31,524,2,19950002.0,528.0,Antigua y Barbuda
43,1996-12-31,484,2,19960002.0,478.0,Antigua y Barbuda
44,1997-12-31,489,2,19970002.0,493.0,Antigua y Barbuda
45,1998-12-31,464,2,19980002.0,465.0,Antigua y Barbuda
46,1999-12-31,389,2,19990002.0,390.0,Antigua y Barbuda


---
---
---

In [None]:
paises = df_tot['pais'].unique()
paises

array(['Honduras', 'Antigua y Barbuda', 'Argentina', 'Bahamas', 'Mexico',
       'Nicaragua', 'Panama', 'Paraguay', 'Peru', 'Santa Lucia',
       'Trinidad y Tobago', 'Estados Unidos', 'Uruguay', 'Venezuela',
       'Belice', 'Jamaica', 'Brasil', 'Bolivia', 'Haiti', 'Guyana',
       'Guatemala', 'El Salvador', 'Ecuador', 'Granada', 'Cuba',
       'Costa Rica', 'Colombia', 'Chile', 'Canada',
       'Republica Dominicana'], dtype=object)

In [None]:
def predecir(data, columna, periodo, scale, pais):
    
    # Filtro por pais.
    df_pais = data[data['pais'] == pais]

    # Renombrar las columnas para que sean compatibles con Prophet.
    df_prophet = pd.DataFrame()
    df_prophet['ds'] = df_pais['anio']
    df_prophet['y'] = df_pais[columna]

    # Crear un modelo Prophet
    model = Prophet(changepoint_prior_scale=scale)

    # Ajustar el modelo a los datos
    model.fit(df_prophet)

    # Crear un DataFrame con las fechas futuras que deseas predecir
    future = model.make_future_dataframe(periods=periodo, freq='Y')

    # Realizar las predicciones
    forecast = model.predict(future)

    return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [None]:
df_pred = pd.DataFrame(columns=['ds', 'yhat', 'pais'])

for p in paises:
# for p in ['Argentina', 'Uruguay']:

    df_forcast = predecir(df_tot, 'migracion_neta', 10, 2, p)
    df_forcast = df_forcast[['ds', 'yhat']]
    df_forcast['pais'] = p
    df_pred = df_pred.merge(df_forcast, how='outer')

12:23:53 - cmdstanpy - INFO - Chain [1] start processing
12:23:53 - cmdstanpy - INFO - Chain [1] done processing
12:23:54 - cmdstanpy - INFO - Chain [1] start processing
12:23:54 - cmdstanpy - INFO - Chain [1] done processing
12:23:54 - cmdstanpy - INFO - Chain [1] start processing
12:23:55 - cmdstanpy - INFO - Chain [1] done processing
12:23:55 - cmdstanpy - INFO - Chain [1] start processing
12:23:55 - cmdstanpy - INFO - Chain [1] done processing
12:23:56 - cmdstanpy - INFO - Chain [1] start processing
12:23:56 - cmdstanpy - INFO - Chain [1] done processing
12:23:56 - cmdstanpy - INFO - Chain [1] start processing
12:23:57 - cmdstanpy - INFO - Chain [1] done processing
12:23:57 - cmdstanpy - INFO - Chain [1] start processing
12:23:57 - cmdstanpy - INFO - Chain [1] done processing
12:23:58 - cmdstanpy - INFO - Chain [1] start processing
12:23:58 - cmdstanpy - INFO - Chain [1] done processing
12:23:58 - cmdstanpy - INFO - Chain [1] start processing
12:23:59 - cmdstanpy - INFO - Chain [1]

In [None]:
df_pred.head(50)

Unnamed: 0,ds,yhat,pais
0,1990-12-31,-15352.675042,Honduras
1,1991-12-31,-15521.211957,Honduras
2,1992-12-31,-15568.323794,Honduras
3,1993-12-31,-15559.075368,Honduras
4,1994-12-31,-15246.074408,Honduras
5,1995-12-31,-14711.073509,Honduras
6,1996-12-31,-14049.076315,Honduras
7,1997-12-31,-13343.075413,Honduras
8,1998-12-31,-12626.247103,Honduras
9,1999-12-31,-11756.858749,Honduras
