In [2]:
import os
from google.cloud import bigquery
from google.cloud import storage
from prophet import Prophet
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "ethereal-accord-397414-944cb605214b.json"

  from .autonotebook import tqdm as notebook_tqdm


In [1]:
# Definir nombre del Dataset
dataset_id = "Migraciones"
project_id = "ethereal-accord-397414"

# Obtencion de datos de Big Query

In [3]:
def leer_tabla_bq(dataset_id, table_id, project_id):
    """
    Leer tabla desde Big Query
    Parametros nombre del proyecto, dataset y de la tabla.
    Retorna un DataFrame con todos los datos de la tabla.
    """
    client = bigquery.Client()

    # Define el ID completo de la tabla en BigQuery
    table_ref = client.dataset(dataset_id, project=project_id).table(table_id)

    # Load the table
    table = client.get_table(table_ref)

    # Get the data as a pandas DataFrame
    df = client.list_rows(table).to_dataframe()

    return df

Obtengo todos los datos de las tablas en distintos DataFrames.

In [4]:
df_migracion = leer_tabla_bq(dataset_id, 'migracion', project_id)
df_pais = leer_tabla_bq(dataset_id, 'pais', project_id)

# Creacion de DataFrame para trabajar
### Merge

In [5]:
df_tot = df_migracion.merge(right=df_pais, how='left', on='id_pais')
df_tot.head()

Unnamed: 0,id_migracion,id_inmigracion,id_fac_soc,id_fac_eco,id_pais,migracion_neta,migracion_neta_pred,anio,nombre
0,20160002,,20160002,20160002,2,16,32,2016-12-31,Antigua y Barbuda
1,20160001,,20160001,20160001,1,5215,4859,2016-12-31,Argentina
2,20160004,,20160004,20160004,4,1283,1146,2016-12-31,Belice
3,20160005,,20160005,20160005,5,-10071,-9632,2016-12-31,Bolivia
4,20160006,,20160006,20160006,6,82589,46414,2016-12-31,Brasil


In [6]:
# Dropear columnas de ID que no precisamos ya que no brindan ninguna informacion.
df_tot.drop(columns=['id_inmigracion', 'id_fac_soc', 'id_fac_eco', 'migracion_neta_pred'], inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,nombre
0,20160002,2,16,2016-12-31,Antigua y Barbuda
1,20160001,1,5215,2016-12-31,Argentina
2,20160004,4,1283,2016-12-31,Belice
3,20160005,5,-10071,2016-12-31,Bolivia
4,20160006,6,82589,2016-12-31,Brasil


In [7]:
# Renombrar columnas para facilidad
df_tot.rename(columns={'nombre': 'pais'}, inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
0,20160002,2,16,2016-12-31,Antigua y Barbuda
1,20160001,1,5215,2016-12-31,Argentina
2,20160004,4,1283,2016-12-31,Belice
3,20160005,5,-10071,2016-12-31,Bolivia
4,20160006,6,82589,2016-12-31,Brasil


In [8]:
# Transfomrar Anio en Año real.
df_tot['anio'] = pd.to_datetime(df_tot['anio'], format="%Y-%m-%d")
df_tot['anio'].head()

0   2016-12-31
1   2016-12-31
2   2016-12-31
3   2016-12-31
4   2016-12-31
Name: anio, dtype: datetime64[ns]

In [9]:
df_tot.sort_values(by='anio', inplace=True)
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
909,19900017,17,-15350,1990-12-31,Honduras
891,19900002,2,-1145,1990-12-31,Antigua y Barbuda
892,19900001,1,6021,1990-12-31,Argentina
893,19900003,3,241,1990-12-31,Bahamas
911,19900021,21,-286584,1990-12-31,Mexico


In [10]:
df_tot.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1140 entries, 909 to 1042
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   id_migracion    990 non-null    Int64         
 1   id_pais         1140 non-null   Int64         
 2   migracion_neta  960 non-null    Int64         
 3   anio            1140 non-null   datetime64[ns]
 4   pais            1140 non-null   object        
dtypes: Int64(3), datetime64[ns](1), object(1)
memory usage: 56.8+ KB


**Estudiar NaNs que quedan**

In [11]:
df_tot.isna().sum()

id_migracion      150
id_pais             0
migracion_neta    180
anio                0
pais                0
dtype: int64

In [12]:
df_tot.tail()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
1046,,6,,2027-12-31,Brasil
1045,,5,,2027-12-31,Bolivia
1044,,4,,2027-12-31,Belice
1041,,2,,2027-12-31,Antigua y Barbuda
1042,,1,,2027-12-31,Argentina


In [13]:
# Quedan muchos NaN en todas las variables en 2022 asi que lo sacamos
df_tot = df_tot[df_tot['anio'] < ('2022-01-01')]

In [15]:
df_tot.isna().sum()

id_migracion      0
id_pais           0
migracion_neta    0
anio              0
pais              0
dtype: int64

In [16]:
df_tot.tail()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
1103,20210012,12,0,2021-12-31,Santa Lucia
116,20210001,1,2344,2021-12-31,Argentina
120,20210006,6,20376,2021-12-31,Brasil
117,20210003,3,485,2021-12-31,Bahamas
1102,20210002,2,0,2021-12-31,Antigua y Barbuda


In [17]:
df_tot[df_tot['pais'] == 'Argentina'].head(35)

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
892,19900001,1,6021,1990-12-31,Argentina
376,19910001,1,-8569,1991-12-31,Argentina
747,19920001,1,-13201,1992-12-31,Argentina
231,19930001,1,-16383,1993-12-31,Argentina
607,19940001,1,-18725,1994-12-31,Argentina
87,19950001,1,-19736,1995-12-31,Argentina
491,19960001,1,-20396,1996-12-31,Argentina
862,19970001,1,-20588,1997-12-31,Argentina
346,19980001,1,-20331,1998-12-31,Argentina
717,19990001,1,-20220,1999-12-31,Argentina


# Prophet

Preparar DataFrame para Prophet.

Dataframe must have columns "ds" and "y" with the dates and values respectively.

In [18]:
paises = df_tot['id_pais'].unique()
paises

<IntegerArray>
[17,  2,  1,  3, 21, 22, 23, 25, 24, 12, 27, 29, 28, 30,  4, 19,  6,  5, 18,
 16, 15, 26, 13, 14, 11, 10,  9,  8,  7, 20]
Length: 30, dtype: Int64

In [19]:
def predecir(data, columna, periodo, scale, idpais):
    
    # Filtro por pais.
    df_pais = data[data['id_pais'] == idpais]

    # Renombrar las columnas para que sean compatibles con Prophet.
    df_prophet = pd.DataFrame()
    df_prophet['ds'] = df_pais['anio']
    df_prophet['y'] = df_pais[columna]

    # Crear un modelo Prophet
    model = Prophet(changepoint_prior_scale=scale)

    # Ajustar el modelo a los datos
    model.fit(df_prophet)

    # Crear un DataFrame con las fechas futuras que deseas predecir
    future = model.make_future_dataframe(periods=periodo, freq='Y')

    # Realizar las predicciones
    forecast = model.predict(future)

    return forecast[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]

In [20]:
df_pred = pd.DataFrame(columns=['ds', 'yhat', 'id_pais'])

for p in paises:
    df_forcast = predecir(df_tot, 'migracion_neta', 5, 2, p)
    df_forcast = df_forcast[['ds', 'yhat']]
    df_forcast['id_pais'] = p
    df_pred = df_pred.merge(df_forcast, how='outer')

15:49:52 - cmdstanpy - INFO - Chain [1] start processing
15:49:52 - cmdstanpy - INFO - Chain [1] done processing
15:49:52 - cmdstanpy - INFO - Chain [1] start processing
15:49:53 - cmdstanpy - INFO - Chain [1] done processing
15:49:53 - cmdstanpy - INFO - Chain [1] start processing
15:49:53 - cmdstanpy - INFO - Chain [1] done processing
15:49:53 - cmdstanpy - INFO - Chain [1] start processing
15:49:54 - cmdstanpy - INFO - Chain [1] done processing
15:49:54 - cmdstanpy - INFO - Chain [1] start processing
15:49:55 - cmdstanpy - INFO - Chain [1] done processing
15:49:55 - cmdstanpy - INFO - Chain [1] start processing
15:49:55 - cmdstanpy - INFO - Chain [1] done processing
15:49:55 - cmdstanpy - INFO - Chain [1] start processing
15:49:56 - cmdstanpy - INFO - Chain [1] done processing
15:49:56 - cmdstanpy - INFO - Chain [1] start processing
15:49:56 - cmdstanpy - INFO - Chain [1] done processing
15:49:57 - cmdstanpy - INFO - Chain [1] start processing
15:49:57 - cmdstanpy - INFO - Chain [1]

In [21]:
df_pred.rename(columns={'ds': 'anio', 'yhat': 'migracion_neta_pred'}, inplace=True)
df_pred.head()

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352.675042,17
1,1991-12-31,-15521.211957,17
2,1992-12-31,-15568.323794,17
3,1993-12-31,-15559.075368,17
4,1994-12-31,-15246.074408,17


In [22]:
df_pred.head(50)

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352.675042,17
1,1991-12-31,-15521.211957,17
2,1992-12-31,-15568.323794,17
3,1993-12-31,-15559.075368,17
4,1994-12-31,-15246.074408,17
5,1995-12-31,-14711.073509,17
6,1996-12-31,-14049.076315,17
7,1997-12-31,-13343.075413,17
8,1998-12-31,-12626.247103,17
9,1999-12-31,-11756.858749,17


In [23]:
# Pasar prediccion a valores enteros
df_pred['migracion_neta_pred'] =  df_pred['migracion_neta_pred'].astype(int)

In [24]:
df_pred.head(50)

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352,17
1,1991-12-31,-15521,17
2,1992-12-31,-15568,17
3,1993-12-31,-15559,17
4,1994-12-31,-15246,17
5,1995-12-31,-14711,17
6,1996-12-31,-14049,17
7,1997-12-31,-13343,17
8,1998-12-31,-12626,17
9,1999-12-31,-11756,17


In [25]:
df_pred.head()

Unnamed: 0,anio,migracion_neta_pred,id_pais
0,1990-12-31,-15352,17
1,1991-12-31,-15521,17
2,1992-12-31,-15568,17
3,1993-12-31,-15559,17
4,1994-12-31,-15246,17


In [26]:
df_pred.tail()

Unnamed: 0,anio,migracion_neta_pred,id_pais
1105,2022-12-31,-17754,20
1106,2023-12-31,-16369,20
1107,2024-12-31,-12946,20
1108,2025-12-31,-11551,20
1109,2026-12-31,-10161,20


In [27]:
df_tot.head()

Unnamed: 0,id_migracion,id_pais,migracion_neta,anio,pais
909,19900017,17,-15350,1990-12-31,Honduras
891,19900002,2,-1145,1990-12-31,Antigua y Barbuda
892,19900001,1,6021,1990-12-31,Argentina
893,19900003,3,241,1990-12-31,Bahamas
911,19900021,21,-286584,1990-12-31,Mexico


In [31]:
df_mig_new = df_pred.merge(df_tot, how='outer', on=['id_pais', 'anio'])
df_mig_new[df_mig_new['id_pais'] == 1]

Unnamed: 0,anio,migracion_neta_pred,id_pais,id_migracion,migracion_neta,pais
74,1990-12-31,5937,1,19900001.0,6021.0,Argentina
75,1991-12-31,-8485,1,19910001.0,-8569.0,Argentina
76,1992-12-31,-13200,1,19920001.0,-13201.0,Argentina
77,1993-12-31,-16382,1,19930001.0,-16383.0,Argentina
78,1994-12-31,-18724,1,19940001.0,-18725.0,Argentina
79,1995-12-31,-19702,1,19950001.0,-19736.0,Argentina
80,1996-12-31,-20463,1,19960001.0,-20396.0,Argentina
81,1997-12-31,-20498,1,19970001.0,-20588.0,Argentina
82,1998-12-31,-20377,1,19980001.0,-20331.0,Argentina
83,1999-12-31,-20293,1,19990001.0,-20220.0,Argentina
