# **ETL**

---

### Extracción de datos

In [2]:
# Librerias a utilizar
import requests
import pandas as pd
import calendar
import io
import datetime
import json
import re
from bs4 import BeautifulSoup

In [297]:
# Extraemos los datos mediante una API 

# Creamos lista donde almacenaremos todos los datos
all_data = []

# Iteramos para extraer los datos de los años de interes, en este caso de 1950 a 2023
for i in range(1970, 2023):
    year = i
    for mes in range (1, 13):
        # Tomamos el último dia de cada mes
        last_day = calendar.monthrange(year, mes)[1]
        starttime = f"{year}-{mes}-01"
        endtime = f"{year}-{mes}-{last_day}"
        # El url de la api con los parámetros, donde filtramos buscando magnitudes mayores a 4
        url = f"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={starttime}&endtime={endtime}&minmagnitude=4"
        # Envía la solicitud HTTP a la API de USGS y procesa los datos de respuesta
        response = requests.get(url)
        data = json.loads(response.text)
        data = data["features"]
                # Itera sobre cada objeto "Feature" en la lista "data"
        for feature in data:
                # Accede a la parte "properties" de cada objeto
            properties = feature['properties']
            properties['year'] = year
            properties['month'] = mes

            all_data.append(properties)
    # Crea un dataframe de Pandas a partir de la lista de eventos sísmicos
    
            

# Realizamos la misma extracción, pero para el año 2023 de forma actualizada siempre al tiempo actual
# Tomamos el mes actual
mes_actual = datetime.datetime.now().month
# Iteramos por mes, con tope en el mes actual
for mes in range (1, mes_actual +1):
    # Tomamos el último dia de cada mes
    last_day = calendar.monthrange(2023, mes)[1]
    starttime = f"2023-{mes}-01"
    endtime = f"2023-{mes}-{last_day}"
    url = f"https://earthquake.usgs.gov/fdsnws/event/1/query?format=geojson&starttime={starttime}&endtime={endtime}&minmagnitude=4"
    response = requests.get(url)
    data = json.loads(response.text)
    data = data["features"]
    # Itera sobre cada objeto "Feature" en la lista "data"
    for feature in data:
     # Accede a la parte "properties" de cada objeto
        properties = feature['properties']
        properties['year'] = 2023
        properties['month'] = mes
    # Crea un dataframe de Pandas a partir de la lista de eventos sísmicos
        all_data.append(properties)
            
# Concatenamos todos los dataframes en uno solo
df_total = pd.DataFrame(all_data)

In [313]:
# Observamos
df_total

Unnamed: 0,mag,place,tz,felt,cdi,mmi,alert,status,tsunami,sig,...,code,types,nst,dmin,rms,gap,magType,type,year,month
0,4.09,"65km ENE of Beatty, NV",,,,,,reviewed,0,257,...,3324919,",origin,phase-data,",13.0,1.749,0.31,253.0,ml,nuclear explosion,1970,1
1,5.90,"59 km NNE of Port-Olry, Vanuatu",,,,3.894,,reviewed,0,536,...,800507,",origin,shakemap,",,,,,mw,earthquake,1970,1
2,6.14,Fiji region,,,,,,reviewed,0,580,...,800471,",origin,",,,,,mw,earthquake,1970,1
3,5.27,"195 km WSW of Linxia Chengguanzhen, China",,,,,,reviewed,0,427,...,800431,",origin,",,,,,mw,earthquake,1970,1
4,5.77,"38 km SSE of San Cristóbal, Venezuela",,,,6.276,,reviewed,0,512,...,800429,",origin,shakemap,",,,,,mw,earthquake,1970,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461654,5.30,"82 km ESE of Katsuren-haebaru, Japan",,2.0,2.7,,,reviewed,0,433,...,7000jxaa,",dyfi,internal-moment-tensor,moment-tensor,ori...",74.0,0.909,0.83,64.0,mww,earthquake,2023,5
461655,5.10,"Ryukyu Islands, Japan",,,,,,reviewed,0,400,...,7000jxab,",origin,phase-data,",77.0,0.888,1.17,64.0,mb,earthquake,2023,5
461656,4.30,"45 km NNE of Klyuchi, Russia",,,,,,reviewed,0,284,...,7000jxa6,",origin,phase-data,",40.0,3.949,0.63,126.0,mb,earthquake,2023,5
461657,5.10,,,,,,,reviewed,0,400,...,7000jx9j,",origin,phase-data,",66.0,0.682,0.59,121.0,mww,earthquake,2023,5


In [299]:
# Observamos las columnas, hay algunas que no aportan nada de información útil para el análisis
df_total.columns

Index(['mag', 'place', 'time', 'updated', 'tz', 'url', 'detail', 'felt', 'cdi',
       'mmi', 'alert', 'status', 'tsunami', 'sig', 'net', 'code', 'ids',
       'sources', 'types', 'nst', 'dmin', 'rms', 'gap', 'magType', 'type',
       'title', 'year', 'month'],
      dtype='object')

In [300]:
# Eliminamos las columnas que no utilizaremos
df_total.drop(['time', 'updated',"url","detail","ids","sources","title"],inplace=True,axis=1)

In [312]:
# Observamos
df_total.tail()

Unnamed: 0,mag,place,tz,felt,cdi,mmi,alert,status,tsunami,sig,...,code,types,nst,dmin,rms,gap,magType,type,year,month
461654,5.3,"82 km ESE of Katsuren-haebaru, Japan",,2.0,2.7,,,reviewed,0,433,...,7000jxaa,",dyfi,internal-moment-tensor,moment-tensor,ori...",74.0,0.909,0.83,64.0,mww,earthquake,2023,5
461655,5.1,"Ryukyu Islands, Japan",,,,,,reviewed,0,400,...,7000jxab,",origin,phase-data,",77.0,0.888,1.17,64.0,mb,earthquake,2023,5
461656,4.3,"45 km NNE of Klyuchi, Russia",,,,,,reviewed,0,284,...,7000jxa6,",origin,phase-data,",40.0,3.949,0.63,126.0,mb,earthquake,2023,5
461657,5.1,,,,,,,reviewed,0,400,...,7000jx9j,",origin,phase-data,",66.0,0.682,0.59,121.0,mww,earthquake,2023,5
461658,4.2,"66 km NNE of Bandar Abbas, Iran",,,,,,reviewed,0,271,...,7000jx9i,",origin,phase-data,",48.0,2.77,0.75,143.0,mb,earthquake,2023,5


In [337]:
# Exportamos
df_total.to_json("df_total.json")

---

# USA

In [303]:
estados2 = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California',
       'Colorado', 'Connecticut', 'Delaware', 'Florida', 'Georgia',
       'Hawai', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas',
       'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts',
       'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
       'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey',
       'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio',
       'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island',
       'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'Utah',
       'Vermont', 'Virginia', 'Washington', 'West Virginia', 'Wisconsin',
       'Wyoming']


In [315]:
df_total["place"].isna().sum()

0

In [314]:
df_total["place"].fillna("sin dato",inplace=True)

In [316]:
all_dfs2 = []
for i in estados2 : 
    all_dfs2.append(df_total[df_total["place"].str.contains(i)])

df_usa1 = pd.concat(all_dfs2, ignore_index=True)   

In [317]:
df_usa1

Unnamed: 0,mag,place,tz,felt,cdi,mmi,alert,status,tsunami,sig,...,code,types,nst,dmin,rms,gap,magType,type,year,month
0,4.5,"3 km NNW of Fayette, Alabama",,,,5.447,,reviewed,0,312,...,p0000brg,",origin,shakemap,trump-shakemap,",,,,,mb,earthquake,1975,6
1,4.2,"4 km SW of Brookwood, Alabama",,,,,,reviewed,0,271,...,p0002tkj,",impact-text,origin,phase-data,",,,0.80,,mb,rock burst,1986,5
2,4.8,"15 km NNW of Flomaton, Alabama",,,,6.012,,reviewed,0,354,...,p00089hq,",impact-text,origin,phase-data,shakemap,trump-...",,,0.98,,mb,earthquake,1997,10
3,4.8,"14 km WNW of North Johns, Alabama",,,,,,reviewed,0,354,...,p000918y,",impact-text,origin,phase-data,",,,1.00,,mb,rock burst,1999,1
4,4.6,"8 km S of Valley Head, Alabama",,17204.0,6.0,4.987,green,reviewed,0,926,...,605341,",associate,dyfi,focal-mechanism,impact-text,lo...",21.0,,0.07,154.0,mw,earthquake,2003,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22676,4.0,"23 km SE of Kelly, Wyoming",,153.0,3.4,4.370,green,reviewed,0,298,...,10004t1f,",cap,dyfi,impact-text,losspager,moment-tensor,...",,0.195,1.16,18.0,mwr,earthquake,2016,2
22677,4.8,"4 km ENE of Bondurant, Wyoming",,578.0,3.9,4.640,green,reviewed,0,580,...,20006umx,",cap,dyfi,impact-text,losspager,moment-tensor,...",,0.373,1.14,7.0,mwr,earthquake,2016,8
22678,4.0,"57 km S of Ten Sleep, Wyoming",,58.0,4.1,3.130,green,reviewed,0,270,...,2000bi6d,",dyfi,losspager,moment-tensor,origin,phase-dat...",,1.074,0.74,63.0,mwr,earthquake,2017,11
22679,4.0,"26 km ESE of Wright, Wyoming",,,,,,reviewed,0,246,...,6000exg9,",origin,phase-data,",,0.984,0.42,54.0,mb,mining explosion,2021,7


In [318]:
df_usa1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22681 entries, 0 to 22680
Data columns (total 21 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   mag      22681 non-null  float64
 1   place    22681 non-null  object 
 2   tz       1 non-null      float64
 3   felt     2714 non-null   float64
 4   cdi      2714 non-null   float64
 5   mmi      2733 non-null   float64
 6   alert    1422 non-null   object 
 7   status   22681 non-null  object 
 8   tsunami  22681 non-null  int64  
 9   sig      22681 non-null  int64  
 10  net      22681 non-null  object 
 11  code     22681 non-null  object 
 12  types    22681 non-null  object 
 13  nst      7120 non-null   float64
 14  dmin     5886 non-null   float64
 15  rms      18610 non-null  float64
 16  gap      10151 non-null  float64
 17  magType  22679 non-null  object 
 18  type     22681 non-null  object 
 19  year     22681 non-null  int64  
 20  month    22681 non-null  int64  
dtypes: float64(9

In [336]:
df_usa1["place"] = df_usa1["place"].str.strip()
df_usa1[["place1","place2","place3"]] = df_usa1["place"].str.split(",",expand=True)
df_usa1

Unnamed: 0,mag,place,tz,felt,cdi,mmi,alert,status,tsunami,sig,...,dmin,rms,gap,magType,type,year,month,place1,place2,place3
0,4.5,"3 km NNW of Fayette, Alabama",,,,5.447,,reviewed,0,312,...,,,,mb,earthquake,1975,6,3 km NNW of Fayette,Alabama,
1,4.2,"4 km SW of Brookwood, Alabama",,,,,,reviewed,0,271,...,,0.80,,mb,rock burst,1986,5,4 km SW of Brookwood,Alabama,
2,4.8,"15 km NNW of Flomaton, Alabama",,,,6.012,,reviewed,0,354,...,,0.98,,mb,earthquake,1997,10,15 km NNW of Flomaton,Alabama,
3,4.8,"14 km WNW of North Johns, Alabama",,,,,,reviewed,0,354,...,,1.00,,mb,rock burst,1999,1,14 km WNW of North Johns,Alabama,
4,4.6,"8 km S of Valley Head, Alabama",,17204.0,6.0,4.987,green,reviewed,0,926,...,,0.07,154.0,mw,earthquake,2003,4,8 km S of Valley Head,Alabama,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22676,4.0,"23 km SE of Kelly, Wyoming",,153.0,3.4,4.370,green,reviewed,0,298,...,0.195,1.16,18.0,mwr,earthquake,2016,2,23 km SE of Kelly,Wyoming,
22677,4.8,"4 km ENE of Bondurant, Wyoming",,578.0,3.9,4.640,green,reviewed,0,580,...,0.373,1.14,7.0,mwr,earthquake,2016,8,4 km ENE of Bondurant,Wyoming,
22678,4.0,"57 km S of Ten Sleep, Wyoming",,58.0,4.1,3.130,green,reviewed,0,270,...,1.074,0.74,63.0,mwr,earthquake,2017,11,57 km S of Ten Sleep,Wyoming,
22679,4.0,"26 km ESE of Wright, Wyoming",,,,,,reviewed,0,246,...,0.984,0.42,54.0,mb,mining explosion,2021,7,26 km ESE of Wright,Wyoming,


In [338]:
# Exportamos
df_usa1.to_json("Usa.json")

---

# Japón

In [334]:
df_japon = df_total[df_total["place"].str.contains("Japan")]
df_japon

Unnamed: 0,mag,place,tz,felt,cdi,mmi,alert,status,tsunami,sig,...,code,types,nst,dmin,rms,gap,magType,type,year,month
12,6.40,"45 km SSW of Obihiro, Japan",,,,6.419,,reviewed,0,630,...,800254,",origin,shakemap,trump-shakemap,",,,,,mw,earthquake,1970,1
77,5.53,"Volcano Islands, Japan region",,,,,,reviewed,0,470,...,799066,",origin,",,,,,mw,earthquake,1970,2
107,5.54,"67 km NE of Naze, Japan",,,,,,reviewed,0,472,...,798599,",origin,",,,,,mw,earthquake,1970,3
127,5.70,"101 km WSW of Kurio, Japan",,,,,,reviewed,0,500,...,798320,",origin,",,,,,mw,earthquake,1970,3
129,5.70,"4 km WSW of Takanosu, Japan",,,,3.951,,reviewed,0,500,...,798303,",origin,shakemap,",,,,,mw,earthquake,1970,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
461650,5.10,"75 km ESE of Katsuren-haebaru, Japan",,,,,,reviewed,0,400,...,7000jxal,",origin,phase-data,",58.0,0.883,0.99,79.0,mww,earthquake,2023,5
461651,5.00,"81 km SE of Taira, Japan",,,,,,reviewed,0,385,...,7000jxaj,",origin,phase-data,",44.0,0.845,0.95,78.0,mb,earthquake,2023,5
461652,5.90,"79 km ESE of Katsuren-haebaru, Japan",,10.0,3.1,3.841,green,reviewed,0,539,...,7000jxag,",dyfi,internal-moment-tensor,internal-origin,l...",76.0,0.871,0.75,57.0,mww,earthquake,2023,5
461654,5.30,"82 km ESE of Katsuren-haebaru, Japan",,2.0,2.7,,,reviewed,0,433,...,7000jxaa,",dyfi,internal-moment-tensor,moment-tensor,ori...",74.0,0.909,0.83,64.0,mww,earthquake,2023,5


In [339]:
df_japon.to_json("Japón.json")

---

# México

In [329]:
# Dataset con magnitudes mayores a 4.0 y de 1970 a 2023
df_mexico = pd.read_json("Datasets/sismos_méxico.json")
# Obervamos
df_mexico

Unnamed: 0,Fecha,Hora,Magnitud,Latitud,Longitud,Profundidad,Referencia de localizacion,Fecha UTC,Hora UTC,Estatus
0,1970-02-03,23:08:50,6.6,15.524,-99.493,21.0,"141 km al SUR de SAN MARCOS, GRO",1970-02-04,05:08:50,revisado
1,1970-04-29,08:01:34,7.3,14.463,-92.683,44.0,"62 km al SUROESTE de CD HIDALGO, CHIS",1970-04-29,14:01:34,revisado
2,1970-04-30,02:32:58,6.5,14.608,-93.260,22.0,"100 km al SUROESTE de MAPASTEPEC, CHIS",1970-04-30,08:32:58,revisado
3,1971-09-30,02:18:00,6.5,26.880,-110.800,14.0,"92 km al SUROESTE de PUEBLO YAQUI, SON",1971-09-30,08:18:00,revisado
4,1972-10-20,02:17:46,6.6,18.700,-106.756,10.0,"238 km al OESTE de CIHUATLAN, JAL",1972-10-20,08:17:46,revisado
...,...,...,...,...,...,...,...,...,...,...
42665,2023-05-03,21:17:37,4.0,19.350,-103.870,31.0,"17 km al NOROESTE de VILLA DE ALVAREZ, COL",2023-05-04,03:17:37,verificado
42666,2023-05-03,21:39:31,4.2,16.910,-95.040,106.0,"4 km al NORTE de MATIAS ROMERO, OAX",2023-05-04,03:39:31,verificado
42667,2023-05-04,03:19:06,4.0,15.430,-92.020,212.4,"26 km al NORESTE de MOTOZINTLA, CHIS",2023-05-04,09:19:06,verificado
42668,2023-05-04,04:42:31,4.1,14.880,-94.420,16.1,"151 km al SUROESTE de TONALA, CHIS",2023-05-04,10:42:31,verificado


In [330]:
# Separamos "referencia de localizacion"
df_mexico[["Distancia","Estado"]] = df_mexico["Referencia de localizacion"].str.split(",",expand=True)
df_mexico.sample(3)

Unnamed: 0,Fecha,Hora,Magnitud,Latitud,Longitud,Profundidad,Referencia de localizacion,Fecha UTC,Hora UTC,Estatus,Distancia,Estado
36479,2020-11-05,23:41:27,4.1,17.3467,-100.691,7.6,"15 km al NOROESTE de TECPAN, GRO",2020-11-06,05:41:27,revisado,15 km al NOROESTE de TECPAN,GRO
37654,2021-04-11,11:20:07,4.9,18.381,-103.207,52.3,"44 km al SUR de COALCOMAN, MICH",2021-04-11,16:20:07,revisado,44 km al SUR de COALCOMAN,MICH
23149,2017-02-10,21:29:40,4.3,18.4107,-102.591,56.2,"50 km al NOROESTE de LA MIRA, MICH",2017-02-11,03:29:40,revisado,50 km al NOROESTE de LA MIRA,MICH


In [340]:
# Exportamos a formato json
df_mexico.to_json("México.json")

---

# Requerimientos de Data Analysts

---

+ ### KPI 1 

In [513]:
# usar los datasets de la carpeta KPI1
seg_mex = pd.read_parquet("Datasets\KPI 1\seguros_mex.parquet")
seg_mex.pop("level_0")
# Observamos
seg_mex


Unnamed: 0,AñO,MONEDA,ZONA SISMICA,TIPO DE SEGURO,SUBTIPO DE SEGURO,NUMERO DE PISOS,NUMERO DE UBICACIONES,PRIMA EMITIDA,PRIMA RETENIDA,PRIMA DEVENGADA,COMISION DIRECTA,VALORES TOTALES EDIFICIO,VALORES TOTALES CONTENIDOS,VALORES TOTALES PeERDIDAS CONSECUENCIALES,LIMITE MAXIMO DE RESPONSABILIDAD
0,2019,Nacional,A_,Terremoto,Otro,1,1,0,0,63,0,0,655000,0,655000
1,2019,Nacional,B_,Terremoto,Otro,2,2,383,262,382,19,0,1500000,120000,1620000
2,2019,Nacional,C_,Terremoto,Otro,1,3,3511,2336,2970,175,5000000,4000000,0,9000000
3,2019,Nacional,C_,Terremoto,Otro,2,9,2895,1790,2883,146,2859280,8600000,1150000,12609280
4,2019,Nacional,B_,Terremoto,Otro,1,17,6135,331,79334,305,890890000,245448050,81172500,1217510550
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
321787,2021,Extranjera,B_,Terremoto,Microseguro,1,1,34337,343,10724,1030,0,306600000,29199991,335799991
321788,2021,Extranjera,C_,Terremoto,Microseguro,1,1,4,0,2,0,0,40880,4088,44968
321789,2021,Extranjera,B_,Terremoto,Microseguro,1,2,7463,-101,4110,224,0,83804000,8380400,92184400
321790,2021,Extranjera,B1,Terremoto,Microseguro,1,4,5809,267,3199,174,0,59547368,7998737,67546105


---

+ ### KPI 2

In [None]:
# si es posible hacer + web scrapping + de wikipedia para conseguir la data, si no CSV (USA  y Japon)

# USA : https://en.wikipedia.org/wiki/List_of_earthquakes_in_the_United_States

# Japon : https://en.wikipedia.org/wiki/List_of_earthquakes_in_Japan

# Mexico : https://www.worlddata.info/america/mexico/earthquakes.php

In [314]:
# Definimos una función para extraer la fecha en formato YYYY-MM-DD
def clean_date(date_string):
    # Expresión regular para extraer la fecha
    date_pattern = r'([a-zA-Z]+) (\d{1,2}), (\d{4})'
    
    # Buscamos la fecha en el texto usando la expresión regular
    match = re.search(date_pattern, date_string)
    
    # Si encontramos una coincidencia, creamos la fecha en formato YYYY-MM-DD
    if match:
        month = match.group(1)
        day = match.group(2)
        year = match.group(3)
        
        # Convertimos el mes a número usando un diccionario
        month_dict = {'January': '01',
                      'February': '02',
                      'March': '03',
                      'April': '04',
                      'May': '05',
                      'June': '06',
                      'July': '07',
                      'August': '08',
                      'September': '09',
                      'October': '10',
                      'November': '11',
                      'December': '12'}
        
        month_number = month_dict[month]
        
        return f"{year}-{month_number}-{day}"
    
    # Si no encontramos una coincidencia, devolvemos un valor vacío
    else:
        return ''

In [414]:
# Mexico

url = 'https://www.worlddata.info/america/mexico/earthquakes.php'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')
seccion = soup.find('table', {'class': 'std100 hover'})
rows = seccion.find_all("tr")

data_mex = []
for row in rows[1:]: # Saltamos la primera fila que contiene los encabezados de la tabla
    cols = row.find_all('td')
    date = cols[0].text.strip()
    location = cols[1].text.strip()
    depth = cols[2].text.strip()
    magnitude = cols[3].text.strip()
    deaths = cols[4].text.strip()
    data_mex.append([date, location, depth, magnitude, deaths])

# Crear dataframe
df_mex = pd.DataFrame(data_mex, columns=["date", "location", "depth", "magnitude", "deaths"])
# Mostrar el dataframe
# Hay que normalizar los 3 Dataframes para posteriormente unirlos en 1 sola tabla

# Colocamos el id de Mexico = 2
df_mex["idCountry"] = 3
# Colocamos correcto tipo en date
df_mex['date'] = pd.to_datetime(df_mex['date'], format='%m/%d/%Y')

# Reordenamos
df_mex = df_mex[["idCountry","date","location","magnitude","deaths"]]

df_mex

Unnamed: 0,idCountry,date,location,magnitude,deaths
0,3,2022-09-22,"Mexico City, Michoacan",6.8,3
1,3,2022-09-19,"Michoacan, Colima, Jalisco",7.6,2
2,3,2022-05-25,Oaxaca,5.5,0
3,3,2021-09-08,Guerrero,7.0,3
4,3,2020-06-23,Oaxaca,7.4,10
...,...,...,...,...,...
61,3,1956-01-08,Guerrero,6.5,0
62,3,1954-02-05,Chiapas,6.2,6
63,3,1951-07-09,Oaxaca (Miahiatlan),6.2,1
64,3,1950-12-14,Acapulco,7.5,0


In [430]:
# Japon

url = 'https://en.wikipedia.org/wiki/List_of_earthquakes_in_Japan'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

seccion = soup.find('table', {'id': 'stripe'})
rows = seccion.find_all("tr")

data_jap = []
for row in rows[1:]: # Saltamos la primera fila que contiene los encabezados de la tabla
    cols = row.find_all('td')
    date = cols[0].text.strip()
    magnitude = cols[1].text.strip()
    deaths = cols[2].text.strip()
    name = cols[3].text.strip()
    data_jap.append([date, magnitude, deaths, name])

# Crear dataframe
df_jap = pd.DataFrame(data_jap, columns=["date", "magnitude", "deaths", "location"])
# Mostrar el dataframe
df_jap["idCountry"] = 2

# Aplicamos la función a la columna date y creamos una nueva columna "clean_date"
df_jap['date'] = df_jap['date'].apply(clean_date)
df_jap["date"].replace("",None,inplace=True)
# Reordenamos
df_jap = df_jap[["idCountry","date","location","magnitude","deaths"]]

# Observamos
df_jap



Unnamed: 0,idCountry,date,location,magnitude,deaths
0,2,,684 Hakuho earthquake,8.4 MK (Kawasumi scale)[6],"101–1,000"
1,2,,occurred at Minoh,7.9 MK,
2,2,,869 Sanriku earthquake,8.9 MK,"1,000+[9]"
3,2,1293-05-27,1293 Kamakura earthquake,7.1 Ms,"23,024[11]"
4,2,1361-08-3,1361 Shōhei earthquake,8.4 Ms,
...,...,...,...,...,...
90,2,2021-02-13,2021 Fukushima earthquake,7.1 Mw,1
91,2,2021-03-20,March 2021 Miyagi earthquake,7.0 Mw,0
92,2,2021-10-7,2021 Chiba earthquake,5.9 MW,0
93,2,2022-03-16,2022 Fukushima earthquake,7.3 MW,4


In [429]:
# USA

url = 'https://en.wikipedia.org/wiki/List_of_earthquakes_in_the_United_States'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')


seccion = soup.find('div', {'class': 'mw-parser-output'})
rows = seccion.find_all("tr")

data_usa = []
for row in rows[1:159]: 
    cols = row.find_all('td')
    date = cols[0].text.strip()
    location = cols[1].text.strip()
    magnitude = cols[2].text.strip()
    deaths = cols[3].text.strip()
    data_usa.append([date, location, magnitude, deaths])

# Crear dataframe
df_usa = pd.DataFrame(data_usa, columns=["date", "location", "magnitude", "deaths"])
# Mostrar el dataframe
df_usa["idCountry"] = 1
# Aplicamos la función a la columna date y creamos una nueva columna "clean_date"
df_usa['date'] = df_usa['date'].apply(clean_date)
# Reordenamos
df_usa = df_usa[["idCountry","date","location","magnitude","deaths"]]

df_usa


Unnamed: 0,idCountry,date,location,magnitude,deaths
0,1,1585-06-11,"Aleutian Islands, Alaska",9.2 Mw,Unknown
1,1,1700-01-26,"Washington, Oregon, California",8.7–9.2 Mw,Unknown
2,1,1755-11-18,Massachusetts,5.9 Mw,0
3,1,1788-07-21,Alaska,8.0 Ms,Unknown
4,1,1788-08-6,Alaska,8.0 Ms,Unknown
...,...,...,...,...,...
153,1,2020-07-22,Alaska,7.8 Mw,0
154,1,2020-08-9,North Carolina,5.1 Mw,0
155,1,2020-10-19,Alaska,7.6 Mww,0
156,1,2021-07-29,Alaska,8.2 Mw,0


In [453]:
# Combinamos los 3 Dataframes para crear una tabla única de fatalidades
df_fatalidades = pd.concat([df_usa, df_jap, df_mex]).reset_index()
# Eliminamos el índice
df_fatalidades.pop("index")
# Observamos
df_fatalidades


Unnamed: 0,idCountry,date,location,magnitude,deaths
0,1,1585-06-11,"Aleutian Islands, Alaska",9.2 Mw,Unknown
1,1,1700-01-26,"Washington, Oregon, California",8.7–9.2 Mw,Unknown
2,1,1755-11-18,Massachusetts,5.9 Mw,0
3,1,1788-07-21,Alaska,8.0 Ms,Unknown
4,1,1788-08-6,Alaska,8.0 Ms,Unknown
...,...,...,...,...,...
314,3,1956-01-08 00:00:00,Guerrero,6.5,0
315,3,1954-02-05 00:00:00,Chiapas,6.2,6
316,3,1951-07-09 00:00:00,Oaxaca (Miahiatlan),6.2,1
317,3,1950-12-14 00:00:00,Acapulco,7.5,0


In [454]:
# Exportamos
df_fatalidades.to_json("Datasets/KPI 2/Fatalidades.json")

---

+ ### KPI 3

In [None]:
# Web scrapping de estas paginas 

# https://www.shakeout.org/glb_participants.php?year=2022&start=All ( de acá todo)

# https://www.shakeout.org/participants.php?year=2022 ( de acá solo mexico y category)

#No se pudo

In [204]:
# Librerias a utilizar
from selenium import webdriver
from bs4 import BeautifulSoup
from functools import reduce
import pandas as pd

lista = []
i = 2015

url = "https://www.shakeout.org/glb_participants.php?year=2022&start=All"
    # Cargar la página con Selenium
driver = webdriver.Chrome()
driver.get(url)

    # Esperar a que la página se cargue por completo
driver.implicitly_wait(10)

    # Capturar el contenido de la tabla con BeautifulSoup
content = driver.page_source
soup = BeautifulSoup(content, 'lxml')
table = soup.find_all("table")

driver.quit()

rows = table[0].find_all('tr')

rows = str(rows)

soup = BeautifulSoup(rows, 'html.parser')

print(soup)

[<tr><td style="vertical-align: top">
<table border="0" style="margin-left: 43px; margin-right: 64px;">
<tbody><tr><td align="left" colspan="2"><b>2022 participation by category (excluding Japan and NZ)</b></td></tr><tr><td align="center" colspan="3" valign="top"><hr size="1"/></td></tr>
<script language="JavaScript">

document.write(createCatNewRow("Individuals/Families", "individuals"));
document.write(createCatNewRow("Childcare and Pre-Schools", "childcare"));
document.write(createCatNewRow("K-12 Schools and Districts", "schools"));
document.write(createCatNewRow("Colleges and Universities", "colleges"));
document.write(createCatNewRow("Local Government", "local"));
document.write(createCatNewRow("State Government", "state"));
document.write(createCatNewRow("Federal/National Government (Including Military)", "federal"));
document.write(createCatNewRow("Tribes/Indigenous Peoples", "tribes"));
document.write(createCatNewRow("Businesses", "businesses"));
document.write(createCatNewRow(

In [213]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup

# Configuración del webdriver
service = Service("C:\\webdrivers\\chromedriver.exe")
driver = webdriver.Chrome(service=service)

try:
    # Carga la página
    driver.get("https://www.w3schools.com/html/html_tables.asp")

    # Espera a que la página cargue completamente
    driver.implicitly_wait(10)

    # Extrae la tabla
    table = None
    while table is None:
        soup = BeautifulSoup(driver.page_source, "html.parser")
        table = soup.find("table", {"id": "customers"})

    # Extrae los datos de la tabla
    rows = table.find_all("tr")
    for row in rows:
        cols = row.find_all("td")
        for col in cols:
            print(col.text)

except KeyboardInterrupt:
    print("Programa interrumpido por el usuario")

finally:
    driver.quit()


Alfreds Futterkiste
Maria Anders
Germany
Centro comercial Moctezuma
Francisco Chang
Mexico
Ernst Handel
Roland Mendel
Austria
Island Trading
Helen Bennett
UK
Laughing Bacchus Winecellars
Yoshi Tannamuri
Canada
Magazzini Alimentari Riuniti
Giovanni Rovelli
Italy


In [216]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

# Inicializar el servicio del navegador Chrome
service = Service("C:\\webdrivers\\chromedriver.exe")
driver = webdriver.Chrome(service=service)

# Hacer una solicitud HTTP a la página web que contiene la tabla
url = "https://www.shakeout.org/glb_participants.php?year=2022&start=All"
driver.get(url)

# Esperar a que la página cargue completamente
time.sleep(5)

    # Capturar el contenido de la tabla con BeautifulSoup
content = driver.page_source
print(content)
soup = BeautifulSoup(content, 'lxml')
table = soup.find_all("div",{"id:" :"dataArea"})
# Encontrar el elemento HTML que contiene la tabla
print(table)



# Cerrar el navegador
driver.quit()


<html xmlns="http://www.w3.org/1999/xhtml" style="height:100%" prefix="og: http://ogp.me/ns/website#"><head>
<title>Great ShakeOut Earthquake Drills - 2022 Global Participant Totals</title>
  
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<meta name="gmapkey" content="ABQIAAAAOVxjJ1Bff3b9SF1ClQvORxRXDewq9uUSVuRBNHMwkFHLvZO2rRR2nezmWhuXPdQgz7hA_OYIh0EtPw">

<meta property="og:description" content="Great ShakeOut Earthquake Drills help people in homes, schools, and organizations worldwide improve preparedness and practice how to be safe during earthquakes. Register to participate in your state or country's drill at www.ShakeOut.org.">
<meta property="og:title" content="Great ShakeOut Earthquake Drills">
<meta property="og:type" content="website">
<!--meta property="og:url" content="https://www.shakeout.org" -->
<meta property="og:image" content="http://www.shakeout.org/images/shakeoutfb.jpg">
	
<link rel="shortcut icon" href="https://www.shakeout.org/images/favicon.

In [219]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup
import time

# Inicializar el servicio del navegador Chrome
service = Service("C:\\webdrivers\\chromedriver.exe")
driver = webdriver.Chrome(service=service)
# Hacer una solicitud HTTP a la página web que contiene la tabla
url = "https://www.shakeout.org/glb_participants.php?year=2022&start=All"
driver.get(url)

# Esperar a que la página cargue completamente
time.sleep(5)

# Capturar el contenido de la tabla con BeautifulSoup
soup = BeautifulSoup(driver.page_source, 'html.parser')
table = soup.find("table", {"border": "0"})

# Encontrar el elemento HTML que contiene la tabla
print(table)

# Cerrar el navegador
driver.quit()


<table border="0" cellpadding="0" cellspacing="0">
<tbody><tr><td style="vertical-align: top">
<table border="0" style="margin-left: 43px; margin-right: 64px;">
<tbody><tr><td align="left" colspan="2"><b>2022 participation by category (excluding Japan and NZ)</b></td></tr><tr><td align="center" colspan="3" valign="top"><hr size="1"/></td></tr>
<script language="JavaScript">

document.write(createCatNewRow("Individuals/Families", "individuals"));
document.write(createCatNewRow("Childcare and Pre-Schools", "childcare"));
document.write(createCatNewRow("K-12 Schools and Districts", "schools"));
document.write(createCatNewRow("Colleges and Universities", "colleges"));
document.write(createCatNewRow("Local Government", "local"));
document.write(createCatNewRow("State Government", "state"));
document.write(createCatNewRow("Federal/National Government (Including Military)", "federal"));
document.write(createCatNewRow("Tribes/Indigenous Peoples", "tribes"));
document.write(createCatNewRow("Bus

---

+ ### KPI 4 

In [None]:
# usar tambien datasets de kpi 1

---

+ ### KPI 5

In [None]:
# https://data.worldbank.org/indicator/NY.GDP.MKTP.CD?locations=US-JP-MX

In [147]:
# Ingestaremos los datos a través de una URL
url = "https://api.worldbank.org/v2/en/indicator/NY.GDP.MKTP.CD?downloadformat=excel"
df_gdp = pd.read_excel(url)
# Observamos, se puede evidenciar que los nombres de las columnas estan mal posicionados
df_gdp.head()

Unnamed: 0,Data Source,World Development Indicators,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65
0,Last Updated Date,2023-03-30 00:00:00,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,Country Name,Country Code,Indicator Name,Indicator Code,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,...,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
3,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2615084000.0,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0,3126019000.0
4,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,21291520000.0,21809440000.0,23708060000.0,28211280000.0,26119940000.0,29683480000.0,...,972002200000.0,982677100000.0,1003403000000.0,923143900000.0,889859300000.0,1030482000000.0,1016697000000.0,1009052000000.0,934179100000.0,1089454000000.0


In [148]:
# Corregimos las columnas
df_gdp.columns = df_gdp.iloc[2]
# Eliminamos las filas
df_gdp = df_gdp.drop([0,1,2]) 
# Reiniciamos el índice
df_gdp = df_gdp.reset_index()
# Como podemos observar nos da datos de todos los países y solo precisamos Usa, Japón y México para este análisis
df_gdp.head(3)

2,index,Country Name,Country Code,Indicator Name,Indicator Code,1960.0,1961.0,1962.0,1963.0,1964.0,...,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
0,3,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,...,2615084000.0,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0,3126019000.0
1,4,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,21291520000.0,21809440000.0,23708060000.0,28211280000.0,26119940000.0,...,972002200000.0,982677100000.0,1003403000000.0,923143900000.0,889859300000.0,1030482000000.0,1016697000000.0,1009052000000.0,934179100000.0,1089454000000.0
2,5,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,...,20203570000.0,20564490000.0,20550580000.0,19998160000.0,18019560000.0,18896350000.0,18418850000.0,18904490000.0,20143440000.0,14786860000.0


In [149]:
# Filtramos los países de interés para nuestro análisis
df_gdp = df_gdp.loc[df_gdp["Country Name"].isin(["United States", "Japan", "Mexico"])].reset_index()
# Observamos
df_gdp

2,level_0,index,Country Name,Country Code,Indicator Name,Indicator Code,1960.0,1961.0,1962.0,1963.0,...,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
0,119,122,Japan,JPN,GDP (current US$),NY.GDP.MKTP.CD,44307340000.0,53508620000.0,60723020000.0,69498130000.0,...,6272363000000.0,5212328000000.0,4896994000000.0,4444931000000.0,5003678000000.0,4930837000000.0,5037835000000.0,5123318000000.0,5040108000000.0,4940878000000.0
1,154,157,Mexico,MEX,GDP (current US$),NY.GDP.MKTP.CD,13040000000.0,14160000000.0,15200000000.0,16960000000.0,...,1201090000000.0,1274443000000.0,1315351000000.0,1171868000000.0,1078491000000.0,1158913000000.0,1222408000000.0,1269012000000.0,1090515000000.0,1272839000000.0
2,251,254,United States,USA,GDP (current US$),NY.GDP.MKTP.CD,543300000000.0,563300000000.0,605100000000.0,638600000000.0,...,16253970000000.0,16843190000000.0,17550680000000.0,18206020000000.0,18695110000000.0,19477340000000.0,20533060000000.0,21380980000000.0,21060470000000.0,23315080000000.0


In [150]:
# Eliminamos las columnas que no son de utilidad
df_gdp.drop(["level_0","index","Indicator Code","Indicator Name"],axis=1,inplace=True)
# Observamos
df_gdp

2,Country Name,Country Code,1960.0,1961.0,1962.0,1963.0,1964.0,1965.0,1966.0,1967.0,...,2012.0,2013.0,2014.0,2015.0,2016.0,2017.0,2018.0,2019.0,2020.0,2021.0
0,Japan,JPN,44307340000.0,53508620000.0,60723020000.0,69498130000.0,81749010000.0,90950280000.0,105628100000.0,123781900000.0,...,6272363000000.0,5212328000000.0,4896994000000.0,4444931000000.0,5003678000000.0,4930837000000.0,5037835000000.0,5123318000000.0,5040108000000.0,4940878000000.0
1,Mexico,MEX,13040000000.0,14160000000.0,15200000000.0,16960000000.0,20080000000.0,21840000000.0,24320000000.0,26560000000.0,...,1201090000000.0,1274443000000.0,1315351000000.0,1171868000000.0,1078491000000.0,1158913000000.0,1222408000000.0,1269012000000.0,1090515000000.0,1272839000000.0
2,United States,USA,543300000000.0,563300000000.0,605100000000.0,638600000000.0,685800000000.0,743700000000.0,815000000000.0,861700000000.0,...,16253970000000.0,16843190000000.0,17550680000000.0,18206020000000.0,18695110000000.0,19477340000000.0,20533060000000.0,21380980000000.0,21060470000000.0,23315080000000.0


In [151]:
# Hay muchas columnas, podemos aplicar un pandas.melt para solucionarlo
df_gdp = df_gdp.melt(["Country Name","Country Code"],value_name="gdp(current US$)")
# Observamos 
df_gdp

Unnamed: 0,Country Name,Country Code,2,gdp(current US$)
0,Japan,JPN,1960.0,4.430734e+10
1,Mexico,MEX,1960.0,1.304000e+10
2,United States,USA,1960.0,5.433000e+11
3,Japan,JPN,1961.0,5.350862e+10
4,Mexico,MEX,1961.0,1.416000e+10
...,...,...,...,...
181,Mexico,MEX,2020.0,1.090515e+12
182,United States,USA,2020.0,2.106047e+13
183,Japan,JPN,2021.0,4.940878e+12
184,Mexico,MEX,2021.0,1.272839e+12


In [152]:
# Normalizamos los nombres de las columnas
df_gdp.rename(columns={"Country Name" : "country", "Country Code" : "code", "Indicator Name" : "indicator", 2 : "year"},inplace=True)
# Agregamos el ID para cada país
df_gdp["idCountry"] = df_gdp["country"].replace({'United States': 1, 'Japan': 2, 'Mexico': 3})
# Observamos
df_gdp

Unnamed: 0,country,code,year,gdp(current US$),idCountry
0,Japan,JPN,1960.0,4.430734e+10,2
1,Mexico,MEX,1960.0,1.304000e+10,3
2,United States,USA,1960.0,5.433000e+11,1
3,Japan,JPN,1961.0,5.350862e+10,2
4,Mexico,MEX,1961.0,1.416000e+10,3
...,...,...,...,...,...
181,Mexico,MEX,2020.0,1.090515e+12,3
182,United States,USA,2020.0,2.106047e+13,1
183,Japan,JPN,2021.0,4.940878e+12,2
184,Mexico,MEX,2021.0,1.272839e+12,3


In [153]:
# Borramos la columna country, para garantizar una forma normal
df_gdp.pop("country")
# Reordenamos
df_gdp = df_gdp[["idCountry","code","year","gdp(current US$)"]]
# Observamos
df_gdp

Unnamed: 0,idCountry,code,year,gdp(current US$)
0,2,JPN,1960.0,4.430734e+10
1,3,MEX,1960.0,1.304000e+10
2,1,USA,1960.0,5.433000e+11
3,2,JPN,1961.0,5.350862e+10
4,3,MEX,1961.0,1.416000e+10
...,...,...,...,...
181,3,MEX,2020.0,1.090515e+12
182,1,USA,2020.0,2.106047e+13
183,2,JPN,2021.0,4.940878e+12
184,3,MEX,2021.0,1.272839e+12


In [154]:
df_gdp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 186 entries, 0 to 185
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   idCountry         186 non-null    int64  
 1   code              186 non-null    object 
 2   year              186 non-null    object 
 3   gdp(current US$)  186 non-null    float64
dtypes: float64(1), int64(1), object(2)
memory usage: 5.9+ KB


In [155]:
# Colocamos el tipo de dato correcto
df_gdp['idCountry'] = df_gdp['idCountry'].astype('int64')
df_gdp['year'] = df_gdp['year'].astype('int64')
df_gdp['gdp(current US$)'] = df_gdp['gdp(current US$)'].astype('float64')
# Observamos
df_gdp

Unnamed: 0,idCountry,code,year,gdp(current US$)
0,2,JPN,1960,4.430734e+10
1,3,MEX,1960,1.304000e+10
2,1,USA,1960,5.433000e+11
3,2,JPN,1961,5.350862e+10
4,3,MEX,1961,1.416000e+10
...,...,...,...,...
181,3,MEX,2020,1.090515e+12
182,1,USA,2020,2.106047e+13
183,2,JPN,2021,4.940878e+12
184,3,MEX,2021,1.272839e+12


In [447]:
# Exportamos a formato JSON para su posterior análisis
df_gdp.to_json("Datasets/KPI 5/GDP (1960 - 2021).json")

---