# Ibtracks check

Fabiana Castiblanco

http://github.com/fabi-cast

As some inconsistencies were found after a thorough inspection of our data, we decided to extract further information from our core data set.

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import re
import math
import requests
import json
import matplotlib
from pandas import ExcelFile
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
from datetime import datetime
import geopandas as gp
import reverse_geocoder as rg
import geopy
import ast
import statsmodels.api as sm
import pickle
from sklearn.preprocessing import OneHotEncoder
pd.set_option('display.max_columns', None)
import sklearn
matplotlib.rc('image', cmap='Set3')
from plotly import graph_objects as go
import plotly.express as px
from scicolorscales import *
from plotly.subplots import make_subplots


# 1. Import dataset and setting data types

In [2]:
ibtracs = pd.read_csv("Data_input/ibtracs_fundamental.csv", low_memory = False).drop(columns = "Unnamed: 0")

In [3]:
def null_cols(data):

    """
    This function takes a dataframe df and shows the columns of df that have NaN values
    and the number of them

    """
    nulls = data.isna().sum()
    nulls = nulls[nulls > 0] / len(data) * 100
    nulls = pd.DataFrame(nulls.sort_values(ascending = False))
    nulls = nulls.reset_index().rename(columns = {"index": "Feature", 0: "% Null rows"})
    return nulls


In [4]:
ibtracs = ibtracs[['SID', 'SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'ISO', 'NAME', 'ISO_TIME', 'NATURE', 'LAT', 'LON', 'COORDS', 'WMO_WIND', 'WMO_PRES', 'WMO_AGENCY', 'TRACK_TYPE', 'DIST2LAND', 'LANDFALL', 'IFLAG', 'USA_AGENCY', 'USA_ATCF_ID', 'USA_LAT', 'USA_LON', 'USA_RECORD', 'USA_STATUS', 'USA_WIND', 'USA_PRES', 'USA_SSHS', 'USA_R34_NE', 'USA_R34_SE', 'USA_R34_SW', 'USA_R34_NW', 'USA_R50_NE', 'USA_R50_SE', 'USA_R50_SW', 'USA_R50_NW', 'USA_R64_NE', 'USA_R64_SE', 'USA_R64_SW', 'USA_R64_NW', 'USA_POCI', 'USA_ROCI', 'USA_RMW', 'USA_EYE', 'TOKYO_LAT', 'TOKYO_LON', 'TOKYO_GRADE', 'TOKYO_WIND', 'TOKYO_PRES', 'TOKYO_R50_DIR', 'TOKYO_R50_LONG', 'TOKYO_R50_SHORT', 'TOKYO_R30_DIR', 'TOKYO_R30_LONG', 'TOKYO_R30_SHORT', 'TOKYO_LAND', 'CMA_LAT', 'CMA_LON', 'CMA_CAT', 'CMA_WIND', 'CMA_PRES', 'HKO_LAT', 'HKO_LON', 'HKO_CAT', 'HKO_WIND', 'HKO_PRES', 'NEWDELHI_LAT', 'NEWDELHI_LON', 'NEWDELHI_GRADE', 'NEWDELHI_WIND', 'NEWDELHI_PRES', 'NEWDELHI_CI', 'NEWDELHI_DP', 'NEWDELHI_POCI', 'REUNION_LAT', 'REUNION_LON', 'REUNION_TYPE', 'REUNION_WIND', 'REUNION_PRES', 'REUNION_TNUM', 'REUNION_CI', 'REUNION_RMW', 'REUNION_R34_NE', 'REUNION_R34_SE', 'REUNION_R34_SW', 'REUNION_R34_NW', 'REUNION_R50_NE', 'REUNION_R50_SE', 'REUNION_R50_SW', 'REUNION_R50_NW', 'REUNION_R64_NE', 'REUNION_R64_SE', 'REUNION_R64_SW', 'REUNION_R64_NW', 'BOM_LAT', 'BOM_LON', 'BOM_TYPE', 'BOM_WIND', 'BOM_PRES', 'BOM_TNUM', 'BOM_CI', 'BOM_RMW', 'BOM_R34_NE', 'BOM_R34_SE', 'BOM_R34_SW', 'BOM_R34_NW', 'BOM_R50_NE', 'BOM_R50_SE', 'BOM_R50_SW', 'BOM_R50_NW', 'BOM_R64_NE', 'BOM_R64_SE', 'BOM_R64_SW', 'BOM_R64_NW', 'BOM_ROCI', 'BOM_POCI', 'BOM_EYE', 'BOM_POS_METHOD', 'BOM_PRES_METHOD', 'NADI_LAT', 'NADI_LON', 'NADI_CAT', 'NADI_WIND', 'NADI_PRES', 'WELLINGTON_LAT', 'WELLINGTON_LON', 'WELLINGTON_WIND', 'WELLINGTON_PRES', 'DS824_LAT', 'DS824_LON', 'DS824_STAGE', 'DS824_WIND', 'DS824_PRES', 'TD9636_LAT', 'TD9636_LON', 'TD9636_STAGE', 'TD9636_WIND', 'TD9636_PRES', 'TD9635_LAT', 'TD9635_LON', 'TD9635_WIND', 'TD9635_PRES', 'TD9635_ROCI', 'NEUMANN_LAT', 'NEUMANN_LON', 'NEUMANN_CLASS', 'NEUMANN_WIND', 'NEUMANN_PRES', 'MLC_LAT', 'MLC_LON', 'MLC_CLASS', 'MLC_WIND', 'MLC_PRES', 'USA_GUST', 'BOM_GUST', 'BOM_GUST_PER', 'REUNION_GUST', 'REUNION_GUST_PER', 'USA_SEAHGT', 'USA_SEARAD_NE', 'USA_SEARAD_SE', 'USA_SEARAD_SW', 'USA_SEARAD_NW', 'STORM_SPEED', 'STORM_DIR']]

We should change the data types in our dataset:

In [5]:
for col in ["LAT", "LON", "DIST2LAND", "LANDFALL", "STORM_SPEED", "STORM_DIR"]:
    ibtracs[col] = ibtracs[col].apply(lambda x: x if x!= " " else 0)
    ibtracs[col] = pd.to_numeric(ibtracs[col])

ibtracs["ISO_TIME"] = pd.to_datetime(ibtracs["ISO_TIME"])

In [6]:
ibtracs.dtypes

SID              object
SEASON            int64
NUMBER            int64
BASIN            object
SUBBASIN         object
                  ...  
USA_SEARAD_SE    object
USA_SEARAD_SW    object
USA_SEARAD_NW    object
STORM_SPEED       int64
STORM_DIR         int64
Length: 165, dtype: object

In [7]:
ibtracs.head()

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,ISO,NAME,ISO_TIME,NATURE,LAT,LON,COORDS,WMO_WIND,WMO_PRES,WMO_AGENCY,TRACK_TYPE,DIST2LAND,LANDFALL,IFLAG,USA_AGENCY,USA_ATCF_ID,USA_LAT,USA_LON,USA_RECORD,USA_STATUS,USA_WIND,USA_PRES,USA_SSHS,USA_R34_NE,USA_R34_SE,USA_R34_SW,USA_R34_NW,USA_R50_NE,USA_R50_SE,USA_R50_SW,USA_R50_NW,USA_R64_NE,USA_R64_SE,USA_R64_SW,USA_R64_NW,USA_POCI,USA_ROCI,USA_RMW,USA_EYE,TOKYO_LAT,TOKYO_LON,TOKYO_GRADE,TOKYO_WIND,TOKYO_PRES,TOKYO_R50_DIR,TOKYO_R50_LONG,TOKYO_R50_SHORT,TOKYO_R30_DIR,TOKYO_R30_LONG,TOKYO_R30_SHORT,TOKYO_LAND,CMA_LAT,CMA_LON,CMA_CAT,CMA_WIND,CMA_PRES,HKO_LAT,HKO_LON,HKO_CAT,HKO_WIND,HKO_PRES,NEWDELHI_LAT,NEWDELHI_LON,NEWDELHI_GRADE,NEWDELHI_WIND,NEWDELHI_PRES,NEWDELHI_CI,NEWDELHI_DP,NEWDELHI_POCI,REUNION_LAT,REUNION_LON,REUNION_TYPE,REUNION_WIND,REUNION_PRES,REUNION_TNUM,REUNION_CI,REUNION_RMW,REUNION_R34_NE,REUNION_R34_SE,REUNION_R34_SW,REUNION_R34_NW,REUNION_R50_NE,REUNION_R50_SE,REUNION_R50_SW,REUNION_R50_NW,REUNION_R64_NE,REUNION_R64_SE,REUNION_R64_SW,REUNION_R64_NW,BOM_LAT,BOM_LON,BOM_TYPE,BOM_WIND,BOM_PRES,BOM_TNUM,BOM_CI,BOM_RMW,BOM_R34_NE,BOM_R34_SE,BOM_R34_SW,BOM_R34_NW,BOM_R50_NE,BOM_R50_SE,BOM_R50_SW,BOM_R50_NW,BOM_R64_NE,BOM_R64_SE,BOM_R64_SW,BOM_R64_NW,BOM_ROCI,BOM_POCI,BOM_EYE,BOM_POS_METHOD,BOM_PRES_METHOD,NADI_LAT,NADI_LON,NADI_CAT,NADI_WIND,NADI_PRES,WELLINGTON_LAT,WELLINGTON_LON,WELLINGTON_WIND,WELLINGTON_PRES,DS824_LAT,DS824_LON,DS824_STAGE,DS824_WIND,DS824_PRES,TD9636_LAT,TD9636_LON,TD9636_STAGE,TD9636_WIND,TD9636_PRES,TD9635_LAT,TD9635_LON,TD9635_WIND,TD9635_PRES,TD9635_ROCI,NEUMANN_LAT,NEUMANN_LON,NEUMANN_CLASS,NEUMANN_WIND,NEUMANN_PRES,MLC_LAT,MLC_LON,MLC_CLASS,MLC_WIND,MLC_PRES,USA_GUST,BOM_GUST,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
0,1949163N07145,1949,38,WP,MM,FM,DELLA,1949-06-12 00:00:00,TS,6.6,145.0,"(6.6, 145.0)",,,,main,991,991,__O___________,,,,,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.6,145.0,0,,1008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13,270
1,1949163N07145,1949,38,WP,MM,FM,DELLA,1949-06-12 03:00:00,TS,6.60023,144.322,"(6.600230000000001, 144.322)",,,,main,1007,1007,__P___________,,,,,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.60023,144.322,0,,1008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,13,270
2,1949163N07145,1949,38,WP,MM,FM,DELLA,1949-06-12 06:00:00,TS,6.6,143.7,"(6.6, 143.7)",,,,main,1026,1026,__O___________,,,,,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.6,143.7,0,,1008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,11,270
3,1949163N07145,1949,38,WP,MM,FM,DELLA,1949-06-12 09:00:00,TS,6.5999,143.17,"(6.5999, 143.17)",,,,main,1045,1040,__P___________,,,,,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.5999,143.17,0,,1008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,270
4,1949163N07145,1949,38,WP,MM,FM,DELLA,1949-06-12 12:00:00,TS,6.6,142.7,"(6.6, 142.7)",,,,main,1040,1022,__O___________,,,,,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6.6,142.7,0,,1008,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9,270


As the coordinates are not being recognized as tuples, we should aggregate them again:

In [10]:
ibtracs["COORDS"] = ibtracs[["LAT", "LON"]].values.tolist()
ibtracs["COORDS"] = ibtracs.COORDS.apply(lambda x: (x[0], x[1]))


After running an script, some of our rows have ISO null:

In [13]:
ibtracs[ibtracs.ISO.isna()]

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,ISO,NAME,ISO_TIME,NATURE,LAT,LON,COORDS,WMO_WIND,WMO_PRES,WMO_AGENCY,TRACK_TYPE,DIST2LAND,LANDFALL,IFLAG,USA_AGENCY,USA_ATCF_ID,USA_LAT,USA_LON,USA_RECORD,USA_STATUS,USA_WIND,USA_PRES,USA_SSHS,USA_R34_NE,USA_R34_SE,USA_R34_SW,USA_R34_NW,USA_R50_NE,USA_R50_SE,USA_R50_SW,USA_R50_NW,USA_R64_NE,USA_R64_SE,USA_R64_SW,USA_R64_NW,USA_POCI,USA_ROCI,USA_RMW,USA_EYE,TOKYO_LAT,TOKYO_LON,TOKYO_GRADE,TOKYO_WIND,TOKYO_PRES,TOKYO_R50_DIR,TOKYO_R50_LONG,TOKYO_R50_SHORT,TOKYO_R30_DIR,TOKYO_R30_LONG,TOKYO_R30_SHORT,TOKYO_LAND,CMA_LAT,CMA_LON,CMA_CAT,CMA_WIND,CMA_PRES,HKO_LAT,HKO_LON,HKO_CAT,HKO_WIND,HKO_PRES,NEWDELHI_LAT,NEWDELHI_LON,NEWDELHI_GRADE,NEWDELHI_WIND,NEWDELHI_PRES,NEWDELHI_CI,NEWDELHI_DP,NEWDELHI_POCI,REUNION_LAT,REUNION_LON,REUNION_TYPE,REUNION_WIND,REUNION_PRES,REUNION_TNUM,REUNION_CI,REUNION_RMW,REUNION_R34_NE,REUNION_R34_SE,REUNION_R34_SW,REUNION_R34_NW,REUNION_R50_NE,REUNION_R50_SE,REUNION_R50_SW,REUNION_R50_NW,REUNION_R64_NE,REUNION_R64_SE,REUNION_R64_SW,REUNION_R64_NW,BOM_LAT,BOM_LON,BOM_TYPE,BOM_WIND,BOM_PRES,BOM_TNUM,BOM_CI,BOM_RMW,BOM_R34_NE,BOM_R34_SE,BOM_R34_SW,BOM_R34_NW,BOM_R50_NE,BOM_R50_SE,BOM_R50_SW,BOM_R50_NW,BOM_R64_NE,BOM_R64_SE,BOM_R64_SW,BOM_R64_NW,BOM_ROCI,BOM_POCI,BOM_EYE,BOM_POS_METHOD,BOM_PRES_METHOD,NADI_LAT,NADI_LON,NADI_CAT,NADI_WIND,NADI_PRES,WELLINGTON_LAT,WELLINGTON_LON,WELLINGTON_WIND,WELLINGTON_PRES,DS824_LAT,DS824_LON,DS824_STAGE,DS824_WIND,DS824_PRES,TD9636_LAT,TD9636_LON,TD9636_STAGE,TD9636_WIND,TD9636_PRES,TD9635_LAT,TD9635_LON,TD9635_WIND,TD9635_PRES,TD9635_ROCI,NEUMANN_LAT,NEUMANN_LON,NEUMANN_CLASS,NEUMANN_WIND,NEUMANN_PRES,MLC_LAT,MLC_LON,MLC_CLASS,MLC_WIND,MLC_PRES,USA_GUST,BOM_GUST,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
6630,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-09 21:00:00,TS,-20.3549,19.6315,"(-20.3549, 19.6315)",,,,main,0,0,P________PP___,,SH101977,-20.3497,19.655,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.3497,19.655,TC,25,,-20.3652,19.5846,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,14,265
6631,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 00:00:00,TS,-20.4075,18.9501,"(-20.4075, 18.9501)",,,,main,0,0,O________OP___,jtwc_sh,SH101977,-20.4,19.0,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.4,19.0,TC,25,,-20.4226,18.8503,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,265
6632,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 03:00:00,TS,-20.4612,18.3287,"(-20.4612, 18.3287)",,,,main,0,0,P________PP___,,SH101977,-20.4575,18.3601,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.4575,18.3601,TC,25,,-20.4686,18.2658,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,265
6633,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 06:00:00,TS,-20.5,17.9,"(-20.5, 17.9)",,,,main,0,0,O________OO___,jtwc_sh,SH101977,-20.5,17.9,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,17.9,TC,25,,-20.5,17.9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,265
6634,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 09:00:00,TS,-20.5098,17.7044,"(-20.5098, 17.7044)",,,,main,0,0,P________PP___,,SH101977,-20.5071,17.6574,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5071,17.6574,TC,25,,-20.5153,17.7983,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,267
6635,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 12:00:00,TS,-20.5059,17.7708,"(-20.5059, 17.7708)",,,,main,0,0,O________OP___,jtwc_sh,SH101977,-20.5,17.7,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,17.7,TC,25,,-20.5177,17.9123,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,89
6636,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 15:00:00,TS,-20.5025,18.0718,"(-20.5025, 18.0718)",,,,main,0,0,P________PP___,,SH101977,-20.498,18.0225,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.498,18.0225,TC,25,,-20.5113,18.1702,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,7,90
6637,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 18:00:00,TS,-20.5,18.5,"(-20.5, 18.5)",,,,main,0,0,O________OO___,jtwc_sh,SH101977,-20.5,18.5,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,18.5,TC,25,,-20.5,18.5,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,8,90


We should again construct a function calculating the missing ``ISO``:

In [15]:
def get_iso(coords):
    return rg.search(coords, mode = 1)[0]["cc"]

We will apply that function just for the rows that are missing:

In [16]:
ibtracs_iso_missing = ibtracs[ibtracs.ISO.isna()].copy()

In [19]:
ibtracs_iso_missing["ISO"] = ibtracs_iso_missing.COORDS.apply(get_iso)

In [20]:
ibtracs_iso_missing

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,ISO,NAME,ISO_TIME,NATURE,LAT,LON,COORDS,WMO_WIND,WMO_PRES,WMO_AGENCY,TRACK_TYPE,DIST2LAND,LANDFALL,IFLAG,USA_AGENCY,USA_ATCF_ID,USA_LAT,USA_LON,USA_RECORD,USA_STATUS,USA_WIND,USA_PRES,USA_SSHS,USA_R34_NE,USA_R34_SE,USA_R34_SW,USA_R34_NW,USA_R50_NE,USA_R50_SE,USA_R50_SW,USA_R50_NW,USA_R64_NE,USA_R64_SE,USA_R64_SW,USA_R64_NW,USA_POCI,USA_ROCI,USA_RMW,USA_EYE,TOKYO_LAT,TOKYO_LON,TOKYO_GRADE,TOKYO_WIND,TOKYO_PRES,TOKYO_R50_DIR,TOKYO_R50_LONG,TOKYO_R50_SHORT,TOKYO_R30_DIR,TOKYO_R30_LONG,TOKYO_R30_SHORT,TOKYO_LAND,CMA_LAT,CMA_LON,CMA_CAT,CMA_WIND,CMA_PRES,HKO_LAT,HKO_LON,HKO_CAT,HKO_WIND,HKO_PRES,NEWDELHI_LAT,NEWDELHI_LON,NEWDELHI_GRADE,NEWDELHI_WIND,NEWDELHI_PRES,NEWDELHI_CI,NEWDELHI_DP,NEWDELHI_POCI,REUNION_LAT,REUNION_LON,REUNION_TYPE,REUNION_WIND,REUNION_PRES,REUNION_TNUM,REUNION_CI,REUNION_RMW,REUNION_R34_NE,REUNION_R34_SE,REUNION_R34_SW,REUNION_R34_NW,REUNION_R50_NE,REUNION_R50_SE,REUNION_R50_SW,REUNION_R50_NW,REUNION_R64_NE,REUNION_R64_SE,REUNION_R64_SW,REUNION_R64_NW,BOM_LAT,BOM_LON,BOM_TYPE,BOM_WIND,BOM_PRES,BOM_TNUM,BOM_CI,BOM_RMW,BOM_R34_NE,BOM_R34_SE,BOM_R34_SW,BOM_R34_NW,BOM_R50_NE,BOM_R50_SE,BOM_R50_SW,BOM_R50_NW,BOM_R64_NE,BOM_R64_SE,BOM_R64_SW,BOM_R64_NW,BOM_ROCI,BOM_POCI,BOM_EYE,BOM_POS_METHOD,BOM_PRES_METHOD,NADI_LAT,NADI_LON,NADI_CAT,NADI_WIND,NADI_PRES,WELLINGTON_LAT,WELLINGTON_LON,WELLINGTON_WIND,WELLINGTON_PRES,DS824_LAT,DS824_LON,DS824_STAGE,DS824_WIND,DS824_PRES,TD9636_LAT,TD9636_LON,TD9636_STAGE,TD9636_WIND,TD9636_PRES,TD9635_LAT,TD9635_LON,TD9635_WIND,TD9635_PRES,TD9635_ROCI,NEUMANN_LAT,NEUMANN_LON,NEUMANN_CLASS,NEUMANN_WIND,NEUMANN_PRES,MLC_LAT,MLC_LON,MLC_CLASS,MLC_WIND,MLC_PRES,USA_GUST,BOM_GUST,BOM_GUST_PER,REUNION_GUST,REUNION_GUST_PER,USA_SEAHGT,USA_SEARAD_NE,USA_SEARAD_SE,USA_SEARAD_SW,USA_SEARAD_NW,STORM_SPEED,STORM_DIR
6630,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-09 21:00:00,TS,-20.3549,19.6315,"(-20.3549, 19.6315)",,,,main,0,0,P________PP___,,SH101977,-20.3497,19.655,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.3497,19.655,TC,25,,-20.3652,19.5846,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,14,265
6631,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 00:00:00,TS,-20.4075,18.9501,"(-20.4075, 18.9501)",,,,main,0,0,O________OP___,jtwc_sh,SH101977,-20.4,19.0,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.4,19.0,TC,25,,-20.4226,18.8503,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,12,265
6632,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 03:00:00,TS,-20.4612,18.3287,"(-20.4612, 18.3287)",,,,main,0,0,P________PP___,,SH101977,-20.4575,18.3601,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.4575,18.3601,TC,25,,-20.4686,18.2658,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,10,265
6633,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 06:00:00,TS,-20.5,17.9,"(-20.5, 17.9)",,,,main,0,0,O________OO___,jtwc_sh,SH101977,-20.5,17.9,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,17.9,TC,25,,-20.5,17.9,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,265
6634,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 09:00:00,TS,-20.5098,17.7044,"(-20.5098, 17.7044)",,,,main,0,0,P________PP___,,SH101977,-20.5071,17.6574,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5071,17.6574,TC,25,,-20.5153,17.7983,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,267
6635,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 12:00:00,TS,-20.5059,17.7708,"(-20.5059, 17.7708)",,,,main,0,0,O________OP___,jtwc_sh,SH101977,-20.5,17.7,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,17.7,TC,25,,-20.5177,17.9123,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,3,89
6636,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 15:00:00,TS,-20.5025,18.0718,"(-20.5025, 18.0718)",,,,main,0,0,P________PP___,,SH101977,-20.498,18.0225,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.498,18.0225,TC,25,,-20.5113,18.1702,1,,,,,,,,,,,,,,,,,,,,,,,,,,,,7,90
6637,1977025S11064,1977,8,SI,MM,,EMILIE,1977-02-10 18:00:00,TS,-20.5,18.5,"(-20.5, 18.5)",,,,main,0,0,O________OO___,jtwc_sh,SH101977,-20.5,18.5,,,,,-1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-20.5,18.5,TC,25,,-20.5,18.5,6,,,,,,,,,,,,,,,,,,,,,,,,,,,,8,90


As all those cyclones have ``ISO`` identically NA, we can fill this value in ``ibtracs``:

In [22]:
ibtracs.ISO.fillna("NA", inplace= True)

In [24]:
null_cols(ibtracs)

Unnamed: 0,Feature,% Null rows
0,BASIN,14.47338
1,SUBBASIN,8.525597


## 2. Getting winds and pressures

In [32]:
# Getting winds

winds = ibtracs.filter(regex='WIND')

wind_cols = list(winds.columns)

ib_winds = ibtracs[['SID','ISO','SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'LAT', 'LON', "COORDS"]+ wind_cols].copy()

for col in wind_cols:
    ib_winds[col] = ib_winds[col].apply(lambda x: x if x!= " " else 0)
    ib_winds[col] = pd.to_numeric(ib_winds[col], errors = "coerce")
    
ib_winds["ALL_WINDS"] = ib_winds[wind_cols].values.tolist()
ib_winds["ALL_WINDS"] = ib_winds["ALL_WINDS"].apply(lambda x: set(x))

ib_winds["MAX_WIND"] = ib_winds["ALL_WINDS"].apply(lambda x: max(x))

ib_winds.drop(columns = wind_cols, inplace= True)

ib_winds

Unnamed: 0,SID,ISO,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,LAT,LON,COORDS,ALL_WINDS,MAX_WIND
0,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 00:00:00,6.60000,145.0000,"(6.6, 145.0)",{0},0
1,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 03:00:00,6.60023,144.3220,"(6.600230000000002, 144.322)",{0},0
2,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 06:00:00,6.60000,143.7000,"(6.6, 143.7)",{0},0
3,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 09:00:00,6.59990,143.1700,"(6.5999, 143.17)",{0},0
4,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 12:00:00,6.60000,142.7000,"(6.6, 142.7)",{0},0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60589,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-02 18:00:00,18.30000,85.0000,"(18.3, 85.0)","{0, 134}",134
60590,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-02 21:00:00,18.61260,85.2425,"(18.6126, 85.2425)","{0, 131}",131
60591,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-03 00:00:00,19.00000,85.5000,"(19.0, 85.5)","{0, 128}",128
60592,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-03 03:00:00,19.54990,85.6774,"(19.5499, 85.6774)","{0, 121}",121


In [33]:
# Getting pressures

pres = ibtracs.filter(regex='PRES')

pres_cols = list(pres.columns)

ib_pres = ibtracs[['SID', 'ISO', 'SEASON', 'NUMBER', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'LAT', 'LON', "COORDS"]+ pres_cols].copy()

for col in pres_cols:
    ib_pres[col] = ib_pres[col].apply(lambda x: x if x!= " " else 0)
    ib_pres[col] = pd.to_numeric(ib_pres[col], errors = "coerce")
    
ib_pres["ALL_PRES"] = ib_pres[pres_cols].values.tolist()
ib_pres["ALL_PRES"] = ib_pres["ALL_PRES"].apply(lambda x: set(x))

def calc_min_pres(set_values):
    """
    Returns the min value greater than 800 in a list.
    If there is no such a value, i.e. all values in the list are <800, 
    then returns 0.
    """
    list_values = list(set_values)
    list_values = [value for value in set_values if value>800]
    try:
        return min(list_values)
    except:
        return 0
    
ib_pres["MIN_PRES"] = ib_pres["ALL_PRES"].apply(calc_min_pres)

ib_pres.drop(columns = pres_cols, inplace= True)

ib_pres

Unnamed: 0,SID,ISO,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,LAT,LON,COORDS,ALL_PRES,MIN_PRES
0,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 00:00:00,6.60000,145.0000,"(6.6, 145.0)","{0, 1008}",1008
1,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 03:00:00,6.60023,144.3220,"(6.600230000000002, 144.322)","{0, 1008}",1008
2,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 06:00:00,6.60000,143.7000,"(6.6, 143.7)","{0, 1008}",1008
3,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 09:00:00,6.59990,143.1700,"(6.5999, 143.17)","{0, 1008}",1008
4,1949163N07145,FM,1949,38,WP,MM,DELLA,1949-06-12 12:00:00,6.60000,142.7000,"(6.6, 142.7)","{0, 1008}",1008
...,...,...,...,...,...,...,...,...,...,...,...,...,...
60589,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-02 18:00:00,18.30000,85.0000,"(18.3, 85.0)","{0, 920}",920
60590,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-02 21:00:00,18.61260,85.2425,"(18.6126, 85.2425)","{0, 921}",921
60591,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-03 00:00:00,19.00000,85.5000,"(19.0, 85.5)","{0, 923}",923
60592,2019117N05088,IN,2019,21,NI,BB,FANI,2019-05-03 03:00:00,19.54990,85.6774,"(19.5499, 85.6774)","{0, 932}",932


## 3. Filtering the data set with new wind and pressures features

In [34]:
ib_prem = ibtracs.copy().drop(columns = wind_cols+pres_cols)

In [35]:
ib_prem = ib_prem[['SID', 'ISO','SEASON', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'NATURE', 'LAT', 'LON', "COORDS","DIST2LAND", "LANDFALL", "USA_SSHS", "STORM_SPEED", "STORM_DIR" ]]

In [36]:
ib_prem.shape

(60594, 16)

In [37]:
ib_prem.head(2)

Unnamed: 0,SID,ISO,SEASON,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,COORDS,DIST2LAND,LANDFALL,USA_SSHS,STORM_SPEED,STORM_DIR
0,1949163N07145,FM,1949,WP,MM,DELLA,1949-06-12 00:00:00,TS,6.6,145.0,"(6.6, 145.0)",991,991,-1,13,270
1,1949163N07145,FM,1949,WP,MM,DELLA,1949-06-12 03:00:00,TS,6.60023,144.322,"(6.600230000000002, 144.322)",1007,1007,-1,13,270


We need to merge our wind and pressure features with ``ib_prem``:

In [38]:
ib_prem = ib_prem.merge(ib_winds, 
                        on = ['SID','ISO', 'SEASON', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'LAT', 'LON', "COORDS"], 
                        how = "inner").merge(ib_pres,
                                             on = ['SID', 'ISO', 'SEASON', 'BASIN', 'SUBBASIN', 'NAME', 'ISO_TIME', 'LAT', 'LON', "COORDS"],
                                             how = "inner")

In [39]:
ib_prem.shape

(60594, 22)

In [40]:
ib_prem.drop(columns = ["NUMBER_x", "NUMBER_y"], inplace = True)

## 2. Calculating maximum winds, lowest pressure, highest storm speed 


In a tropical cyclone, the most important elements are the **highest winds** and the **lowest pressures**. We need to find these elements for each cyclone in the data set. As we replace missing values with 0, we need to filter those out to compute the real minimum pressure. Also, we need to note that the minimum sea level pressure goes from 800 mb on. 

We calculate also the min distance to land and landfall, the maximum storm speed, and the minimum and maximum dates and times of each cyclone. 

In [41]:
#wind_pres shows the aggregated values
#max_wind and min pressure calculated as above: min which is >800 else 0 (meaning there are no valid values)
aggregates = ib_prem.groupby(["SID", "ISO"]).agg({'MAX_WIND':"max", 
                                        "MIN_PRES": calc_min_pres,
                                        "DIST2LAND": "min",
                                        "LANDFALL": "min",
                                        "STORM_SPEED": "max",
                                        "ISO_TIME": "min"
                                        }).reset_index().rename(columns = {"DIST2LAND": "MIN_DIST2LAND",
                                                                           "STORM_SPEED": "MAX_STORMSPEED",
                                                                           "ISO_TIME": "MIN_ISOTIME"})

In [42]:
max_time = ib_prem.groupby(["SID", "ISO"]).agg({"ISO_TIME": "max"}).reset_index().rename(columns = {"ISO_TIME": "MAX_ISOTIME"})

In [43]:
aggregates = aggregates.merge(max_time, on = ["SID", "ISO"], how = "inner")

In [44]:
aggregates["TOTAL_HOURS_EVENT"]= (aggregates["MAX_ISOTIME"]-aggregates["MIN_ISOTIME"]).apply(lambda x: x.days)*24 + (aggregates["MAX_ISOTIME"]-aggregates["MIN_ISOTIME"]).apply(lambda x: x.seconds)/3600

Now, we will calculate the **total time in hours in land**:

In [47]:
time_land = ib_prem[ib_prem.DIST2LAND == 0].groupby(["SID", "ISO"]).agg({"ISO_TIME": "min"}).reset_index()

time_land_max = ib_prem[ib_prem.DIST2LAND == 0].groupby(["SID", "ISO"]).agg({"ISO_TIME": "max"}).reset_index()

time_land = time_land.merge(time_land_max, on = ["SID", "ISO"], how = "left").rename(columns = {"ISO_TIME_x": "TIME_LAND_MIN",
                                                                                       "ISO_TIME_y": "TIME_LAND_MAX"})

time_land["TOTAL_HOURS_IN_LAND"] = (time_land["TIME_LAND_MAX"]- time_land["TIME_LAND_MIN"]).apply(lambda x: x.days)*24+ (time_land["TIME_LAND_MAX"]- time_land["TIME_LAND_MIN"]).apply(lambda x: x.seconds)/3600

In [48]:
time_land.head()

Unnamed: 0,SID,ISO,TIME_LAND_MIN,TIME_LAND_MAX,TOTAL_HOURS_IN_LAND
0,1949163N07145,JP,1949-06-20 15:00:00,1949-06-23 03:00:00,60.0
1,1950241N23140,JP,1950-09-03 03:00:00,1950-09-04 00:00:00,21.0
2,1951224N12316,JM,1951-08-18 03:00:00,1951-08-18 06:00:00,3.0
3,1951224N12316,MX,1951-08-20 06:00:00,1951-08-23 18:00:00,84.0
4,1951337N09150,PH,1951-12-09 12:00:00,1951-12-11 00:00:00,36.0


Finally, we will merge ``aggregates`` and ``time_land``. For the SIDs without data in ``time_land``, we will put 0 as total hours in land, because in this case, the cyclone did not touch land.

In [49]:
aggregates = aggregates.merge(time_land, on = ["SID", "ISO"], how = "left")

aggregates.TOTAL_HOURS_IN_LAND.fillna(0, inplace = True)

In [50]:
aggregates.head(2)

Unnamed: 0,SID,ISO,MAX_WIND,MIN_PRES,MIN_DIST2LAND,LANDFALL,MAX_STORMSPEED,MIN_ISOTIME,MAX_ISOTIME,TOTAL_HOURS_EVENT,TIME_LAND_MIN,TIME_LAND_MAX,TOTAL_HOURS_IN_LAND
0,1949163N07145,FM,0,1005,893,888,13,1949-06-12 00:00:00,1949-06-14 09:00:00,57.0,NaT,NaT,0.0
1,1949163N07145,JP,116,952,0,0,33,1949-06-19 09:00:00,1949-06-24 06:00:00,117.0,1949-06-20 15:00:00,1949-06-23 03:00:00,60.0


## 3. Coordinates manipulation, length of the path followed by a cyclone

It would be interesting to know what was the total distance followed by the cyclone both in land and outside.

In [56]:
#creating list of the path followed by each cyclone

def agg_coords(series):
    coords_list = series.tolist()
    return coords_list

paths = ib_prem.groupby(["SID", "ISO"]).agg({"COORDS": agg_coords}).reset_index()

In [57]:
from geopy import distance

In [58]:
def get_distance(coords_list):
    dist_list = []
    n = len(coords_list)
    if n==0:
        return 0
    else:
        for i in range(n):
            if i>0:
                dist_list.append(distance.distance(coords_list[i-1], coords_list[i]).km)
        return sum(dist_list)

In [59]:
paths["LENGTH_PATH"] = paths.COORDS.apply(get_distance)
paths["VINCENTY_LENGTH"] = paths.COORDS.apply(lambda x: distance.distance(x[0], x[-1]).km)

In [60]:
paths

Unnamed: 0,SID,ISO,COORDS,LENGTH_PATH,VINCENTY_LENGTH
0,1949163N07145,FM,"[(6.6, 145.0), (6.600230000000002, 144.322), (...",937.785294,931.051958
1,1949163N07145,JP,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...",3889.480640,3632.626180
2,1949163N07145,KR,"[(36.1379, 130.923), (36.5117, 130.75799999999...",109.794071,92.380409
3,1949163N07145,PH,"[(12.6455, 131.224), (13.02, 130.68), (13.4415...",1223.557289,1120.943289
4,1949163N07145,PW,"[(7.5, 136.2), (7.63498, 135.792), (7.8, 135.4...",795.689277,721.404174
...,...,...,...,...,...
3676,2019113S10051,MZ,"[(-11.8576, 41.2852), (-12.1, 40.8), (-12.2265...",336.098173,304.360948
3677,2019113S10051,YT,"[(-10.7, 46.6)]",0.000000,0.000000
3678,2019117N05088,ID,"[(4.72566, 89.0623), (5.1, 89.8), (5.712269999...",244.277401,191.401909
3679,2019117N05088,IN,"[(7.047460000000001, 89.2377), (7.5, 88.9), (7...",1794.999316,1501.754117


In [61]:
aggregates = aggregates.merge(paths, on= ["SID", "ISO"], how = "inner")

In [62]:
aggregates["MONTH_START"] = aggregates.MIN_ISOTIME.apply(lambda x: x.month)
aggregates["MONTH_END"] = aggregates.MAX_ISOTIME.apply(lambda x: x.month)

In [64]:
aggregates.head(2)

Unnamed: 0,SID,ISO,MAX_WIND,MIN_PRES,MIN_DIST2LAND,LANDFALL,MAX_STORMSPEED,MIN_ISOTIME,MAX_ISOTIME,TOTAL_HOURS_EVENT,TIME_LAND_MIN,TIME_LAND_MAX,TOTAL_HOURS_IN_LAND,COORDS,LENGTH_PATH,VINCENTY_LENGTH,MONTH_START,MONTH_END
0,1949163N07145,FM,0,1005,893,888,13,1949-06-12 00:00:00,1949-06-14 09:00:00,57.0,NaT,NaT,0.0,"[(6.6, 145.0), (6.600230000000002, 144.322), (...",937.785294,931.051958,6,6
1,1949163N07145,JP,116,952,0,0,33,1949-06-19 09:00:00,1949-06-24 06:00:00,117.0,1949-06-20 15:00:00,1949-06-23 03:00:00,60.0,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...",3889.48064,3632.62618,6,6


## 4. Finding coordinates with maximum winds and minimum distance to land

We want to find those coordinates and store them.

In [65]:
max_winds = ib_prem.merge(aggregates[["SID", "ISO", "MAX_WIND"]], 
                          on = ["SID", "ISO", "MAX_WIND"], 
                          how = "right")

In [66]:
max_winds_coords = max_winds.groupby(["SID", "ISO"]).agg({"COORDS": agg_coords}).reset_index().rename(columns = {"COORDS": "COORDS_MAX_WINDS"})

In [67]:
min_dist2land = ib_prem.merge(aggregates[["SID", "ISO", "MIN_DIST2LAND"]].rename(columns = {"MIN_DIST2LAND": "DIST2LAND"}), 
                          on = ["SID", "ISO", "DIST2LAND"], 
                          how = "right")

In [68]:
min_dist2land_coords = min_dist2land.groupby(["SID", "ISO"]).agg({"COORDS": agg_coords}).reset_index().rename(columns = {"COORDS": "COORDS_MIN_DIST2LAND"})

In [69]:
coords_cyclones = max_winds_coords.merge(min_dist2land_coords, on = ["SID", "ISO"], how = "left")

In [70]:
coords_cyclones

Unnamed: 0,SID,ISO,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND
0,1949163N07145,FM,"[(6.6, 145.0), (6.600230000000002, 144.322), (...","[(7.0, 137.7)]"
1,1949163N07145,JP,"[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999..."
2,1949163N07145,KR,"[(36.1379, 130.923)]","[(36.7857, 130.714)]"
3,1949163N07145,PH,"[(20.873, 125.085)]","[(19.1179, 124.693)]"
4,1949163N07145,PW,"[(12.2619, 131.704)]","[(10.285, 132.107)]"
...,...,...,...,...
3676,2019113S10051,MZ,"[(-11.8576, 41.2852)]","[(-12.2265, 40.429), (-12.3, 40.1), (-12.4076,..."
3677,2019113S10051,YT,"[(-10.7, 46.6)]","[(-10.7, 46.6)]"
3678,2019117N05088,ID,"[(5.1, 89.8), (5.712269999999998, 89.8377), (6...","[(5.712269999999998, 89.8377)]"
3679,2019117N05088,IN,"[(17.6, 84.8), (17.965, 84.855), (18.3, 85.0)]","[(20.2, 85.8)]"


## 5. Selecting maximum category in general and inland

In [71]:
ib_prem["USA_SSHS"] = ib_prem.USA_SSHS.apply(lambda x: -6 if x== " " else x)

In [72]:
ib_prem["USA_SSHS"] = pd.to_numeric(ib_prem.USA_SSHS)

In [73]:
max_scales = ib_prem.groupby(["SID", "ISO"]).agg({"USA_SSHS": "max"}).reset_index()

In [74]:
max_inland = ib_prem[ib_prem.DIST2LAND ==0].groupby(["SID", "ISO"]).agg({"USA_SSHS": "max"}).reset_index().rename(columns = {"USA_SSHS":"MAX_USA_SSHS_INLAND"})

In [75]:
scales = max_scales.merge(max_inland, on =["SID", "ISO"], how = "left").rename(columns = {"USA_SSHS": "MAX_USA_SSHS"})

In [76]:
scales

Unnamed: 0,SID,ISO,MAX_USA_SSHS,MAX_USA_SSHS_INLAND
0,1949163N07145,FM,-1,
1,1949163N07145,JP,3,0.0
2,1949163N07145,KR,0,
3,1949163N07145,PH,2,
4,1949163N07145,PW,-1,
...,...,...,...,...
3676,2019113S10051,MZ,4,3.0
3677,2019113S10051,YT,1,
3678,2019117N05088,ID,0,
3679,2019117N05088,IN,4,4.0


For cyclones which does not touch land, we will fill this with "No_landing" eventually.

In [77]:
basins_subbasins = ib_prem.groupby(["SID", "ISO"]).agg({"BASIN": agg_coords, "SUBBASIN": agg_coords})

In [78]:
basins_subbasins["BASIN"] = basins_subbasins.BASIN.apply(lambda x: set(x))
basins_subbasins["SUBBASIN"] = basins_subbasins.SUBBASIN.apply(lambda x: set(x))

basins_subbasins["BASIN_NUMBER"] = basins_subbasins.BASIN.apply(lambda x: len(x))
basins_subbasins["SUBBASIN_NUMBER"] = basins_subbasins.SUBBASIN.apply(lambda x: len(x))

In [79]:
basins_subbasins.BASIN_NUMBER.value_counts()

1    3617
2      64
Name: BASIN_NUMBER, dtype: int64

## 6. Aggregating all in one

In [80]:
aggregates = aggregates.merge(coords_cyclones, 
                              on = ["SID", "ISO"], 
                              how = "inner").merge(scales, 
                                                   on =["SID", "ISO"], 
                                                   how = "inner")

In [81]:
aggregates.head()

Unnamed: 0,SID,ISO,MAX_WIND,MIN_PRES,MIN_DIST2LAND,LANDFALL,MAX_STORMSPEED,MIN_ISOTIME,MAX_ISOTIME,TOTAL_HOURS_EVENT,TIME_LAND_MIN,TIME_LAND_MAX,TOTAL_HOURS_IN_LAND,COORDS,LENGTH_PATH,VINCENTY_LENGTH,MONTH_START,MONTH_END,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,MAX_USA_SSHS,MAX_USA_SSHS_INLAND
0,1949163N07145,FM,0,1005,893,888,13,1949-06-12 00:00:00,1949-06-14 09:00:00,57.0,NaT,NaT,0.0,"[(6.6, 145.0), (6.600230000000002, 144.322), (...",937.785294,931.051958,6,6,"[(6.6, 145.0), (6.600230000000002, 144.322), (...","[(7.0, 137.7)]",-1,
1,1949163N07145,JP,116,952,0,0,33,1949-06-19 09:00:00,1949-06-24 06:00:00,117.0,1949-06-20 15:00:00,1949-06-23 03:00:00,60.0,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...",3889.48064,3632.62618,6,6,"[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3,0.0
2,1949163N07145,KR,77,984,107,106,10,1949-06-21 06:00:00,1949-06-21 15:00:00,9.0,NaT,NaT,0.0,"[(36.1379, 130.923), (36.5117, 130.75799999999...",109.794071,92.380409,6,6,"[(36.1379, 130.923)]","[(36.7857, 130.714)]",0,
3,1949163N07145,PH,97,969,264,256,13,1949-06-16 09:00:00,1949-06-19 06:00:00,69.0,NaT,NaT,0.0,"[(12.6455, 131.224), (13.02, 130.68), (13.4415...",1223.557289,1120.943289,6,6,"[(20.873, 125.085)]","[(19.1179, 124.693)]",2,
4,1949163N07145,PW,30,1000,658,625,16,1949-06-14 12:00:00,1949-06-16 06:00:00,42.0,NaT,NaT,0.0,"[(7.5, 136.2), (7.63498, 135.792), (7.8, 135.4...",795.689277,721.404174,6,6,"[(12.2619, 131.704)]","[(10.285, 132.107)]",-1,


## 7. Merging with populational data, basins and subbasins

First we load the consolidated data set.

In [82]:
final = pd.read_excel("Data_input/OUTPUT_WBI_exposer_cyclones_v10.xls")

In [83]:
final.head(2)

Unnamed: 0,SID,NAME,ISO,YEAR,BASIN,SUB BASIN,NATURE,ISO_TIME,COORDS,TOTAL_HRS,DAY_HRS,NIGHT_HRS,USA_SSHS,WIND_CALC_MEAN,PRES_CALC_MEAN,V_LAND_KN,34KN_POP,34KN_ASSETS,64KN_POP,64KN_ASSETS,96KN_POP,96KN_ASSETS,Population density (people per sq. km of land area),TOTAL_AFFECTED,pop_max_34,pop_max_50,pop_max_64,pop_max_34_adj,pop_max_50_adj,pop_max_64_adj
0,1949163N07145,DELLA,JPN,1949,WP,WP,TS,"['1949-06-21 03:00:00', '1949-06-22 03:00:00',...","[[35.67, 130.99], [38.35, 135.07], [38.82, 136...",0,0,0,0,39.133333,1000.333333,99.82,,,,,,,258.911917,194046,16431894.0,7958809.0,4553851.0,11983000.0,5803981.0,3320907.0
1,1950241N23140,JANE,JPN,1950,WP,WP,ET,"['1950-09-02 18:00:00', '1950-09-03 12:00:00',...","[[31.71, 134.12], [38.48, 137.98], [46.7, 152.8]]",6,0,6,-1,76.4,974.833333,99.82,,,,,,,258.911917,642117,43343320.0,21380746.0,13972339.0,31608220.0,15591960.0,10189360.0


In [84]:
#dropping irrelevant columns
final_relevant = final[['SID', 'NAME', 'ISO', 'YEAR', 'BASIN', 'SUB BASIN', 'NATURE',
               'ISO_TIME', 'V_LAND_KN', '34KN_POP','34KN_ASSETS', '64KN_POP', '64KN_ASSETS', 
               '96KN_POP', '96KN_ASSETS', 'pop_max_34_adj', 'pop_max_50_adj', 'pop_max_64_adj',
               'pop_max_34', 'pop_max_50', 'pop_max_64', 'TOTAL_AFFECTED']].copy()

We want to check whether there are SID's present in ``final_relevant`` which are missing in ``aggregates``:

In [85]:
ids_final = set(final_relevant.SID.value_counts().index)

Now, we check for them in our produced data set:

In [86]:
ids_agg = set(aggregates.SID.value_counts().index)

And finally we see which ids are in ids_final but not in ids_agg:

In [87]:
missing_ids = list(ids_final.difference(ids_agg))

In [88]:
missing_ids

['2018142N09057', '2018322N12110', '2018137N13050', '2018365N09113']

Let see if the other names are not in our data set:

In [90]:
names_missing = final[final["SID"].isin(missing_ids)].NAME.to_list()

In [91]:
#SIDS, names and years from final whose ids are not in in_prem and so not in aggregates
final_dif = final[final["SID"].isin(missing_ids)][["SID", "NAME", "YEAR"]].drop_duplicates()

In [92]:
names_dif = ib_prem[(ib_prem.NAME.isin(names_missing)) & (ib_prem.SEASON >=2017)][["SID", "NAME", "SEASON"]].copy().drop_duplicates()

In [93]:
names_dif

Unnamed: 0,SID,NAME,SEASON


In [94]:
ids_present = set(ib_prem.SID.to_list())

#Check which are still missing
ids_not_touch = list(ids_final.difference(ids_present))

In [95]:
ids_not_touch

['2018142N09057', '2018322N12110', '2018137N13050', '2018365N09113']

In [96]:
final_to_concat = final_relevant[final_relevant.SID.isin(ids_not_touch)].copy()

In [97]:
final_relevant = final_relevant[~(final_relevant.SID.isin(ids_not_touch))]

In [98]:
final_relevant.shape

(1001, 22)

As we have our ``aggregates`` with ISO2 we should change this to ISO 3 as in the original dataset:

In [103]:
aggregates.head(2)

Unnamed: 0,SID,ISO,MAX_WIND,MIN_PRES,MIN_DIST2LAND,LANDFALL,MAX_STORMSPEED,MIN_ISOTIME,MAX_ISOTIME,TOTAL_HOURS_EVENT,TIME_LAND_MIN,TIME_LAND_MAX,TOTAL_HOURS_IN_LAND,COORDS,LENGTH_PATH,VINCENTY_LENGTH,MONTH_START,MONTH_END,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,MAX_USA_SSHS,MAX_USA_SSHS_INLAND
0,1949163N07145,FM,0,1005,893,888,13,1949-06-12 00:00:00,1949-06-14 09:00:00,57.0,NaT,NaT,0.0,"[(6.6, 145.0), (6.600230000000002, 144.322), (...",937.785294,931.051958,6,6,"[(6.6, 145.0), (6.600230000000002, 144.322), (...","[(7.0, 137.7)]",-1,
1,1949163N07145,JP,116,952,0,0,33,1949-06-19 09:00:00,1949-06-24 06:00:00,117.0,1949-06-20 15:00:00,1949-06-23 03:00:00,60.0,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...",3889.48064,3632.62618,6,6,"[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3,0.0


In [104]:
import country_converter as coco

In [107]:
def convert_iso(code):
    return coco.convert(names=code, to='ISO3', not_found=None)

In [109]:
aggregates["ISO3"] = aggregates.ISO.apply(convert_iso)

In [113]:
aggregates["ISO2"] = aggregates["ISO"]
aggregates["ISO"] = aggregates["ISO3"]

In [119]:
final_merged = final_relevant.merge(aggregates, on = ["SID", "ISO"], how = "left").drop(columns = "ISO_TIME")

In [120]:
final_merged = final_merged[['SID', 'NAME', 'BASIN', 'SUB BASIN', 'ISO', 'YEAR', 'MONTH_START',
                            'MONTH_END','MIN_ISOTIME', 'MAX_ISOTIME', 'TIME_LAND_MIN', 'TIME_LAND_MAX',
                             'TOTAL_HOURS_EVENT', 'TOTAL_HOURS_IN_LAND', 'NATURE', 'COORDS', 
                             'COORDS_MAX_WINDS', 'COORDS_MIN_DIST2LAND', 'LENGTH_PATH', 'VINCENTY_LENGTH',
                             'MAX_WIND', 'MIN_PRES', 'MIN_DIST2LAND', 'MAX_STORMSPEED', 'MAX_USA_SSHS',
                             'MAX_USA_SSHS_INLAND', 'V_LAND_KN', '34KN_POP',  '64KN_POP', '96KN_POP',
                              'pop_max_34_adj', 'pop_max_50_adj', 'pop_max_64_adj', 
                             'pop_max_34', 'pop_max_50','pop_max_64', 
                             '64KN_ASSETS',  '34KN_ASSETS', '96KN_ASSETS',
                              'TOTAL_AFFECTED']].copy()

In [121]:
final_merged.rename(columns= {"MIN_ISOTIME": "DATE_START",
                              "MAX_ISOTIME": "DATE_END",
                              "TIME_LAND_MIN": "DATE_LAND_START",
                              "TIME_LAND_MAX": "DATE_LAND_END",
                              "LENGTH_PATH": "DISTANCE_TRACK",
                              "VINCENTY_LENGTH": "DISTANCE_TRACK_VINCENTY"}, inplace= True)

In [122]:
final_merged.columns = [x.upper() for x in final_merged.columns]

In [123]:
final_merged.head()

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_START,DATE_END,DATE_LAND_START,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,TOTAL_AFFECTED
0,1949163N07145,DELLA,WP,WP,JPN,1949,6,6,1949-06-19 09:00:00,1949-06-24 06:00:00,1949-06-20 15:00:00,1949-06-23 03:00:00,117.0,60.0,TS,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3889.48064,3632.62618,116,952,0,33,3,0.0,99.82,,,,11983000.0,5803981.0,3320907.0,16431894.0,7958809.0,4553851.0,,,,194046
1,1950241N23140,JANE,WP,WP,JPN,1950,8,9,1950-08-30 03:00:00,1950-09-05 00:00:00,1950-09-03 03:00:00,1950-09-04 00:00:00,141.0,21.0,ET,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",3596.090691,2787.480021,116,940,0,31,3,1.0,99.82,,,,31608220.0,15591960.0,10189360.0,43343320.0,21380746.0,13972339.0,,,,642117
2,1951224N12316,CHARLIE,NAm,CS,JAM,1951,8,8,1951-08-17 21:00:00,1951-08-18 12:00:00,1951-08-18 03:00:00,1951-08-18 06:00:00,15.0,3.0,TS,"[(17.3398, -75.4138), (17.6, -76.2), (17.9, -7...","[(17.6, -76.2), (17.9, -76.9)]","[(17.9, -76.9), (18.1, -77.8)]",438.214691,435.729082,110,0,0,17,3,3.0,132.4,2788659.0,2788659.0,2552903.0,1689243.0,1687083.0,1195052.0,2753687.5,2750167.0,1948090.75,21481980000.0,21481980000.0,19483460000.0,20200
3,1951337N09150,AMY,WP,WP,PHL,1951,12,12,1951-12-07 09:00:00,1951-12-17 00:00:00,1951-12-09 12:00:00,1951-12-11 00:00:00,231.0,36.0,TS,"[(12.0725, 130.967), (12.1333, 130.517), (12.1...","[(11.7833, 127.9)]","[(10.7333, 124.8), (10.534, 123.174), (10.6833...",2238.058669,1905.587795,120,924,0,11,4,2.0,119.2,20464826.0,12675908.0,5618193.0,4760039.0,3006670.0,1114774.0,14131997.0,8926450.0,3309632.25,113105000000.0,188028000000.0,48973330000.0,60000
4,1952180N05144,EMMA,WP,WP,PHL,1952,7,7,1952-07-01 06:00:00,1952-07-04 18:00:00,1952-07-02 12:00:00,1952-07-02 21:00:00,84.0,9.0,TS,"[(9.33776, 130.185), (9.38633, 129.61), (9.45,...","[(9.45, 129.017), (9.50407, 128.407), (9.56613...","[(10.4667, 123.867), (10.6492, 123.239), (10.8...",1876.18679,1797.712213,110,968,0,15,3,1.0,110691.0,,,,5130796.0,2789486.0,1988524.0,15232730.0,8281656.5,5903694.0,,,,103


In [124]:
final_merged.set_index("SID").to_excel("Data_output/final_with_iso.xls")

There are some elements which were not present in the latest version of ibtracs (version 4) but that were present in the first version of the data set. Here we will concat those rows:

In [125]:
to_concat = pd.read_excel("Data_output/to_concat.xls")

In [126]:
to_concat

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_BEGIN,DATE_END,DATE_LAND_BEGIN,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,TOTAL_AFFECTED
0,2018137N13050,SAGAR,NI,AS,SOM,2018,5,5,2018-05-16 18:00:00,2018-05-20 00:00:00,2018-05-19 12:00:00,2018-05-20 00:00:00,,,NR,"[(13.2, 49.5), (13.1, 48.9), (13.2, 48.8), (12...","[(11.2, 45.3)]",[(10.3 43.8)],,,44.82,994,0,,0,0,64.8,90000000.0,0.0,0.0,914728.8,280693.2,6462.713,968672600.0,297246.3,6843.835,,,,228000
1,2018142N09057,MEKUNU,NI,AS,YEM,2018,5,5,2018-05-22 00:00:00,2018-05-27 00:00:00,2018-05-23 06:00:00,2018-05-26 00:00:00,,,NR,"[(9.1, 57.3), (9.6, 57.2), (10.4, 56.9), (10.7...","[(16.4, 54.3)]","[(16.4, 54.3)]",,,99.89,960,0,,1,0,100.0,,,,104210.4,2145.2,0.0,109063.4,2245.101,0.0,,,,750
2,2018322N12110,TORAJI,WP,WP,VNM,2018,11,11,2018-11-17 18:00:00,2018-11-19 21:00:00,2018-11-17 18:00:00,2018-11-19 21:00:00,,,NR,"[(10.7, 111.0), (11.5, 110.0), (11.7, 109.6), ...","[ (11.7, 109.2)]","[ (11.7, 109.2)]",,,24.838,1004,0,0.0,-1,0,36.0,,,,44488220.0,5598202.0,1664410.0,45325540.0,5703566.0,1695737000.0,,,,10028
3,2018365N09113,PABUK,NI,BB,THA,2019,1,1,2018-12-31 06:00:00,2019-01-06 12:00:00,2019-01-03 00:00:00,2019-01-05 12:00:00,,,NR,"[(8.5, 112.6), (7.7, 111.2), (6.6, 111.0), (6....","[(8.2, 100.7)]","[(8.2, 100.7)]",,,50.216,994,0,13.49,0,0,50.0,,,,3900000.0,3200000.0,0.0,3900000.0,3200000.0,0.0,,,,720885


First, we will handle the columns accordingly:

In [127]:
to_concat["TOTAL_HOURS_EVENT"] = (to_concat["DATE_END"]-to_concat["DATE_BEGIN"]).apply(lambda x: x.days)*24+ (to_concat["DATE_END"]-to_concat["DATE_BEGIN"]).apply(lambda x: x.seconds)/3600 

to_concat["TOTAL_HOURS_IN_LAND"] = (to_concat["DATE_LAND_END"]-to_concat["DATE_LAND_BEGIN"]).apply(lambda x: x.days)*24+ (to_concat["DATE_LAND_END"]-to_concat["DATE_LAND_BEGIN"]).apply(lambda x: x.seconds)/3600 

to_concat["COORDS"] = to_concat.COORDS.apply(lambda x: ast.literal_eval(x))

to_concat["DISTANCE_TRACK"] = to_concat["COORDS"].apply(get_distance)


to_concat["DISTANCE_TRACK_VINCENTY"] = to_concat["COORDS"].apply(lambda x: distance.distance(x[0], x[-1]).km)

to_concat.rename(columns = {"DATE_BEGIN": 'DATE_START', "DATE_LAND_BEGIN": 'DATE_LAND_START'}, inplace= True)

final_data = pd.concat([final_merged,to_concat])

Now we finally concat the two data sets and order it by date:

In [145]:
final_data.sort_values(["DATE_START", "NAME", "ISO"], inplace= True)

In [146]:
final_data

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_START,DATE_END,DATE_LAND_START,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,TOTAL_AFFECTED
0,1949163N07145,DELLA,WP,WP,JPN,1949,6,6,1949-06-19 09:00:00,1949-06-24 06:00:00,1949-06-20 15:00:00,1949-06-23 03:00:00,117.0,60.0,TS,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3889.480640,3632.626180,116.0,952,0,33.0,3,0.0,99.82,,,,1.198300e+07,5.803981e+06,3.320907e+06,1.643189e+07,7.958809e+06,4.553851e+06,,,,194046
1,1950241N23140,JANE,WP,WP,JPN,1950,8,9,1950-08-30 03:00:00,1950-09-05 00:00:00,1950-09-03 03:00:00,1950-09-04 00:00:00,141.0,21.0,ET,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",3596.090691,2787.480021,116.0,940,0,31.0,3,1.0,99.82,,,,3.160822e+07,1.559196e+07,1.018936e+07,4.334332e+07,2.138075e+07,1.397234e+07,,,,642117
2,1951224N12316,CHARLIE,NAm,CS,JAM,1951,8,8,1951-08-17 21:00:00,1951-08-18 12:00:00,1951-08-18 03:00:00,1951-08-18 06:00:00,15.0,3.0,TS,"[(17.3398, -75.4138), (17.6, -76.2), (17.9, -7...","[(17.6, -76.2), (17.9, -76.9)]","[(17.9, -76.9), (18.1, -77.8)]",438.214691,435.729082,110.0,0,0,17.0,3,3.0,132.40,2788659.0,2788659.0,2552903.0,1.689243e+06,1.687083e+06,1.195052e+06,2.753688e+06,2.750167e+06,1.948091e+06,2.148198e+10,2.148198e+10,1.948346e+10,20200
3,1951337N09150,AMY,WP,WP,PHL,1951,12,12,1951-12-07 09:00:00,1951-12-17 00:00:00,1951-12-09 12:00:00,1951-12-11 00:00:00,231.0,36.0,TS,"[(12.0725, 130.967), (12.1333, 130.517), (12.1...","[(11.7833, 127.9)]","[(10.7333, 124.8), (10.534, 123.174), (10.6833...",2238.058669,1905.587795,120.0,924,0,11.0,4,2.0,119.20,20464826.0,12675908.0,5618193.0,4.760039e+06,3.006670e+06,1.114774e+06,1.413200e+07,8.926450e+06,3.309632e+06,1.131050e+11,1.880280e+11,4.897333e+10,60000
4,1952180N05144,EMMA,WP,WP,PHL,1952,7,7,1952-07-01 06:00:00,1952-07-04 18:00:00,1952-07-02 12:00:00,1952-07-02 21:00:00,84.0,9.0,TS,"[(9.33776, 130.185), (9.38633, 129.61), (9.45,...","[(9.45, 129.017), (9.50407, 128.407), (9.56613...","[(10.4667, 123.867), (10.6492, 123.239), (10.8...",1876.186790,1797.712213,110.0,968,0,15.0,3,1.0,110691.00,,,,5.130796e+06,2.789486e+06,1.988524e+06,1.523273e+07,8.281656e+06,5.903694e+06,,,,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
997,2019068S17040,IDAI,SI,SI,MOZ,2019,3,3,2019-03-09 06:00:00,2019-03-14 12:00:00,NaT,NaT,126.0,0.0,NR,"[(-17.0, 40.2), (-17.1654, 40.782), (-17.2, 41...","[(-19.7, 37.9), (-19.75, 37.5758), (-19.8, 37.2)]","[(-17.0, 40.2)]",957.628405,522.204526,105.0,944,70,12.0,3,,151.20,,,,2.396895e+06,0.000000e+00,0.000000e+00,2.467099e+06,0.000000e+00,0.000000e+00,,,,1501500
996,2019068S17040,IDAI,SI,SI,MDG,2019,3,3,2019-03-09 21:00:00,2019-03-12 12:00:00,NaT,NaT,63.0,0.0,NR,"[(-17.0124, 42.2448), (-17.2, 42.4), (-17.3374...","[(-17.6, 42.6), (-17.6499, 42.5424), (-17.7, 4...","[(-17.3, 43.2)]",404.150545,211.930107,105.0,956,77,7.0,3,,151.20,,,,2.341183e+05,0.000000e+00,0.000000e+00,2.403834e+08,0.000000e+00,0.000000e+00,,,,1100
998,2019113S10051,KENNETH,SI,SI,COM,2019,4,4,2019-04-24 03:00:00,2019-04-25 06:00:00,NaT,NaT,27.0,0.0,NR,"[(-10.7847, 46.1575), (-10.9, 45.7), (-11.0076...","[(-11.6, 41.8)]","[(-11.6, 41.8)]",486.400190,484.370217,124.0,937,138,13.0,4,,118.80,,,,,,,,,,,,,345311
999,2019113S10051,KENNETH,SI,SI,MOZ,2019,4,4,2019-04-25 09:00:00,2019-04-26 12:00:00,2019-04-25 15:00:00,2019-04-26 12:00:00,27.0,21.0,NR,"[(-11.8576, 41.2852), (-12.1, 40.8), (-12.2265...","[(-11.8576, 41.2852)]","[(-12.2265, 40.429), (-12.3, 40.1), (-12.4076,...",336.098173,304.360948,121.0,939,0,11.0,4,3.0,119.00,,,,3.758703e+06,7.220010e+05,1.563119e+05,3.868792e+06,7.431479e+08,1.608902e+05,,,,400094


In [147]:
final_data.shape

(1009, 40)

## 8. Merging with other indicators

Below we merge the data set obtained with data containing population indicators and the Human Development Index (HDI).

In [148]:
ds = pd.read_excel("Data_output/final_data.xls").drop(columns = "Unnamed: 0")

In [182]:
ds.head()

Unnamed: 0,SID,NAME,ISO,YEAR,BASIN,SUB BASIN,NATURE,ISO_TIME,COORDS,TOTAL_HRS,DAY_HRS,NIGHT_HRS,USA_SSHS,WIND_MIN,WIND_MAX,WIND_CALC_MEAN,PRES_MIN,PRES_MAX,PRES_CALC_MEAN,STORM_SPD_MIN,STORM_SPD_MAX,STORM_SPD_MEAN,STORM_DR_MIN,STORM_DR_MAX,STORM_DR_MEAN,V_LAND_KN,34KN_POP,34KN_ASSETS,64KN_POP,64KN_ASSETS,96KN_POP,96KN_ASSETS,TOTAL_DAMAGE_(000$),TOTAL_DEATHS,pop_max_34,pop_max_50,pop_max_64,TOTAL_AFFECTED,POP_DEN_SQ_KM,RURAL_POP(%),POP_TOTAL,RURAL_POP,hdi
0,1949163N07145,DELLA,JPN,1949,WP,WP,TS,"['1949-06-21 03:00:00', '1949-06-22 03:00:00',...","[[35.67, 130.99], [38.35, 135.07], [38.82, 136...",0,0,0,0,33.0,55.0,39.133333,982.0,1008.0,1000.333333,11.0,27.0,19.0,41.0,356.0,91.444444,99.82,,,,,,,,419.0,16278929.0,4378623.5,732154.8,194046,195.0,77.2,81700000,63072400.0,0.816
1,1950241N23140,JANE,JPN,1950,WP,WP,ET,"['1950-09-02 18:00:00', '1950-09-03 12:00:00',...","[[31.71, 134.12], [38.48, 137.98], [46.7, 152.8]]",6,0,6,-1,55.6,97.2,76.4,947.5,1000.0,974.833333,17.0,30.0,23.0,13.0,66.0,37.666667,99.82,,,,,,,,509.0,39024004.0,14551006.0,4093306.0,642117,227.0,47.0,82800000,38916000.0,0.816
2,1951224N12316,CHARLIE,JAM,1951,NAm,CS,TS,"['1951-08-17 21:00:00', '1951-08-18 12:00:00']","[[17.34, -75.41], [18.3, -79.4]]",15,0,15,1,75.0,102.0,88.5,972.0,977.0,974.5,15.0,16.0,15.5,279.0,288.0,283.5,132.4,2788659.0,21481980000.0,2788659.0,21481980000.0,2552903.0,19483460000.0,56000.0,154.0,2753687.5,2606976.5,1067878.0,20200,133.0,66.23,1440000,953712.0,0.668
3,1951337N09150,AMY,PHL,1951,WP,WP,TS,"['1951-12-08 18:00:00', '1951-12-08 21:00:00',...","[[11.52, 127.35], [11.34, 127.09], [11.17, 126...",15,0,15,2,73.6,109.2,84.488235,948.666667,992.333333,979.54902,1.0,8.0,3.0,0.0,340.0,161.647059,119.2,20464826.0,188028000000.0,12675908.0,113105000000.0,5618193.0,48973330000.0,,569.0,14110069.0,8444818.0,1750716.0,60000,64.5,75.35,19200000,14467200.0,0.59
4,1952180N05144,EMMA,PHL,1952,WP,WP,TS,"['1952-07-01 18:00:00', '1952-07-01 21:00:00',...","[[9.57, 127.79], [9.63, 127.19], [12.38, 119.4...",3,2,1,1,65.6,105.4,83.92,972.666667,985.666667,980.6,10.0,12.0,10.8,276.0,303.0,291.4,110691.0,,,,,,,,19.0,15200242.0,7952954.5,4470182.0,103,66.9,72.41,19900000,14409590.0,0.59


In [150]:
ds_copy = ds[["SID", "NAME", "ISO", "YEAR", "SUB BASIN", "POP_DEN_SQ_KM", "RURAL_POP(%)", "POP_TOTAL", "RURAL_POP", "hdi", "TOTAL_DAMAGE_(000$)", "TOTAL_DEATHS"]].copy()

ds_copy.drop_duplicates(keep = "first", inplace= True)

ds_copy

In [212]:
merged_trial = final_data.merge(ds_copy, on = ["SID", "ISO", "YEAR", "SUB BASIN", "NAME"], how = "left")
#rearranging columns
merged_trial = merged_trial[['SID', 'NAME', 'BASIN', 'SUB BASIN', 'ISO', 'YEAR', 'MONTH_START',
                            'MONTH_END', 'DATE_START', 'DATE_END', 'DATE_LAND_START',
                            'DATE_LAND_END', 'TOTAL_HOURS_EVENT', 'TOTAL_HOURS_IN_LAND', 'NATURE',
                            'COORDS', 'COORDS_MAX_WINDS', 'COORDS_MIN_DIST2LAND', 'DISTANCE_TRACK',
                            'DISTANCE_TRACK_VINCENTY', 'MAX_WIND', 'MIN_PRES', 'MIN_DIST2LAND',
                            'MAX_STORMSPEED', 'MAX_USA_SSHS', 'MAX_USA_SSHS_INLAND', 'V_LAND_KN',
                            '34KN_POP', '64KN_POP', '96KN_POP', 'POP_MAX_34_ADJ', 'POP_MAX_50_ADJ',
                            'POP_MAX_64_ADJ', 'POP_MAX_34', 'POP_MAX_50', 'POP_MAX_64',
                            '64KN_ASSETS', '34KN_ASSETS', '96KN_ASSETS', 
                            'POP_DEN_SQ_KM', 'RURAL_POP(%)', 'POP_TOTAL', 'RURAL_POP', 'hdi',
                            'TOTAL_DAMAGE_(000$)', 'TOTAL_DEATHS', 'TOTAL_AFFECTED']]

Now we check for duplicates. When we have lists as observations, python does not detect duplicates with the ``duplicated()`` method. Then we have to create a copy of our data set and see whether the copy withour features involving lists has any duplicates:

In [213]:
copy = merged_trial.copy()

copy.drop(columns = ['COORDS', 'COORDS_MAX_WINDS', 'COORDS_MIN_DIST2LAND'], inplace= True)


In [214]:
#dropping duplicates
merged_trial.drop([986, 988,1014, 1017], axis = 0, inplace= True)

In [215]:
merged_trial.reset_index(drop = True, inplace= True)

In [216]:
merged_trial.shape

(1019, 47)

In [217]:
merged_trial.head(2)

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_START,DATE_END,DATE_LAND_START,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,POP_DEN_SQ_KM,RURAL_POP(%),POP_TOTAL,RURAL_POP,hdi,TOTAL_DAMAGE_(000$),TOTAL_DEATHS,TOTAL_AFFECTED
0,1949163N07145,DELLA,WP,WP,JPN,1949,6,6,1949-06-19 09:00:00,1949-06-24 06:00:00,1949-06-20 15:00:00,1949-06-23 03:00:00,117.0,60.0,TS,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3889.48064,3632.62618,116.0,952,0,33.0,3,0.0,99.82,,,,11983000.0,5803981.0,3320907.0,16431894.0,7958809.0,4553851.0,,,,195.0,77.2,81700000,63072400.0,0.816,,419.0,194046
1,1950241N23140,JANE,WP,WP,JPN,1950,8,9,1950-08-30 03:00:00,1950-09-05 00:00:00,1950-09-03 03:00:00,1950-09-04 00:00:00,141.0,21.0,ET,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",3596.090691,2787.480021,116.0,940,0,31.0,3,1.0,99.82,,,,31608220.0,15591960.0,10189360.0,43343320.0,21380746.0,13972339.0,,,,227.0,47.0,82800000,38916000.0,0.816,,509.0,642117


# 9. Getting a modified Scale

We will use the maximum winds founded for classify the cyclones using the Saffir-Simpson-Scale. As our wind speed dat is in knots, we will use the scale accordingly.

In [218]:
dict_SaffirSimpsonScale = {"Cat 5": range(137, 1000),
                           "Cat 4": range(113, 137),
                           "Cat 3": range(96, 113),
                           "Cat 2": range(83, 96),
                           "Cat 1": range(64, 83),
                           "TS": range(34, 64),
                           "TD": range(0, 34)}

def get_funct(dictionary, value):
    for key in list(dictionary.keys()):
        if round(value) in dictionary[key]:
            return key


In [219]:
merged_trial["GENERAL_CATEGORY"] = merged_trial["MAX_WIND"].apply(lambda x: get_funct(dict_SaffirSimpsonScale, x))

In [220]:
merged_trial = merged_trial[['SID', 'NAME', 'BASIN', 'SUB BASIN', 'ISO', 'YEAR', 'MONTH_START',
                           'MONTH_END', 'DATE_START', 'DATE_END', 'DATE_LAND_START',
                           'DATE_LAND_END', 'TOTAL_HOURS_EVENT', 'TOTAL_HOURS_IN_LAND', 'NATURE',
                            'GENERAL_CATEGORY', 'MAX_WIND', 'MIN_PRES', 'MIN_DIST2LAND',
                           'MAX_STORMSPEED', 'MAX_USA_SSHS', 'MAX_USA_SSHS_INLAND', 'V_LAND_KN',
                           'COORDS', 'COORDS_MAX_WINDS', 'COORDS_MIN_DIST2LAND', 'DISTANCE_TRACK',
                           'DISTANCE_TRACK_VINCENTY' ,
                           '34KN_POP', '64KN_POP', '96KN_POP', 'POP_MAX_34_ADJ', 'POP_MAX_50_ADJ',
                           'POP_MAX_64_ADJ', 'POP_MAX_34', 'POP_MAX_50', 'POP_MAX_64',
                           '64KN_ASSETS', '34KN_ASSETS', '96KN_ASSETS', 'POP_DEN_SQ_KM',
                            'RURAL_POP(%)', 'POP_TOTAL', 'RURAL_POP', 'hdi', 'TOTAL_DAMAGE_(000$)',
                           'TOTAL_DEATHS', 'TOTAL_AFFECTED']].copy()

In [221]:
merged_trial.columns = [col.upper() for col in merged_trial.columns]

In [222]:
merged_trial.head()

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_START,DATE_END,DATE_LAND_START,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,GENERAL_CATEGORY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,POP_DEN_SQ_KM,RURAL_POP(%),POP_TOTAL,RURAL_POP,HDI,TOTAL_DAMAGE_(000$),TOTAL_DEATHS,TOTAL_AFFECTED
0,1949163N07145,DELLA,WP,WP,JPN,1949,6,6,1949-06-19 09:00:00,1949-06-24 06:00:00,1949-06-20 15:00:00,1949-06-23 03:00:00,117.0,60.0,TS,Cat 4,116.0,952,0,33.0,3,0.0,99.82,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3889.48064,3632.62618,,,,11983000.0,5803981.0,3320907.0,16431894.0,7958809.0,4553851.0,,,,195.0,77.2,81700000,63072400.0,0.816,,419.0,194046
1,1950241N23140,JANE,WP,WP,JPN,1950,8,9,1950-08-30 03:00:00,1950-09-05 00:00:00,1950-09-03 03:00:00,1950-09-04 00:00:00,141.0,21.0,ET,Cat 4,116.0,940,0,31.0,3,1.0,99.82,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",3596.090691,2787.480021,,,,31608220.0,15591960.0,10189360.0,43343320.0,21380746.0,13972339.0,,,,227.0,47.0,82800000,38916000.0,0.816,,509.0,642117
2,1951224N12316,CHARLIE,NAm,CS,JAM,1951,8,8,1951-08-17 21:00:00,1951-08-18 12:00:00,1951-08-18 03:00:00,1951-08-18 06:00:00,15.0,3.0,TS,Cat 3,110.0,0,0,17.0,3,3.0,132.4,"[(17.3398, -75.4138), (17.6, -76.2), (17.9, -7...","[(17.6, -76.2), (17.9, -76.9)]","[(17.9, -76.9), (18.1, -77.8)]",438.214691,435.729082,2788659.0,2788659.0,2552903.0,1689243.0,1687083.0,1195052.0,2753687.5,2750167.0,1948090.75,21481980000.0,21481980000.0,19483460000.0,133.0,66.23,1440000,953712.0,0.668,56000.0,154.0,20200
3,1951337N09150,AMY,WP,WP,PHL,1951,12,12,1951-12-07 09:00:00,1951-12-17 00:00:00,1951-12-09 12:00:00,1951-12-11 00:00:00,231.0,36.0,TS,Cat 4,120.0,924,0,11.0,4,2.0,119.2,"[(12.0725, 130.967), (12.1333, 130.517), (12.1...","[(11.7833, 127.9)]","[(10.7333, 124.8), (10.534, 123.174), (10.6833...",2238.058669,1905.587795,20464826.0,12675908.0,5618193.0,4760039.0,3006670.0,1114774.0,14131997.0,8926450.0,3309632.25,113105000000.0,188028000000.0,48973330000.0,64.5,75.35,19200000,14467200.0,0.59,,569.0,60000
4,1952180N05144,EMMA,WP,WP,PHL,1952,7,7,1952-07-01 06:00:00,1952-07-04 18:00:00,1952-07-02 12:00:00,1952-07-02 21:00:00,84.0,9.0,TS,Cat 3,110.0,968,0,15.0,3,1.0,110691.0,"[(9.33776, 130.185), (9.38633, 129.61), (9.45,...","[(9.45, 129.017), (9.50407, 128.407), (9.56613...","[(10.4667, 123.867), (10.6492, 123.239), (10.8...",1876.18679,1797.712213,,,,5130796.0,2789486.0,1988524.0,15232730.0,8281656.5,5903694.0,,,,66.9,72.41,19900000,14409590.0,0.59,,19.0,103


In [223]:
null_cols(merged_trial)

Unnamed: 0,Feature,% Null rows
0,TOTAL_DAMAGE_(000$),31.894014
1,96KN_ASSETS,23.061825
2,34KN_ASSETS,23.061825
3,64KN_ASSETS,23.061825
4,96KN_POP,22.96369
5,34KN_POP,22.96369
6,64KN_POP,22.96369
7,DATE_LAND_START,21.393523
8,DATE_LAND_END,21.393523
9,MAX_USA_SSHS_INLAND,21.393523


We should fill some nulls according with our feature construction:

In [224]:
for col in ['DATE_LAND_START','DATE_LAND_END', 'MAX_USA_SSHS_INLAND']:
    merged_trial[col].fillna("No landing", inplace= True)

In [225]:
merged_trial

Unnamed: 0,SID,NAME,BASIN,SUB BASIN,ISO,YEAR,MONTH_START,MONTH_END,DATE_START,DATE_END,DATE_LAND_START,DATE_LAND_END,TOTAL_HOURS_EVENT,TOTAL_HOURS_IN_LAND,NATURE,GENERAL_CATEGORY,MAX_WIND,MIN_PRES,MIN_DIST2LAND,MAX_STORMSPEED,MAX_USA_SSHS,MAX_USA_SSHS_INLAND,V_LAND_KN,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,DISTANCE_TRACK,DISTANCE_TRACK_VINCENTY,34KN_POP,64KN_POP,96KN_POP,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,POP_MAX_34,POP_MAX_50,POP_MAX_64,64KN_ASSETS,34KN_ASSETS,96KN_ASSETS,POP_DEN_SQ_KM,RURAL_POP(%),POP_TOTAL,RURAL_POP,HDI,TOTAL_DAMAGE_(000$),TOTAL_DEATHS,TOTAL_AFFECTED
0,1949163N07145,DELLA,WP,WP,JPN,1949,6,6,1949-06-19 09:00:00,1949-06-24 06:00:00,1949-06-20 15:00:00,1949-06-23 03:00:00,117.0,60.0,TS,Cat 4,116.0,952,0,33.0,3,0,99.82,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",3889.480640,3632.626180,,,,1.198300e+07,5.803981e+06,3.320907e+06,1.643189e+07,7.958809e+06,4.553851e+06,,,,195.0,77.20,81700000,63072400.0,0.816,,419.0,194046
1,1950241N23140,JANE,WP,WP,JPN,1950,8,9,1950-08-30 03:00:00,1950-09-05 00:00:00,1950-09-03 03:00:00,1950-09-04 00:00:00,141.0,21.0,ET,Cat 4,116.0,940,0,31.0,3,1,99.82,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",3596.090691,2787.480021,,,,3.160822e+07,1.559196e+07,1.018936e+07,4.334332e+07,2.138075e+07,1.397234e+07,,,,227.0,47.00,82800000,38916000.0,0.816,,509.0,642117
2,1951224N12316,CHARLIE,NAm,CS,JAM,1951,8,8,1951-08-17 21:00:00,1951-08-18 12:00:00,1951-08-18 03:00:00,1951-08-18 06:00:00,15.0,3.0,TS,Cat 3,110.0,0,0,17.0,3,3,132.40,"[(17.3398, -75.4138), (17.6, -76.2), (17.9, -7...","[(17.6, -76.2), (17.9, -76.9)]","[(17.9, -76.9), (18.1, -77.8)]",438.214691,435.729082,2788659.0,2788659.0,2552903.0,1.689243e+06,1.687083e+06,1.195052e+06,2.753688e+06,2.750167e+06,1.948091e+06,2.148198e+10,2.148198e+10,1.948346e+10,133.0,66.23,1440000,953712.0,0.668,56000.0,154.0,20200
3,1951337N09150,AMY,WP,WP,PHL,1951,12,12,1951-12-07 09:00:00,1951-12-17 00:00:00,1951-12-09 12:00:00,1951-12-11 00:00:00,231.0,36.0,TS,Cat 4,120.0,924,0,11.0,4,2,119.20,"[(12.0725, 130.967), (12.1333, 130.517), (12.1...","[(11.7833, 127.9)]","[(10.7333, 124.8), (10.534, 123.174), (10.6833...",2238.058669,1905.587795,20464826.0,12675908.0,5618193.0,4.760039e+06,3.006670e+06,1.114774e+06,1.413200e+07,8.926450e+06,3.309632e+06,1.131050e+11,1.880280e+11,4.897333e+10,64.5,75.35,19200000,14467200.0,0.590,,569.0,60000
4,1952180N05144,EMMA,WP,WP,PHL,1952,7,7,1952-07-01 06:00:00,1952-07-04 18:00:00,1952-07-02 12:00:00,1952-07-02 21:00:00,84.0,9.0,TS,Cat 3,110.0,968,0,15.0,3,1,110691.00,"[(9.33776, 130.185), (9.38633, 129.61), (9.45,...","[(9.45, 129.017), (9.50407, 128.407), (9.56613...","[(10.4667, 123.867), (10.6492, 123.239), (10.8...",1876.186790,1797.712213,,,,5.130796e+06,2.789486e+06,1.988524e+06,1.523273e+07,8.281656e+06,5.903694e+06,,,,66.9,72.41,19900000,14409590.0,0.590,,19.0,103
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1014,2019068S17040,IDAI,SI,SI,MOZ,2019,3,3,2019-03-09 06:00:00,2019-03-14 12:00:00,No landing,No landing,126.0,0.0,NR,Cat 3,105.0,944,70,12.0,3,No landing,151.20,"[(-17.0, 40.2), (-17.1654, 40.782), (-17.2, 41...","[(-19.7, 37.9), (-19.75, 37.5758), (-19.8, 37.2)]","[(-17.0, 40.2)]",957.628405,522.204526,,,,2.396895e+06,0.000000e+00,0.000000e+00,2.467099e+06,0.000000e+00,0.000000e+00,,,,39.9,64.10,30400000,19486400.0,0.217,2000000.0,603.0,1501500
1015,2019068S17040,IDAI,SI,SI,MDG,2019,3,3,2019-03-09 21:00:00,2019-03-12 12:00:00,No landing,No landing,63.0,0.0,NR,Cat 3,105.0,956,77,7.0,3,No landing,151.20,"[(-17.0124, 42.2448), (-17.2, 42.4), (-17.3374...","[(-17.6, 42.6), (-17.6499, 42.5424), (-17.7, 4...","[(-17.3, 43.2)]",404.150545,211.930107,,,,2.341183e+05,0.000000e+00,0.000000e+00,2.403834e+08,0.000000e+00,0.000000e+00,,,,46.4,62.10,27000000,16767000.0,0.404,,3.0,1100
1016,2019113S10051,KENNETH,SI,SI,COM,2019,4,4,2019-04-24 03:00:00,2019-04-25 06:00:00,No landing,No landing,27.0,0.0,NR,Cat 4,124.0,937,138,13.0,4,No landing,118.80,"[(-10.7847, 46.1575), (-10.9, 45.7), (-11.0076...","[(-11.6, 41.8)]","[(-11.6, 41.8)]",486.400190,484.370217,,,,,,,,,,,,,457.0,71.04,851000,604550.4,0.402,,8.0,345311
1017,2019113S10051,KENNETH,SI,SI,MOZ,2019,4,4,2019-04-25 09:00:00,2019-04-26 12:00:00,2019-04-25 15:00:00,2019-04-26 12:00:00,27.0,21.0,NR,Cat 4,121.0,939,0,11.0,4,3,119.00,"[(-11.8576, 41.2852), (-12.1, 40.8), (-12.2265...","[(-11.8576, 41.2852)]","[(-12.2265, 40.429), (-12.3, 40.1), (-12.4076,...",336.098173,304.360948,,,,3.758703e+06,7.220010e+05,1.563119e+05,3.868792e+06,7.431479e+08,1.608902e+05,,,,39.9,64.10,30400000,19486400.0,0.217,230000.0,45.0,400094


## 10. Exporting data set

It is up to you whether include or not the World Bank Indicators. I didn't include most of the selected ones due to some inconsistency between them. Also, after running some experimental models with the past data, those indicators seem to be irrelevant. I am trying to avoid using quantities which are conditioned to some rate of change.

In [226]:
merged_trial.set_index("SID").to_csv("Data_output/Ibtracs_completion_v10.csv")