In [0]:
import pandas as pd
import numpy as np
from collections import Iterable


In [160]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
tcedf = pd.read_csv('/content/drive/My Drive/TCE-DAT_historic-exposure_1950-2015.csv',skiprows = 6)
tcedf = tcedf[tcedf['year'] >= 1980]

In [0]:
emdatdf = pd.read_csv('/content/drive/My Drive/emdat_cleaned.csv')
emdatdf = emdatdf[emdatdf['Year'] >= 1980 ]

In [0]:
### emdat processing

In [0]:
### emdat Try dropping rows without storm names. 
emdat_noname_df = emdatdf.dropna(subset=['Event name'])

In [0]:
### Helper function - Flatten nested list
def flatten(lis):
     for item in lis:
         if isinstance(item, Iterable) and not isinstance(item, str):
             for x in flatten(item):
                 yield x
         else:        
             yield item

In [0]:
emdat_cols_drop = ['Latitude','Longitude','Magnitude (scale)',
                   'EU member','OFDA response', 'Appeal',
                  'Declaration', "Aid contribution ('000$)",
                  'Associated disater', 'Associated disater 2']

emdat_noname_df = emdat_noname_df.drop(columns=emdat_cols_drop)

## Some event names have year numbers. Strip this out.
emdat_noname_df['Event name'] = emdat_noname_df['Event name'].str.replace('\d+', '')

In [8]:
### Remove words like "Hurricane", "Cyclone" e.t.c from names column
import re

to_remove = ['Hurricane', 'Tropical', 'Cyclone','depression', 'storm', 'cylone',
             'Storm', r"\(.*\)", "'", '"', "Typhoon", 'CYCLONE', 'cyclone', 'Tropcal', 
             'TYphoon', 'Topical', 'Depression', 'Typhhon', 'Tyhoon', 'Cclone', 
             'TRopical', 'STROM', 'strom']


for each in to_remove:
  emdat_noname_df['Event name'] = emdat_noname_df['Event name'].str.replace(each, '')


'''
for each in to_remove:
  emdat_noname_df['Event name'] = emdat_noname_df['Event name'].apply(lambda x: x.replace(each, ''))
'''

"\nfor each in to_remove:\n  emdat_noname_df['Event name'] = emdat_noname_df['Event name'].apply(lambda x: x.replace(each, ''))\n"

In [0]:
split_terms = ["&", " AND ", "/", " ET ", ","]

for x in split_terms:
  emdat_noname_df['Event name'] = emdat_noname_df['Event name'].str.replace(x, ' & ')


## Indexes of entries with joined names
indexes = emdat_noname_df[emdat_noname_df['Event name'].str.contains("|".join(split_terms))].index


subset_emdat = emdat_noname_df.copy().loc[indexes]
subset_emdat['Event name'] = subset_emdat['Event name'].str.split('&')


cols = subset_emdat.columns.values
cols = [v for v in cols if v != 'Event name']



temp_emdat = []

for _, row in subset_emdat.iterrows():
  for d in row['Event name']:
    temp_emdat.append(flatten([list(row[cols]), d]))


ext_subset_emdat = pd.DataFrame(temp_emdat, columns=cols + ['Event name'])


In [10]:
emdat_noname_df.loc[~emdat_noname_df.index.isin(indexes)].shape

(1705, 34)

In [172]:
emdat_noname_df.shape

(1757, 34)

In [0]:
emdat_noname_df = pd.concat([emdat_noname_df.loc[~emdat_noname_df.index.isin(indexes)], 
                             ext_subset_emdat], sort=True)

In [0]:
emdat_noname_df['Event name'].unique()

In [0]:
## Remove rows without event name
emdat_noname_df[emdat_noname_df['Event name'] != ""]

## Strip leading spaces .str.strip()
emdat_noname_df['Event name'] = emdat_noname_df['Event name'].apply(lambda x: x.strip())

In [0]:
### It appears some observations have name as simply "Hurricane" 
## and removing these leaves the name empty. This should be dropped.
mask = emdatdf['Event name'].str.contains(r'\b(?:{})\b'.format('|'.join(to_remove)))
mask = mask.replace(np.nan, False)


## Convert names to uppercase
emdat_noname_df['Event name'] = emdat_noname_df['Event name'].apply(lambda x: x.upper())

In [14]:
emdat_noname_df['Event name'].value_counts()

              22
IRMA          18
EMILY         15
IVAN          15
MATTHEW       15
              ..
ROANU          1
OLIVIA         1
KALAFANDJI     1
MADELINE       1
INIKI          1
Name: Event name, Length: 736, dtype: int64

In [181]:
emdat_noname_df.shape

(1819, 34)

In [0]:
#### Ibtracs

In [15]:
ibtracdf = pd.read_csv('/content/drive/My Drive/IBTrACS-ALL-list-v04r00-lines-dbf.csv')
ibtracdf = ibtracdf[ibtracdf['year'] >= 1980]
tup_zipped = list(set(tuple(zip(ibtracdf.SID, ibtracdf.NAME))))

## Create a dictionary with ibtracs SID & storm name
SID_Namedict = {x:y for x, y in tup_zipped }

  interactivity=interactivity, compiler=compiler, result=result)


In [0]:
###Tce dat processing

In [0]:
tcedf_copy = tcedf.copy()

## Create column "name" using ibtracID
tcedf_copy['name'] = tcedf_copy['IBTrACS_ID'].apply(lambda x: SID_Namedict.get(str(x)))

## Drop rows without event name
tcedf_copy = tcedf_copy[tcedf_copy['name'] != "NOT_NAMED"]

# Drop rows with no names on both name & tc_name
tcedf_copy = tcedf_copy[(tcedf_copy['TC_name'] != "UNNAMED") & (tcedf_copy['name'].notna())]

In [17]:
tcedf_copy.head()

Unnamed: 0,year,IBTrACS_ID,TC_name,NatCatSERVICE_ID,genesis_basin,countries_affected,ISO3,v_land_SI,v_land_kn,34kn_pop,34kn_assets,64kn_pop,64kn_assets,96kn_pop,96kn_assets,name
2217,1980,1980001S13173,PENI,,SP,single,FJI,43.5,84.5,451029.0,1777832000.0,9653.0,38805210.0,0.0,0.0,PENI
2218,1980,1980003S15137,PAUL,MR198001B002,SP,single,AUS,51.8,100.7,292780.0,17169660000.0,164543.0,7926763000.0,0.0,0.0,PAUL
2220,1980,1980068S13068,LAURE,MR198003B021,SI,single,MUS,49.3,95.9,936704.0,11302200000.0,460352.0,5297437000.0,0.0,0.0,LAURE
2221,1980,1980069S12161,SINA,,SP,multi,NCL,21.7,42.2,59080.0,1432924000.0,0.0,0.0,0.0,0.0,SINA
2222,1980,1980069S12161,SINA,,SP,multi,NZL,44.4,86.3,847876.0,62056580000.0,13217.0,2215335000.0,0.0,0.0,SINA


In [18]:
## SINA appears twice. 1990 & 1980 We would
# need to merge on both name and year. 
tcedf_copy[tcedf_copy['TC_name'] == "SINA"]

Unnamed: 0,year,IBTrACS_ID,TC_name,NatCatSERVICE_ID,genesis_basin,countries_affected,ISO3,v_land_SI,v_land_kn,34kn_pop,34kn_assets,64kn_pop,64kn_assets,96kn_pop,96kn_assets,name
2221,1980,1980069S12161,SINA,,SP,multi,NCL,21.7,42.2,59080.0,1432924000.0,0.0,0.0,0.0,0.0,SINA
2222,1980,1980069S12161,SINA,,SP,multi,NZL,44.4,86.3,847876.0,62056580000.0,13217.0,2215335000.0,0.0,0.0,SINA
3127,1990,1990327S07175,SINA,MR199011B034,SP,single,FJI,70.8,137.6,597688.0,2549236000.0,512366.0,2171554000.0,225854.0,920679559.0,SINA


In [19]:
## TC_NAME
tcedf_copy["TC_name"][tcedf_copy["TC_name"] == 'BELLY'] = "BETTY"
tcedf_copy["TC_name"][tcedf_copy["TC_name"] == 'BRENDA'] = "BRENDAN"




id_split = ['1980073S09133', '1987035S12160', '1988285N09318', '1989209N22130', 
            '1996282N11162', '1998036S13135', '2013130N04093']


for eachsplit in id_split:
  tcedf_copy['TC_name'][tcedf_copy['IBTrACS_ID'] == eachsplit] = tcedf_copy['name'].str.split(":").str[0]
  tcedf_copy['name'][tcedf_copy['IBTrACS_ID'] == eachsplit] = tcedf_copy['name'].str.split(":").str[1]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]


In [0]:
## Columns to keep
tce_dat_columns = ['year', 'IBTrACS_ID', 'name', 'TC_name', 'countries_affected', 'ISO3', 'v_land_SI',
                   'v_land_kn', '34kn_pop', '34kn_assets', '64kn_pop', '64kn_assets', 
                   '96kn_pop', '96kn_assets']

tcedf_copy = tcedf_copy[tce_dat_columns]                  

In [21]:
## Entries where names don't match. This is 
# either due to typos or multiple naming

tcedf_copy[["IBTrACS_ID", "TC_name", "name"]][tcedf_copy['TC_name'] != tcedf_copy['name']]

Unnamed: 0,IBTrACS_ID,TC_name,name
2223,1980073S09133,DORIS-GLORIA,GLORIA
2778,1987035S12160,UMA,VELI
2789,1987140S08170,BLANCH,BLANCH(E)
2901,1988285N09318,JOAN,MIRIAM
2902,1988285N09318,JOAN,MIRIAM
2903,1988285N09318,JOAN,MIRIAM
2904,1988285N09318,JOAN,MIRIAM
2905,1988285N09318,JOAN,MIRIAM
2906,1988285N09318,JOAN,MIRIAM
2907,1988285N09318,JOAN,MIRIAM


In [0]:
### Compare emdat & Tce dat

In [23]:
emdat_list = list(emdat_noname_df['Event name'])

tce_lists = [list(tcedf_copy['TC_name']), list(tcedf_copy['name'])]

for l in tce_lists:
  results = {}
  for i in emdat_list:
    results[i] = l.count(i) 

  count = 0
  for each in results:
    if results[each] != 0:
      count += 1

  print(count) ##478 & 476

495
493


In [0]:
emdatzip = { x:y for x, y in zip(emdat_noname_df['Event name'], emdat_noname_df['Year'])}
tcezip = { x:y for x, y in zip(tcedf_copy['TC_name'], tcedf_copy['year'])}

In [0]:
#### Rolf Merge Code

merged_df = emdat_noname_df.copy()

# merged_df = merged_df.assign()
columns_to_add = tcedf_copy.columns[3:]
for col in columns_to_add:
    merged_df[col] = None


for i, col in emdat_noname_df[emdat_noname_df['Year'] >= 1960].iterrows():
    year = col['Year']
    country_code = col['ISO']
    cyclone_name = col['Event name']
    tce = tcedf_copy.loc[tcedf_copy['ISO3'] == country_code]
    print(cyclone_name)
    if cyclone_name is None or cyclone_name != cyclone_name:
        print('no data')
        continue
    cyclone_name= cyclone_name.upper()
    if len(tce) < 1:
        print('no data')
        continue

    tce_by_year = tce.loc[tce['year'] == year]
    if len(tce_by_year) < 1:
        print('no data')
        continue

    print(i)
    tce_by_cyclone = tce.loc[tce['TC_name']== cyclone_name]
    if len(tce_by_cyclone) < 1:
        print('no data')
        continue

    new_row = col.append(tce_by_cyclone[columns_to_add].iloc[0])

    merged_df.loc[i] = new_row

merged_df.to_csv('merged.csv')

In [35]:
merged_df.shape

(1757, 45)

In [34]:
merged_df[(merged_df['Total affected'].isna() & merged_df['34kn_pop'].isna())].shape

(278, 45)

In [29]:
tcedf_copy.shape, emdat_noname_df.shape

((2501, 14), (1819, 34))

In [28]:
###### New Merge Attempt
tcedf_copy.head(2)

Unnamed: 0,year,IBTrACS_ID,name,TC_name,countries_affected,ISO3,v_land_SI,v_land_kn,34kn_pop,34kn_assets,64kn_pop,64kn_assets,96kn_pop,96kn_assets
2217,1980,1980001S13173,PENI,PENI,single,FJI,43.5,84.5,451029.0,1777832000.0,9653.0,38805210.0,0.0,0.0
2218,1980,1980003S15137,PAUL,PAUL,single,AUS,51.8,100.7,292780.0,17169660000.0,164543.0,7926763000.0,0.0,0.0


In [27]:
emdat_noname_df.head(2)

Unnamed: 0.1,CPI,Continent,Country name,End day,End month,End year,Entry criteria,Event name,Group,ISO,Insured losses ('000$),Local time,Location,Magnitude (value),Num affected,Num homeless,Num injured,Origin,Reconstruction cost ('000$),Region,River basin,Seq,Start day,Start month,Start year,Subgroup,Subsubtype,Subtype,Total affected,Total damage ('000$),Total deaths,Type,Unnamed: 0,Year
447,32.233893,Americas,Haiti,5.0,8.0,1980,Kill,ALLEN,Natural,HTI,,,"South-West, Port-Au-Prince",270.0,330000.0,835000.0,,,,Caribbean,,78,5.0,8.0,1980,Meteorological,,Tropical cyclone,1165000.0,400000.0,220.0,Storm,2198,1980
451,32.233893,Oceania,Australia,,1.0,1980,Govern,AMY,Natural,AUS,,,New South Wales Coast,,,,,,,Australia and New Zealand,,286,,1.0,1980,Meteorological,,Tropical cyclone,,6966.0,,Storm,2213,1980


In [0]:
merged_one = pd.merge(tcedf_copy, emdat_noname_df, left_on=['TC_name', 'year'], right_on=['Event name', 'Year'])
merged_two = pd.merge(tcedf_copy, emdat_noname_df, left_on=['name', 'year'], right_on=['Event name', 'Year'])
merged = pd.concat([merged_one, merged_two], sort=True).drop_duplicates()

In [44]:
merged[merged['Total affected'].isna()].shape

(1060, 48)

In [46]:
merged[merged['64kn_pop'].isna()].shape

(0, 48)

In [42]:
merged[(merged['Total affected'].isna() & merged['34kn_pop'].isna())].shape

(0, 48)