# This notebook adds population as a additional column for all the rows.

In [1]:
import pandas as pd
import numpy as np

## Step 1: Load the file from tornadohistory that has 60,000 rows for US and Puerto Rico since 1950 into a 'tornado dataframe'. 

In [15]:
df_tornadoes = pd.read_csv('1950-2018_all_tornadoes.csv')

In [16]:
df_tornadoes.head()

Unnamed: 0,om,yr,mo,dy,date,time,tz,st,stf,stn,...,len,wid,ns,sn,sg,f1,f2,f3,f4,fc
0,1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,...,9.5,150,2,0,1,0,0,0,0,0
1,1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,...,6.2,150,2,1,2,189,0,0,0,0
2,1,1950,1,3,1950-01-03,11:10:00,3,IL,17,1,...,3.3,100,2,1,2,119,0,0,0,0
3,2,1950,1,3,1950-01-03,11:55:00,3,IL,17,2,...,3.6,130,1,1,1,135,0,0,0,0
4,3,1950,1,3,1950-01-03,16:00:00,3,OH,39,1,...,0.1,10,1,1,1,161,0,0,0,0


In [20]:
df_tornado_mod = df_tornadoes.rename(columns = {
    'om' : 'id_not_to_be_used',
    'yr' : 'year',
    'mo' : 'month',
    'dy' : 'day',
    'tz' : 'timezone',
    'st' : 'state',
    'stf' : 'state_fips_number',
    'stn' : 'state_number',
    'mag' : 'magnitude',
    'inj' : 'injuries',
    'fat' : 'fatalities',
    'closs' : 'crop_loss',
    
})

In [21]:
df_tornado_mod = df_tornado_mod.drop(columns = ['id_not_to_be_used', 'loss', 'crop_loss', 'len', 'wid', 'ns', 'sn', 'sg', 'f1', 'f2', 'f3', 'f4', 'fc'])

### We now have our tornado dataframe ready - df_tornado_mod

In [240]:
df_tornado_mod

Unnamed: 0,year,month,day,date,time,timezone,state,state_fips_number,state_number,magnitude,injuries,fatalities,slat,slon,elat,elon,Calc_Population
0,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,3,3,0,38.7700,-90.2200,38.8300,-90.0300,3954653.0
1,1950,1,3,1950-01-03,11:00:00,3,MO,29,1,3,3,0,38.7700,-90.2200,38.8200,-90.1200,3954653.0
2,1950,1,3,1950-01-03,11:10:00,3,IL,17,1,3,0,0,38.8200,-90.1200,38.8300,-90.0300,8712176.0
3,1950,1,3,1950-01-03,11:55:00,3,IL,17,2,3,3,0,39.1000,-89.3000,39.1200,-89.2300,8712176.0
4,1950,1,3,1950-01-03,16:00:00,3,OH,39,1,1,1,0,40.8800,-84.5800,0.0000,0.0000,7946627.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
64820,2018,12,27,2018-12-27,10:15:00,3,LA,22,0,1,0,0,30.1302,-92.3645,30.1321,-92.3547,5307582.0
64821,2018,12,27,2018-12-27,10:29:00,3,MS,28,0,0,0,0,32.6431,-90.4509,32.6427,-90.4288,4300861.0
64822,2018,12,31,2018-12-31,12:35:00,3,KY,21,0,1,0,0,36.8900,-87.9870,36.8915,-87.9734,5816536.0
64823,2018,12,31,2018-12-31,13:43:00,3,IN,18,0,1,0,0,38.1813,-86.8863,38.2006,-86.8585,7781290.0


## Step 2: Load the state codes and populations from 1950 and 2016 into a 'population dataframe'.

In [144]:
df_pop = pd.read_csv('Population_US1.csv')
df_pop.head()

Unnamed: 0,StateDescription,StateCode,1950,2016
0,Alabama,AL,3061743,4863300
1,Alaska,AK,128643,741894
2,Arizona,AZ,749587,6931071
3,Arkansas,AR,1909511,2988248
4,California,CA,10586223,39250017


In [243]:
# df_pop.loc[df_pop['StateCode'] == 'PR']

## Step 3: Based on previous file, get the annual (compounded) growth rate and add an additional column in the population dataframe.

In [244]:
# This function calculates the final population based on start value, growth rate and years [Compoundng]
def cal_population(start_value, growth_rate, years):
    return round(start_value * (1 + growth_rate)**years, 0)

In [245]:
# This function calculates thge growth rate based on 1950 and 2016 population values from df_pop dataframe. 
def growth_rate2(start_value, end_value, years):
    return round(((end_value / start_value)**(1/years))/100,5)
# ** (1 / (year_diff - 1)) - 1

In [246]:
# This function passes the dd_pop row based on state (acts as the join between 2 dataframes)
def getPopulationRecord(state):
    return df_pop.loc[df_pop['StateCode'] == state]

In [247]:
# Test
getPopulationRecord('AK')

Unnamed: 0,StateDescription,StateCode,1950,2016,ChangeRate
1,Alaska,AK,128643,741894,0.01027


## Step 4: Finally, using this annual growth rate, add an additional column in tornado dataframe that estimates based on the year of the tornado

In [235]:
for index, row in df_tornado_mod.iterrows():
#     print(row.loc['year'])
    
    populationRecord = getPopulationRecord(row.loc['state'])
    df_tornado_mod.at[index,'Calc_Population'] = cal_population(populationRecord['1950'].to_numpy()[0], 
                         populationRecord['ChangeRate'].to_numpy()[0], 
                         row.loc['year'] - 1950)
#     print(index)    
#     print(populationRecord['1950'].to_numpy()[0])
#     print(populationRecord['ChangeRate'].to_numpy()[0])
#     print(row.loc['year'] - 1950)
#     print(cal_population(populationRecord['1950'].to_numpy()[0], 
#                          populationRecord['ChangeRate'].to_numpy()[0], 
#                          row.loc['year'] - 1950))
#     print('-----')


## Save this to a CSV

In [248]:
df_tornado_mod.to_csv('US_Tornadoes_1950_2016_with_ApproxPopulation.csv', header=True, index=None)

# Next phase: Add approximate food / non food info based on data collated so far... 