In [69]:
import pandas as pd
import numpy as np
import pickle
import math
from math import cos, sin, pi
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from rfpimp import permutation_importances

In [70]:
pip install rfpimp

Note: you may need to restart the kernel to use updated packages.


In [71]:
def dist_(x1,y1,x2,y2):
  try:
    x = math.sqrt((x2 - x1)**2 + (y2 - y1)**2)
  except:
    x = 0
  return x

### Exploring IBTrACS data - what cyclones have radii


In [72]:
df2 = pd.read_csv('IBTrACS_original.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [73]:
df2.head()

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,REU_GUSTP,USA_SEAHGT,USA_SEARAD,STORM_SPD,STORM_DR,year,month,day,hour,min
0,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 03:00:00,NR,10.9,80.3,...,,,,9,266,1842,10,25,3,0
1,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 06:00:00,NR,10.87,79.83,...,,,,9,267,1842,10,25,6,0
2,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 09:00:00,NR,10.84,79.35,...,,,,9,267,1842,10,25,9,0
3,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 12:00:00,NR,10.82,78.88,...,,,,9,267,1842,10,25,12,0
4,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 15:00:00,NR,10.8,78.4,...,,,,9,268,1842,10,25,15,0


In [74]:
df2['TOK_R50_L']

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
          ..
669865   NaN
669866   NaN
669867   NaN
669868   NaN
669869   NaN
Name: TOK_R50_L, Length: 669870, dtype: float64

In [75]:
df2['LON_1'] = df2.groupby(['SID'])['LON'].shift(-1)
df2['LAT_1'] = df2.groupby(['SID'])['LAT'].shift(-1)

In [76]:
def fast_wind(row):
  x1 = row['LAT']
  x2 = row['LAT_1']
  y1 = row['LON']
  y2 = row['LON_1']

  return dist_(x1,y1,x2,y2)

df2['dist'] = df2.apply(fast_wind, axis=1)


In [77]:
df2['ISO_TIME'] = pd.to_datetime(df2['ISO_TIME'])
df2["rank"] = df2.groupby("SID")["ISO_TIME"].rank("dense", ascending=False)
df2["rank_1"] = df2.groupby("SID")["ISO_TIME"].rank("dense", ascending=True)

In [78]:
#df2['NEW_D_34'] = [df2['BOM_R34_NE']==df2['BOM_R34_NE']].NAME.unique()

In [79]:
for j in [34,50,64]:
  df2['max_'+str(j)] = df2[['USA_R'+str(j)+'_NE', 'USA_R'+str(j)+'_SE', 'USA_R'+str(j)+'_SW', 'USA_R'+str(j)+'_NW',
                            'REU_R'+str(j)+'_NE', 'REU_R'+str(j)+'_SE', 'REU_R'+str(j)+'_SW', 'REU_R'+str(j)+'_NW',
                            'BOM_R'+str(j)+'_NE', 'BOM_R'+str(j)+'_SE', 'BOM_R'+str(j)+'_SW', 'BOM_R'+str(j)+'_NW']].max(axis=1)  #USA_R34_NE

In [80]:
df2['max_50'] = df2[['max_50', 'TOK_R50_L']].max(axis=1)  #

In [81]:
df2['RMW'] = df2[['USA_RMW', 'REU_RMW', 'BOM_RMW']].max(axis=1)  #

In [82]:
df4 = df2[df2['max_34']==df2['max_34']][['NAME','SEASON']]

In [83]:
df2

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,min,LON_1,LAT_1,dist,rank,rank_1,max_34,max_50,max_64,RMW
0,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 03:00:00,NR,10.90,80.30,...,0,79.83,10.87,0.470956,64.0,1.0,,,,
1,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 06:00:00,NR,10.87,79.83,...,0,79.35,10.84,0.480937,63.0,2.0,,,,
2,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 09:00:00,NR,10.84,79.35,...,0,78.88,10.82,0.470425,62.0,3.0,,,,
3,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 12:00:00,NR,10.82,78.88,...,0,78.40,10.80,0.480416,61.0,4.0,,,,
4,1842298N11080,1842,1,NI,BB,NOT_NAMED,1842-10-25 15:00:00,NR,10.80,78.40,...,0,77.92,10.79,0.480104,60.0,5.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669865,2019141N29291,2019,24,,,ANDREA,2019-05-20 21:00:00,NR,28.81,-68.74,...,0,-68.80,29.06,0.257099,5.0,2.0,69.0,,,66.0
669866,2019141N29291,2019,24,,,ANDREA,2019-05-21 00:00:00,NR,29.06,-68.80,...,0,-68.89,29.40,0.351710,4.0,3.0,69.0,,,63.0
669867,2019141N29291,2019,24,,,ANDREA,2019-05-21 03:00:00,NR,29.40,-68.89,...,0,-69.00,29.80,0.414849,3.0,4.0,69.0,,,60.0
669868,2019141N29291,2019,24,,,ANDREA,2019-05-21 06:00:00,NR,29.80,-69.00,...,0,-69.10,30.24,0.451221,2.0,5.0,69.0,,,58.0


In [84]:
df4 = df4.drop_duplicates()

In [85]:
len(df4)

1418

In [86]:
df4['in_tracks'] = True

### Estimating Radiouses of 34 50 60 kn winds 

In [87]:
df2['WMO_PRES_delta'] = df2.groupby(['SID'])['WMO_PRES'].diff(periods=-1)
df2.groupby('SID')['WMO_PRES_delta'].fillna(df2.median().iloc[0])

0         1969.0
1         1969.0
2         1969.0
3         1969.0
4         1969.0
           ...  
669865    1969.0
669866    1969.0
669867    1969.0
669868    1969.0
669869    1969.0
Name: WMO_PRES_delta, Length: 669870, dtype: float64

In [88]:
df2.groupby('SID')['dist'].fillna(df2.median().iloc[0])

0            0.470956
1            0.480937
2            0.470425
3            0.480416
4            0.480104
             ...     
669865       0.257099
669866       0.351710
669867       0.414849
669868       0.451221
669869    1969.000000
Name: dist, Length: 669870, dtype: float64

In [89]:
df2['dist'].fillna(df2.median().iloc[0])

0            0.470956
1            0.480937
2            0.470425
3            0.480416
4            0.480104
             ...     
669865       0.257099
669866       0.351710
669867       0.414849
669868       0.451221
669869    1969.000000
Name: dist, Length: 669870, dtype: float64

In [90]:
j#more 'BASIN','SUBBASIN' 'TRACK_TYPE' 'DIST2LAND','LANDFALL', 'STORM_DR'
df2[df2['max_50']==df2['max_50']]

Unnamed: 0,SID,SEASON,NUMBER,BASIN,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,...,LON_1,LAT_1,dist,rank,rank_1,max_34,max_50,max_64,RMW,WMO_PRES_delta
395872,1977161N06134,1977,44,WP,MM,RUTH,1977-06-15 06:00:00,TS,19.39,116.89,...,117.06,19.81,0.453100,50.0,43.0,,20.0,,,
395873,1977161N06134,1977,44,WP,MM,RUTH,1977-06-15 09:00:00,TS,19.81,117.06,...,117.23,20.25,0.471699,49.0,44.0,,22.0,,,
395874,1977161N06134,1977,44,WP,MM,RUTH,1977-06-15 12:00:00,TS,20.25,117.23,...,117.38,20.70,0.474342,48.0,45.0,,25.0,,,
395875,1977161N06134,1977,44,WP,MM,RUTH,1977-06-15 15:00:00,TS,20.70,117.38,...,117.52,21.17,0.490408,47.0,46.0,,25.0,,,
395876,1977161N06134,1977,44,WP,MM,RUTH,1977-06-15 18:00:00,TS,21.17,117.52,...,117.64,21.63,0.475395,46.0,47.0,,25.0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
667974,2018307N14251,2018,98,EP,MM,XAVIER,2018-11-05 00:00:00,TS,18.27,-105.57,...,-105.68,18.32,0.120830,24.0,19.0,138.0,20.0,,20.0,
667975,2018307N14251,2018,98,EP,MM,XAVIER,2018-11-05 03:00:00,TS,18.32,-105.68,...,-105.80,18.33,0.120416,23.0,20.0,126.0,21.0,,20.0,
667976,2018307N14251,2018,98,EP,MM,XAVIER,2018-11-05 06:00:00,TS,18.33,-105.80,...,-105.88,18.36,0.085440,22.0,21.0,115.0,23.0,,20.0,
667977,2018307N14251,2018,98,EP,MM,XAVIER,2018-11-05 09:00:00,TS,18.36,-105.88,...,-106.00,18.43,0.138924,21.0,22.0,115.0,23.0,,20.0,


In [91]:
df2['BASIN'] = df2['BASIN'].fillna('NAM')

In [92]:
ib = df2[df2['max_34']==df2['max_34']][['NAME','SEASON', 'SID', 'WMO_PRES_delta', 'NUMBER',
                                      'BASIN','SUBBASIN','NAME','ISO_TIME','NATURE','LAT','LON','WMO_WIND','WMO_PRES',
                                        'TRACK_TYPE','DIST2LAND','LANDFALL',
                                       'STORM_SPD', 'dist',	'STORM_DR',	'year',	'month',
                                      'day', 'hour', 'min', 'max_34', 'max_50', 'max_64', 'rank', 'rank_1'
                                                                                                   
                                                                                                   
                                                                                                   ]]

In [93]:
ib

Unnamed: 0,NAME,SEASON,SID,WMO_PRES_delta,NUMBER,BASIN,SUBBASIN,NAME.1,ISO_TIME,NATURE,...,year,month,day,hour,min,max_34,max_50,max_64,rank,rank_1
472432,DELILAH,1989,1988364S17148,,113,SP,EA,DELILAH,1988-12-31 00:00:00,TS,...,1988,12,31,0,0,79.0,,,64.0,19.0
474660,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 06:00:00,TS,...,1989,5,10,6,0,79.0,,,22.0,35.0
474661,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 09:00:00,TS,...,1989,5,10,9,0,94.0,,,21.0,36.0
474662,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 12:00:00,TS,...,1989,5,10,12,0,110.0,,,20.0,37.0
474663,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 15:00:00,TS,...,1989,5,10,15,0,105.0,,,19.0,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669864,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-20 18:00:00,NR,...,2019,5,20,18,0,69.0,,,6.0,1.0
669865,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-20 21:00:00,NR,...,2019,5,20,21,0,69.0,,,5.0,2.0
669866,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-21 00:00:00,NR,...,2019,5,21,0,0,69.0,,,4.0,3.0
669867,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-21 03:00:00,NR,...,2019,5,21,3,0,69.0,,,3.0,4.0


In [94]:
#ib = pd.get_dummies(ib, columns=['BASIN','SUBBASIN', 'WMO_AGENCY','TRACK_TYPE',  'NATURE'])

In [95]:
ib

Unnamed: 0,NAME,SEASON,SID,WMO_PRES_delta,NUMBER,BASIN,SUBBASIN,NAME.1,ISO_TIME,NATURE,...,year,month,day,hour,min,max_34,max_50,max_64,rank,rank_1
472432,DELILAH,1989,1988364S17148,,113,SP,EA,DELILAH,1988-12-31 00:00:00,TS,...,1988,12,31,0,0,79.0,,,64.0,19.0
474660,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 06:00:00,TS,...,1989,5,10,6,0,79.0,,,22.0,35.0
474661,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 09:00:00,TS,...,1989,5,10,9,0,94.0,,,21.0,36.0
474662,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 12:00:00,TS,...,1989,5,10,12,0,110.0,,,20.0,37.0
474663,ERNIE,1989,1989126S13178,,37,SP,EA,ERNIE,1989-05-10 15:00:00,TS,...,1989,5,10,15,0,105.0,,,19.0,38.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
669864,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-20 18:00:00,NR,...,2019,5,20,18,0,69.0,,,6.0,1.0
669865,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-20 21:00:00,NR,...,2019,5,20,21,0,69.0,,,5.0,2.0
669866,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-21 00:00:00,NR,...,2019,5,21,0,0,69.0,,,4.0,3.0
669867,ANDREA,2019,2019141N29291,,24,NAM,,ANDREA,2019-05-21 03:00:00,NR,...,2019,5,21,3,0,69.0,,,3.0,4.0


In [96]:
# Code modified by Raghuram on 6th April to fill the missing values with median
cols = ['WMO_WIND','WMO_PRES']
ib[cols] = ib.groupby('SID')[cols].ffill().fillna(ib[cols].median()).astype(int)

In [97]:
#ib['RMW_1'] = ib.groupby(['SID'])['RMW'].shift(-1)

In [98]:
# Code modified by Raghuram on 6th April to fill the missing values with median
cols_winds = ['max_34','max_50','max_64', 'dist']     
ib[cols] = ib.groupby('SID')[cols].ffill().fillna(ib[cols].median()).astype(int)

DANAS experiment 

In [99]:
danas = df2[df2['max_34']==df2['max_34']][743:804]

In [100]:
danas= pd.get_dummies(danas, columns=['BASIN'])

In [101]:
danas= pd.get_dummies(danas, columns=['month'])

In [102]:
danas.head()

Unnamed: 0,SID,SEASON,NUMBER,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,WMO_WIND,...,dist,rank,rank_1,max_34,max_50,max_64,RMW,WMO_PRES_delta,BASIN_WP,month_9
564606,2001246N19156,2001,55,MM,DANAS,2001-09-04 06:00:00,TS,18.62,152.12,45.0,...,0.250799,68.0,11.0,90.0,,,40.0,,1,1
564607,2001246N19156,2001,55,MM,DANAS,2001-09-04 09:00:00,TS,18.72,151.89,,...,0.230217,67.0,12.0,105.0,,,40.0,,1,1
564608,2001246N19156,2001,55,MM,DANAS,2001-09-04 12:00:00,TS,18.85,151.7,55.0,...,0.25807,66.0,13.0,120.0,40.0,,40.0,,1,1
564609,2001246N19156,2001,55,MM,DANAS,2001-09-04 15:00:00,TS,19.0,151.49,,...,0.298329,65.0,14.0,112.0,45.0,,38.0,,1,1
564610,2001246N19156,2001,55,MM,DANAS,2001-09-04 18:00:00,TS,19.23,151.3,60.0,...,0.362491,64.0,15.0,120.0,50.0,,36.0,,1,1


In [103]:
danas = ib[ib['max_34']==ib['max_34']][['LAT','LON','WMO_WIND','WMO_PRES','DIST2LAND','LANDFALL','STORM_SPD',	
                'STORM_DR', 'dist',	
                'month_1',	'month_2',	'month_3',	'month_4',	'month_5',	'month_6',
                'month_7',	'month_8',	'month_9',	'month_10',	'month_11',	'month_12',
                'rank', 'rank_1',
                'BASIN_EP','BASIN_NI','BASIN_SI','BASIN_SP','BASIN_WP']][743:804]

KeyError: "['month_6', 'month_5', 'month_7', 'month_8', 'month_12', 'month_1', 'month_3', 'month_9', 'month_4', 'month_2', 'month_11', 'BASIN_WP', 'BASIN_EP', 'BASIN_SI', 'BASIN_NI', 'month_10', 'BASIN_SP'] not in index"

In [None]:
danas_x = ib[ib['max_34']==ib['max_34']][['LAT','LON','WMO_WIND','WMO_PRES','DIST2LAND','LANDFALL','STORM_SPD',	
                'STORM_DR', 'dist',	
                'month_1',	'month_2',	'month_3',	'month_4',	'month_5',	'month_6',
                'month_7',	'month_8',	'month_9',	'month_10',	'month_11',	'month_12',
                'rank', 'rank_1',
                'BASIN_EP','BASIN_NI','BASIN_SI','BASIN_SP','BASIN_WP']][743:804]

In [None]:
y_danas = df2[df2['max_34']==df2['max_34']][['max_64']][743:804]

In [None]:
df2['BASIN'].unique()

In [None]:
# load the model from disk
rf_random_64 = pickle.load(open('rf_random_64.sav', 'rb'))
y_pred = rf_random_64.predict(danas_x)

In [None]:
y_pred

In [None]:
y_danas 

In [None]:
ib.info()

In [104]:
# Code modified by Raghuram on 6th April to fill the missing values with median
cols = ['WMO_PRES_delta']
ib[cols] = ib.groupby('SID')[cols].ffill().fillna(ib[cols].median()).astype(int)

In [105]:
ib = pd.get_dummies(ib, columns=['BASIN'])

In [106]:

ib['dist'] = ib['dist'].fillna(method = 'ffill')

In [107]:
ib = pd.get_dummies(ib, columns=['month']) 

In [108]:
import numpy as np
from sklearn.model_selection import train_test_split
column_names = ['LAT','LON','WMO_WIND','WMO_PRES','DIST2LAND','LANDFALL','STORM_SPD',	
                'STORM_DR', 'dist',	
                'month_1',	'month_2',	'month_3',	'month_4',	'month_5',	'month_6',
                'month_7',	'month_8',	'month_9',	'month_10',	'month_11',	'month_12',
                'rank', 'rank_1',
                'BASIN_EP','BASIN_NI','BASIN_WP', 
               'BASIN_SI', 'BASIN_NAM',   'BASIN_SP', 'BASIN_SA'
               
               ]
X = ib[column_names]
y = ib['max_34']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)



In [109]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
#model.fit(X, y)
# Code modified by Raghuram on 6th April to avoid data leakage
model.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [110]:
y_pred = model.predict(X_test)

In [111]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 50.91719456994159
Mean Squared Error: 46195.53320144454
Root Mean Squared Error: 214.9314616370636


In [112]:
df66 = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df68 = df66.head(25)

In [113]:
df66

Unnamed: 0,Actual,Predicted
580850,80.0,176.863142
576931,140.0,159.522478
575177,95.0,102.595795
647936,115.0,168.586491
616719,135.0,118.990054
...,...,...
656264,110.0,88.122782
606775,30.0,92.151783
593950,126.0,147.967705
576179,100.0,117.866004


### Model - Random Forest Regressor

In [114]:
# Code for 'Model - Random Forest Regressor' added by Raghuram on 6th April 2020
from sklearn.ensemble import RandomForestRegressor

model2 = RandomForestRegressor(n_estimators = 60, random_state = 0)
model2.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=60, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

In [115]:
y_pred = model2.predict(X_test)

In [116]:
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred)))

Mean Absolute Error: 19.351481402913304
Mean Squared Error: 9315.366780305709
Root Mean Squared Error: 96.51614776971628


**Observation**: With this RandomForestRegressor model, MAE reduced from 51 to 20.9 and RMSE reduced from 215 to 104 when compared with Linear Regression model.

In [117]:
df_rf = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df_rf.head(125)

Unnamed: 0,Actual,Predicted
580850,80.0,95.450000
576931,140.0,166.200000
575177,95.0,94.616667
647936,115.0,114.700000
616719,135.0,137.200000
...,...,...
570444,130.0,143.383333
651220,120.0,140.133333
577275,120.0,120.816667
641667,80.0,80.700000


In [118]:
from sklearn.model_selection import RandomizedSearchCV
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 300, num = 50)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 22)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [50, 55, 60, 65, 70, 75, 80, 85, 90, 95, 101, 106, 111, 116, 121, 126, 131, 136, 141, 146, 152, 157, 162, 167, 172, 177, 182, 187, 192, 197, 203, 208, 213, 218, 223, 228, 233, 238, 243, 248, 254, 259, 264, 269, 274, 279, 284, 289, 294, 300], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 14, 19, 24, 29, 33, 38, 43, 48, 52, 57, 62, 67, 71, 76, 81, 86, 90, 95, 100, 105, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [119]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random_2 = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random_2.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  7.3min
[Parallel(n_jobs=-1)]: Done 333 tasks      | elapsed: 16.9min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 30.7min finished


RandomizedSearchCV(cv=3, error_score=nan,
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   ccp_alpha=0.0,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   max_samples=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators=100,
                              

In [120]:
rf_random_2.best_params_

{'n_estimators': 289,
 'min_samples_split': 2,
 'min_samples_leaf': 1,
 'max_features': 'sqrt',
 'max_depth': 24,
 'bootstrap': False}

NameError: name 'rf_random' is not defined

In [122]:
y_pred_2 = rf_random_2.predict(X_test)

In [123]:
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred_2))  
print('Mean Squared Error:', mean_squared_error(y_test, y_pred_2))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y_test, y_pred_2)))

Mean Absolute Error: 17.33592971541802
Mean Squared Error: 7757.809454536857
Root Mean Squared Error: 88.07842786140576


##Populating with predictions 

In [124]:
filename = 'rf_random_34.sav'
pickle.dump(rf_random_2, open(filename, 'wb'))
 



In [125]:
# load the model from disk
rf_random_34 = pickle.load(open('rf_random_34.sav', 'rb'))
result = rf_random_34.score(X_test, y_test)
print(result)

0.8393681703567177


In [None]:
pip install rfpimp

In [None]:
perm_imp_rfpimp

In [None]:
rf_random_34

Unwraping no names

In [None]:
df[df['in_ibtrax'] != True][df['in_pre'] == True]

In [126]:
pre_df  = pd.read_csv('OUTPUT_WBI_cyclones.csv', sep = ';', low_memory=False)

In [127]:
pre_df_unique  = pre_df[['NAME',	'Year']]

In [128]:
pre_df_unique = pre_df_unique.drop_duplicates()

In [129]:
pre_df['in_pre'] = True

In [130]:
df2['in_ibtrax'] = True

In [131]:
df = pd.merge(df2, pre_df, how='outer', left_on = ['NAME', 'SEASON'], right_on=['NAME','Year'], suffixes=('', '_pre'))

In [132]:
df = pd.get_dummies(df, columns=['BASIN','month'])

In [133]:
#y = df['max_50'][df['max_50']!=df['max_50']]

In [134]:
df['WMO_WIND'] =df.groupby('SID')['WMO_WIND'].fillna(df['WMO_WIND'].median())
df['WMO_WIND'] = df['WMO_WIND'].fillna(df['WMO_WIND'].median())
df['WMO_PRES']=df.groupby('SID')['WMO_PRES'].fillna(df['WMO_PRES'].median())
df['WMO_PRES'] = df['WMO_PRES'].fillna(df['WMO_PRES'].median())
df['STORM_SPD']=df.groupby('SID')['STORM_SPD'].fillna(df['STORM_SPD'].median())
df['STORM_SPD'] = df['STORM_SPD'].fillna(df['STORM_SPD'].median())
df['STORM_SPD'] = df['STORM_SPD'].fillna(df['STORM_SPD'].median())
df['STORM_DR']=df.groupby('SID')['STORM_DR'].fillna(df['STORM_DR'].median())
df['STORM_DR']=df.groupby('SID')['STORM_DR'].fillna(df['STORM_DR'].median())

In [135]:
df['dist'] = df['dist'].fillna(method='ffill')

##TESTING ACCURACY ON NEW DATA

In [136]:

import numpy as np
from sklearn.model_selection import train_test_split
column_names = ['LAT','LON','WMO_WIND','WMO_PRES','DIST2LAND','LANDFALL','STORM_SPD',	
                'STORM_DR', 'dist',	
                'month_1.0',	'month_2.0',	'month_3.0',	'month_4.0',	'month_5.0',	'month_6.0',
                'month_7.0',	'month_8.0',	'month_9.0',	'month_10.0',	'month_11.0',	'month_12.0',
                'rank', 'rank_1',
                'BASIN_EP','BASIN_NI','BASIN_WP', 
               'BASIN_SI', 'BASIN_NAM',   'BASIN_SP', 'BASIN_SA']
X = df[column_names][df['max_34']==df['max_34']]
y = df['max_34'][df['max_34']==df['max_34']]

In [137]:
df['dist'] = df['dist'].fillna(method='ffill')

In [138]:
X

Unnamed: 0,LAT,LON,WMO_WIND,WMO_PRES,DIST2LAND,LANDFALL,STORM_SPD,STORM_DR,dist,month_1.0,...,month_12.0,rank,rank_1,BASIN_EP,BASIN_NI,BASIN_WP,BASIN_SI,BASIN_NAM,BASIN_SP,BASIN_SA
476375,-17.92,150.02,35.0,997.0,295.0,295.0,8.0,82.0,0.471699,0,...,1,64.0,19.0,0,0,0,0,0,1,0
479476,-12.85,153.81,35.0,1001.0,424.0,367.0,13.0,272.0,0.642806,0,...,0,22.0,35.0,0,0,0,0,0,1,0
479477,-12.79,153.17,45.0,994.0,367.0,313.0,12.0,277.0,0.577062,0,...,0,21.0,36.0,0,0,0,0,0,1,0
479478,-12.70,152.60,40.0,1000.0,313.0,271.0,11.0,280.0,0.559017,0,...,0,20.0,37.0,0,0,0,0,0,1,0
479479,-12.60,152.05,45.0,994.0,265.0,234.0,10.0,281.0,0.498197,0,...,0,19.0,38.0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693153,28.60,-68.70,45.0,994.0,933.0,933.0,4.0,348.0,0.213776,0,...,0,6.0,1.0,0,0,0,0,1,0,0
693154,28.81,-68.74,45.0,994.0,952.0,952.0,5.0,349.0,0.257099,0,...,0,5.0,2.0,0,0,0,0,1,0,0
693155,29.06,-68.80,45.0,994.0,956.0,926.0,6.0,347.0,0.351710,0,...,0,4.0,3.0,0,0,0,0,1,0,0
693156,29.40,-68.89,45.0,994.0,926.0,889.0,8.0,346.0,0.414849,0,...,0,3.0,4.0,0,0,0,0,1,0,0


In [139]:
y_pred_34 = rf_random_34.predict(X)

In [140]:
y

476375     79.0
479476     79.0
479477     94.0
479478    110.0
479479    105.0
          ...  
693153     69.0
693154     69.0
693155     69.0
693156     69.0
693157     69.0
Name: max_34, Length: 67618, dtype: float64

In [141]:
df_rf_34 = pd.DataFrame({'Actual': y, 'Predicted': y_pred_34})

In [142]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 67618 entries, 476375 to 693157
Data columns (total 30 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   LAT         67618 non-null  float64
 1   LON         67618 non-null  float64
 2   WMO_WIND    67618 non-null  float64
 3   WMO_PRES    67618 non-null  float64
 4   DIST2LAND   67618 non-null  float64
 5   LANDFALL    67618 non-null  float64
 6   STORM_SPD   67618 non-null  float64
 7   STORM_DR    67618 non-null  float64
 8   dist        67618 non-null  float64
 9   month_1.0   67618 non-null  uint8  
 10  month_2.0   67618 non-null  uint8  
 11  month_3.0   67618 non-null  uint8  
 12  month_4.0   67618 non-null  uint8  
 13  month_5.0   67618 non-null  uint8  
 14  month_6.0   67618 non-null  uint8  
 15  month_7.0   67618 non-null  uint8  
 16  month_8.0   67618 non-null  uint8  
 17  month_9.0   67618 non-null  uint8  
 18  month_10.0  67618 non-null  uint8  
 19  month_11.0  67618 n

In [143]:
print('Mean Absolute Error:', mean_absolute_error(y, y_pred_34))  
print('Mean Squared Error:', mean_squared_error(y, y_pred_34))  
print('Root Mean Squared Error:', np.sqrt(mean_squared_error(y, y_pred_34)))

Mean Absolute Error: 23.961326911428134
Mean Squared Error: 8871.232248966626
Root Mean Squared Error: 94.1872191380902


##PREDICTIONS

In [144]:
df_to_pred = df[df['max_34']!=df['max_34']][df['SID']==df['SID']]

  """Entry point for launching an IPython kernel.


In [146]:
df_to_pred['dist'] = df_to_pred['dist'].fillna(method = 'ffill')

In [147]:
df['BASIN_NI']

0         1
1         1
2         1
3         1
4         1
         ..
693177    0
693178    0
693179    0
693180    0
693181    0
Name: BASIN_NI, Length: 693182, dtype: uint8

In [182]:
df['art_name'] = df['NAME'].astype(str)+df['SEASON'].astype(str)+df['LAT'].astype(str)+df['LON'].astype(str)

In [183]:
df = df.set_index('art_name')

In [184]:
df

Unnamed: 0_level_0,SID,SEASON,NUMBER,SUBBASIN,NAME,ISO_TIME,NATURE,LAT,LON,WMO_WIND,...,month_3.0,month_4.0,month_5.0,month_6.0,month_7.0,month_8.0,month_9.0,month_10.0,month_11.0,month_12.0
art_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
NOT_NAMED1842.010.980.3,1842298N11080,1842.0,1.0,BB,NOT_NAMED,1842-10-25 03:00:00,NR,10.90,80.30,45.0,...,0,0,0,0,0,0,0,1,0,0
NOT_NAMED1842.010.8779.83,1842298N11080,1842.0,1.0,BB,NOT_NAMED,1842-10-25 06:00:00,NR,10.87,79.83,45.0,...,0,0,0,0,0,0,0,1,0,0
NOT_NAMED1842.010.8479.35,1842298N11080,1842.0,1.0,BB,NOT_NAMED,1842-10-25 09:00:00,NR,10.84,79.35,45.0,...,0,0,0,0,0,0,0,1,0,0
NOT_NAMED1842.010.8278.88,1842298N11080,1842.0,1.0,BB,NOT_NAMED,1842-10-25 12:00:00,NR,10.82,78.88,45.0,...,0,0,0,0,0,0,0,1,0,0
NOT_NAMED1842.010.878.4,1842298N11080,1842.0,1.0,BB,NOT_NAMED,1842-10-25 15:00:00,NR,10.80,78.40,45.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DAMANnannannan,,,,,DAMAN,NaT,,,,45.0,...,0,0,0,0,0,0,0,0,0,0
EVANnannannan,,,,,EVAN,NaT,,,,45.0,...,0,0,0,0,0,0,0,0,0,0
BEJISAnannannan,,,,,BEJISA,NaT,,,,45.0,...,0,0,0,0,0,0,0,0,0,0
CEMPAKAnannannan,,,,,CEMPAKA,NaT,,,,45.0,...,0,0,0,0,0,0,0,0,0,0


In [185]:

import numpy as np
from sklearn.model_selection import train_test_split
column_names = ['LAT','LON','WMO_WIND','WMO_PRES','DIST2LAND','LANDFALL','STORM_SPD',	
                'STORM_DR', 'dist',	
                'month_1.0',	'month_2.0',	'month_3.0',	'month_4.0',	'month_5.0',	'month_6.0',
                'month_7.0',	'month_8.0',	'month_9.0',	'month_10.0',	'month_11.0',	'month_12.0',
                'rank', 'rank_1',
                'BASIN_EP','BASIN_NI','BASIN_WP', 
               'BASIN_SI', 'BASIN_NAM',   'BASIN_SP', 'BASIN_SA']
X = df[column_names][df['in_pre'] == True][df['LAT'] == df['LAT']]
y = df['max_34'][df['in_pre'] == True][df['LAT'] == df['LAT']]


  # Remove the CWD from sys.path while we load stuff.


ValueError: cannot reindex from a duplicate axis

In [186]:
X

Unnamed: 0,LAT,LON,WMO_WIND,WMO_PRES,DIST2LAND,LANDFALL,STORM_SPD,STORM_DR,dist,month_1.0,...,month_12.0,rank,rank_1,BASIN_EP,BASIN_NI,BASIN_WP,BASIN_SI,BASIN_NAM,BASIN_SP,BASIN_SA
223479,6.60,145.00,45.0,994.0,991.0,991.0,13.0,270.0,0.680000,0,...,0,98.0,1.0,0,0,1,0,0,0,0
223480,6.60,144.32,45.0,994.0,1007.0,1007.0,13.0,270.0,0.620000,0,...,0,97.0,2.0,0,0,1,0,0,0,0
223481,6.60,143.70,45.0,994.0,1026.0,1026.0,11.0,270.0,0.530000,0,...,0,96.0,3.0,0,0,1,0,0,0,0
223482,6.60,143.17,45.0,994.0,1045.0,1040.0,10.0,270.0,0.470000,0,...,0,95.0,4.0,0,0,1,0,0,0,0
223483,6.60,142.70,45.0,994.0,1040.0,1022.0,9.0,270.0,0.450000,0,...,0,94.0,5.0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693098,17.96,84.86,45.0,994.0,84.0,69.0,7.0,15.0,0.367696,0,...,0,5.0,46.0,0,1,0,0,0,0,0
693099,18.30,85.00,45.0,994.0,69.0,61.0,7.0,30.0,0.392046,0,...,0,4.0,47.0,0,1,0,0,0,0,0
693100,18.61,85.24,45.0,994.0,67.0,64.0,8.0,34.0,0.468722,0,...,0,3.0,48.0,0,1,0,0,0,0,0
693101,19.00,85.50,45.0,994.0,64.0,30.0,10.0,24.0,0.578705,0,...,0,2.0,49.0,0,1,0,0,0,0,0


In [187]:
y_pred_34 = rf_random_34.predict(X)

In [None]:
y_pred_34

In [None]:
X

In [None]:
X.loc[:,'Outcome'] = y_pred_34
X.to_csv('max_34_.csv')
len(X)

In [None]:
https://www.ias.ac.in/public/Volumes/jess/124/07/1573-1598.pdf

In [188]:
y_['preds'] = y_pred_34

NameError: name 'y_' is not defined

In [190]:
df_64 = pd.merge(df,y_pred_34,how = 'left',left_index = True, right_index = True)

TypeError: Can only merge Series or DataFrame objects, a <class 'numpy.ndarray'> was passed