In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.model_selection import StratifiedShuffleSplit,StratifiedKFold
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

In [63]:
init_data = pd.read_csv('./OUTPUT_WBI_exposer_cyclones_v14.csv',sep=';')
init_data.head()

Unnamed: 0,SID,NAME,ISO,YEAR,COORDS,COORDS_MAX_WINDS,COORDS_MIN_DIST2LAND,BASIN,SUB BASIN,MONTH_START,...,Food production index (2004-2006 = 100),GDP per capita (constant 2010 US$),Net flows from UN agencies US$,"Life expectancy at birth, total (years)",Adjusted savings: education expenditure (% of GNI),Income_level_Final,POP_MAX_34_ADJ,POP_MAX_50_ADJ,POP_MAX_64_ADJ,TOTAL_AFFECTED
0,1949163N07145,DELLA,JPN,1949,"[(21.5, 125.424), (22.2, 125.8), (22.9089, 126...","[(27.6584, 128.82), (28.9141, 129.572999999999...","[(31.9866, 130.683), (33.5183, 130.97299999999...",WP,WP,6,...,67.79,8607.657082,0.0,67.666098,2.867878,Low,11983000.0,5803981.0,3320907.0,194046
1,1950241N23140,JANE,JPN,1950,"[(23.8845, 139.74), (23.9335, 139.639), (23.98...","[(28.7509, 133.748), (29.1201, 133.762), (29.6...","[(34.9374, 135.476), (36.1016, 136.095), (37.2...",WP,WP,8,...,67.79,8607.657082,0.0,67.666098,2.867878,Low,31608220.0,15591960.0,10189360.0,642117
2,1951224N12316,CHARLIE,JAM,1951,"[(17.3398, -75.4138), (17.6, -76.2), (17.9, -7...","[(17.6, -76.2), (17.9, -76.9)]","[(17.9, -76.9), (18.1, -77.8)]",NAm,CS,8,...,69.93,3796.219401,0.0,64.77,2.578304,Low,1689243.0,1687083.0,1195052.0,20200
3,1951337N09150,AMY,PHL,1951,"[(12.0725, 130.967), (12.1333, 130.517), (12.1...","[(11.7833, 127.9)]","[(10.7333, 124.8), (10.534, 123.174), (10.6833...",WP,WP,12,...,25.68,1059.467412,0.0,61.105,2.757732,Low_Middle,4760039.0,3006670.0,1114774.0,60000
4,1952180N05144,EMMA,PHL,1952,"[(9.33776, 130.185), (9.38633, 129.61), (9.45,...","[(9.45, 129.017), (9.50407, 128.407), (9.56613...","[(10.4667, 123.867), (10.6492, 123.239), (10.8...",WP,WP,7,...,25.68,1059.467412,0.0,61.105,2.757732,Low_Middle,5130796.0,2789486.0,1988524.0,103


In [64]:
## Code from Fabi to remove outliers using IQR

def null_cols(data):

    """
    This function takes a dataframe df and shows the columns of df that have NaN values
    and the number of them

    """

    nulls = data.isna().sum()
    return nulls[nulls > 0] / len(data) * 100

def detect_low_variance(data, col, n= 90):

    """
    This function takes a dataframe data, a column col from data
    and a number n between 0 and 100.
    Returns True if the minimum value of column is equal to its n-th percentile
    and False otherwise. The predetermined value of the percentile is 90.
    """
    min_val = data[col].min()
    perc_val = np.percentile(data[col], n)
    return min_val == perc_val

def drop_low_variance(data, n= 90):

    """
    Takes a dataframe data and a number n between 0 and 100.
    It returns a dataframe after removing the numerical columns having low
    variance having as a reference the value n.
    """

    low_var_cols = [col for col in data.select_dtypes(include = [np.number]).columns if detect_low_variance(data, col, n)]
    return data.drop(low_var_cols, axis=1)

def iqr(data, cols, t=1.5):

    """
    This function computes the interquartal range with rule t.
    "data" is a dataframe, "cols" is a list of columns with numerical values
    from "data" and "t" is a positive number. By default t takes the value 1.5.
    iqr returns a dictionary of dictionaries containing the lower and upper
    extremes of the adjusted IQR for each column.
    """
    Q1 = data[cols].quantile(0.25)
    Q3 = data[cols].quantile(0.75)
    IQR = Q3-Q1
    low_bound = {}
    upp_bound = {}
    for col in list(IQR.index):
        low_bound[col] = Q1[col]-t*IQR[col]
        upp_bound[col] = Q3[col]+t*IQR[col]
    return {"low_b": low_bound, "upp_b": upp_bound}

def drop_outliers(data, cols, t=1.5):

    """
    Takes a dataset data, a list of columns cols with numerical values and a positive
    number t which is the rule for calculating the interquartal range.
    Returns a data set without outliers removed accourding with the rule t.
    """
    iqr_d = iqr(data, cols, t)
    for col in cols:
#         return data[~((data[col]< iqr_d["low_b"][col]) | (data[col]> iqr_d["upp_b"][col]))]
        data = data[~((data[col]< iqr_d["low_b"][col]) | (data[col]> iqr_d["upp_b"][col]))]
    return data.copy()


In [65]:
null_columns_num = []
null_columns_cat = []
total_columns = init_data.columns.to_list()
for col in total_columns:
    if init_data[init_data[col].isnull()].shape[0] > 0:
        if init_data[col].dtype == 'object':
            null_columns_cat.append(col)
        else:
            null_columns_num.append(col)

In [66]:
knn_imputer = KNNImputer(n_neighbors=5)
simple_imputer = SimpleImputer(strategy='mean')
for col in null_columns_cat:
    init_data[col] = knn_imputer.fit_transform(init_data[[col]])
for col in null_columns_num:
    init_data[col] = simple_imputer.fit_transform(init_data[[col]])

In [149]:
num1_features = init_data.select_dtypes(include = ['int64', 'float64']).columns.to_list()
num_features = ['V_LAND_KN', 'TOTAL_HOURS_IN_LAND', 'MAX_WIND', 'MIN_PRES', 'MIN_DIST2LAND', 'MAX_STORMSPEED',
                 'DISTANCE_TRACK_VINCENTY', 'HDI']
data = drop_outliers(init_data, num_features)
print(f'Input shape of the dataset {init_data.shape}')
print(f'Shape of the data after dropping with low varience {drop_low_variance(init_data).shape}')
print(f'Shape of the data after dropping the outliers of all columns {drop_outliers(init_data,num1_features).shape}')
print(f'Shape of the data after dropping the outliers {data.shape}')

Input shape of the dataset (991, 51)
Shape of the data after dropping with low varience (991, 51)
Shape of the data after dropping the outliers of all columns (218, 51)
Shape of the data after dropping the outliers (610, 51)


In [69]:
data.corr(method = 'spearman')['TOTAL_AFFECTED'].sort_values(ascending=False)

TOTAL_AFFECTED                                        1.000000
TOTAL_DEATHS                                          0.383302
34KN_POP                                              0.263747
Net flows from UN agencies US$                        0.252973
64KN_POP                                              0.241824
MAX_WIND                                              0.236357
V_LAND_KN                                             0.233406
MAX_USA_SSHS                                          0.222447
96KN_POP                                              0.195337
RURAL_POP(%)                                          0.191339
POP_MAX_50_ADJ                                        0.187753
POP_MAX_64_ADJ                                        0.176240
POP_MAX_34_ADJ                                        0.166236
MONTH_START                                           0.150850
MONTH_END                                             0.146185
DISTANCE_TRACK_VINCENTY                               0

In [70]:
cat_features = init_data.select_dtypes(include=['object']).columns.to_list()

In [85]:
cat_features = ['BASIN','NATURE', 'SUB BASIN', 'GENERAL_CATEGORY']
num_features = ['MONTH_END', 'TOTAL_HOURS_IN_LAND', 'MAX_WIND', 'MIN_PRES', 'MIN_DIST2LAND', 'MAX_STORMSPEED',
               'V_LAND_KN', 'DISTANCE_TRACK_VINCENTY', 'POP_DEN_SQ_KM', 'RURAL_POP(%)', 'HDI', 'GDP per capita (constant 2010 US$)',
               'TOTAL_AFFECTED']

In [128]:
total_data = data[cat_features + num_features].copy()
X = total_data.drop(columns = 'TOTAL_AFFECTED', axis=1)
y = total_data['TOTAL_AFFECTED']

- Stratified shuffle split will not work properly as all the categorical data cannot be placed in both train and test
- So using kfold cross validation

In [129]:
X['GENERAL_CATEGORY'].value_counts()

TS       176
Cat 1    125
Cat 4    118
Cat 2     68
Cat 3     67
Cat 5     39
TD        17
Name: GENERAL_CATEGORY, dtype: int64

In [130]:
# sss_split = StratifiedShuffleSplit(n_splits=1, test_size =0.2, random_state=23)
# for train_index, test_indes in sss_split.split(X, X[cat_features]):
#     train_x, test_x = X.iloc[train_index], X.ilco[test_index]
#     train_y, test_y = y.iloc[train_index], y.iloc[test_index]

In [131]:
X['GENERAL_CATEGORY'] = X['GENERAL_CATEGORY'].replace('TS', 'Cat 0')
X['GENERAL_CATEGORY'] = X['GENERAL_CATEGORY'].replace('TD', 'Cat 0')

In [132]:
X['GENERAL_CATEGORY'].value_counts()

Cat 0    193
Cat 1    125
Cat 4    118
Cat 2     68
Cat 3     67
Cat 5     39
Name: GENERAL_CATEGORY, dtype: int64

In [133]:
X = pd.concat([pd.get_dummies(X[cat_features]), X],axis=1)
X.drop(columns=cat_features, axis=1, inplace=True)

In [134]:
skf = StratifiedKFold(n_splits=3, random_state=None)
# X is the feature set and y is the target
for train_index, val_index in skf.split(X,y): 
    X_train, X_test = X.iloc[train_index], X.iloc[val_index] 
    y_train, y_test = y.iloc[train_index], y.iloc[val_index]



<class 'pandas.core.frame.DataFrame'>
Int64Index: 407 entries, 1 to 990
Data columns (total 42 columns):
BASIN_EP                              407 non-null uint8
BASIN_NAm                             407 non-null uint8
BASIN_NI                              407 non-null uint8
BASIN_SI                              407 non-null uint8
BASIN_SP                              407 non-null uint8
BASIN_WP                              407 non-null uint8
NATURE_DS                             407 non-null uint8
NATURE_ET                             407 non-null uint8
NATURE_MX                             407 non-null uint8
NATURE_NR                             407 non-null uint8
NATURE_SS                             407 non-null uint8
NATURE_TS                             407 non-null uint8
SUB BASIN_AS                          407 non-null uint8
SUB BASIN_BB                          407 non-null uint8
SUB BASIN_CP                          407 non-null uint8
SUB BASIN_CS                          40

In [139]:
from sklearn.model_selection import GridSearchCV
## parameters for RF
rf_param = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [140]:
rf = RandomForestRegressor()
grid_search_rf = GridSearchCV(estimator = rf, param_grid = rf_param, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [141]:
grid_search_rf.fit(X_train, y_train)

Fitting 3 folds for each of 288 candidates, totalling 864 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:   53.5s
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 864 out of 864 | elapsed:  2.1min finished


GridSearchCV(cv=3, error_score=nan,
             estimator=RandomForestRegressor(bootstrap=True, ccp_alpha=0.0,
                                             criterion='mse', max_depth=None,
                                             max_features='auto',
                                             max_leaf_nodes=None,
                                             max_samples=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             n_estimators=100, n_jobs=None,
                                             oob_score=False, random_state=None,
                                             verbose=0, warm_start=False),
             iid='deprecated', n_jo

In [142]:
grid_search_rf.best_estimator_

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=100, max_features=2, max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=5,
                      min_samples_split=10, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [143]:
y_pred_man_rf = grid_search_rf.predict(X_test)

In [145]:
print(f'mean absolute error : {mean_absolute_error(y_pred_man_rf, y_test)}')
print(f'mean squared error : {mean_squared_error(y_pred_man_rf, y_test)}')
print(f'root mean squared error : {np.sqrt(mean_squared_error(y_pred_man_rf, y_test))}')

mean absolute error : 820242.5503333409
mean squared error : 2146448551322.242
root mean squared error : 1465076.2953929198
