### 0 loading libs & competition data

In [1]:
import pandas as pd    
%run cfcs.py

In [2]:
riskfutures = pd.read_csv('Data/corn_climate_risk_futures_daily_master.csv')
marketshare = pd.read_csv('Data/corn_regional_market_share.csv')
## dir need to change before submission to Kaggle input directories

#### Note: EDAs were done seperately.

### 1 Baseline Feature Engineering

In [3]:
mergedf = riskfutures.copy()
mergedf['day_of_year'] = pd.to_datetime(mergedf['date_on'],format='%Y-%m-%d').dt.dayofyear
mergedf['quarter'] = pd.to_datetime(mergedf['date_on'],format='%Y-%m-%d').dt.quarter

In [4]:
mergedf = mergedf.merge(marketshare[['region_id','percent_country_production']],how='left',on='region_id')

In [5]:
mergedf['percent_country_production'] = mergedf['percent_country_production'].fillna(0.0)
## see EDA_regional_marketshare.ipynb

#### New Climate Features Risks

In [6]:
# Total cnt locations for each rows
mergedf['total_location_by_region'] = mergedf['climate_risk_cnt_locations_heat_stress_risk_low'] + \
                                    mergedf['climate_risk_cnt_locations_heat_stress_risk_medium'] + \
                                    mergedf['climate_risk_cnt_locations_heat_stress_risk_high']

# Climate Risk for Coldwave, and Flood:
for i in range(1, 5):
    mergedf[f'medium_coldstress_lag_{i}'] = mergedf['climate_risk_cnt_locations_unseasonably_cold_risk_medium'].shift(i)
    mergedf[f'medium_precip_lag_{i}'] = mergedf['climate_risk_cnt_locations_excess_precip_risk_medium'].shift(i)
    
    mergedf[f'medium_coldstress_lag_{i}'] = mergedf[f'medium_coldstress_lag_{i}'].fillna(0)
    mergedf[f'medium_precip_lag_{i}'] = mergedf[f'medium_precip_lag_{i}'].fillna(0)

for j in range(1, 3): 
    mergedf[f'high_coldstress_lag_{j}'] = mergedf['climate_risk_cnt_locations_unseasonably_cold_risk_high'].shift(j)
    mergedf[f'high_precip_lag_{j}'] = mergedf['climate_risk_cnt_locations_excess_precip_risk_high'].shift(j)

    mergedf[f'high_coldstress_lag_{j}'] = mergedf[f'high_coldstress_lag_{j}'].fillna(0)
    mergedf[f'high_precip_lag_{j}'] = mergedf[f'high_precip_lag_{j}'].fillna(0)

mergedf['medium_coldstress_4days_average'] = mergedf[[f'medium_coldstress_lag_{i}' for i in range(1, 5)]].mean(axis=1)
mergedf['medium_coldstress_2days_average'] = mergedf[[f'medium_coldstress_lag_{i}' for i in range(1, 3)]].mean(axis=1)
mergedf['high_coldstress_2days_average'] = mergedf[[f'high_coldstress_lag_{i}' for i in range(1, 3)]].mean(axis=1)

mergedf['medium_precip_4days_average'] = mergedf[[f'medium_precip_lag_{i}' for i in range(1, 5)]].mean(axis=1)
mergedf['medium_precip_2days_average'] = mergedf[[f'medium_precip_lag_{i}' for i in range(1, 3)]].mean(axis=1)
mergedf['high_precip_2days_average'] = mergedf[[f'high_precip_lag_{i}' for i in range(1, 3)]].mean(axis=1)

mergedf['climate_risk_cnt_locations_coldwave_risk_high'] = (mergedf['medium_coldstress_4days_average'] + mergedf['high_coldstress_2days_average']) / 2
mergedf['climate_risk_cnt_locations_coldwave_risk_medium'] = (mergedf['medium_coldstress_2days_average'] + mergedf['high_coldstress_lag_1']) / 2

mergedf['climate_risk_cnt_locations_flood_risk_high'] = (mergedf['medium_precip_4days_average'] + mergedf['high_precip_2days_average']) / 2
mergedf['climate_risk_cnt_locations_flood_risk_medium'] = (mergedf['medium_precip_2days_average'] + mergedf['high_precip_lag_1']) / 2


#### Importers and Exporters Manual Weightings

In [7]:
supply_weights = {
    # Exporters
    "United States": 2.00, "Brazil": 1.85, "Argentina": 1.75, "Ukraine": 1.60, "Russia": 1.40, "Canada": 1.40,
    # Importers
    "China": 1.30, "Mexico": 1.25,
    # Marginal
    "South Africa": 1.20,
    # Neither
    "Paraguay": 1.10, "India": 1.05
    }

mergedf['supply_chain_weightings'] = mergedf['country_name'].map(supply_weights)

seasonal_weights = {
            'Off-season': 1,
            'Planting': 1.5,
            'Mid-season': 2,
            'Harvest': 2,
            'Peak Harvest': 1.5,
        }

mergedf['seasonality_weightings'] = mergedf['harvest_period'].map(seasonal_weights)
mergedf['adjusted_weightings'] = mergedf['supply_chain_weightings'] * mergedf['seasonality_weightings']

#### Weather Forecasting Score

In [8]:
mergedf['sum_of_medium_high_drought_risk'] = mergedf['climate_risk_cnt_locations_drought_risk_medium'] + mergedf['climate_risk_cnt_locations_drought_risk_high']
mergedf['sum_of_medium_high_excess_precip_risk'] = mergedf['climate_risk_cnt_locations_excess_precip_risk_medium'] + mergedf['climate_risk_cnt_locations_excess_precip_risk_high']
mergedf['sum_of_medium_high_unseasonably_cold_risk'] = mergedf['climate_risk_cnt_locations_unseasonably_cold_risk_medium'] + mergedf['climate_risk_cnt_locations_unseasonably_cold_risk_high']
mergedf['sum_of_medium_high_heat_stress_risk'] = mergedf['climate_risk_cnt_locations_heat_stress_risk_medium'] + mergedf['climate_risk_cnt_locations_heat_stress_risk_high']

mergedf['sum_of_medium_high_drought_risk_lag_1yr'] = mergedf['sum_of_medium_high_drought_risk'].shift(365)
mergedf['sum_of_medium_high_excess_precip_risk_lag_1yr'] = mergedf['sum_of_medium_high_excess_precip_risk'].shift(365)
mergedf['sum_of_medium_high_unseasonably_cold_risk_lag_1yr'] = mergedf['sum_of_medium_high_unseasonably_cold_risk'].shift(365)
mergedf['sum_of_medium_high_heat_stress_risk_lag_1yr'] = mergedf['sum_of_medium_high_heat_stress_risk'].shift(365)

mergedf['sum_of_medium_high_drought_risk_lag_1yr'] = mergedf['sum_of_medium_high_drought_risk_lag_1yr'].fillna(0)
mergedf['sum_of_medium_high_excess_precip_risk_lag_1yr'] = mergedf['sum_of_medium_high_excess_precip_risk_lag_1yr'].fillna(0)
mergedf['sum_of_medium_high_unseasonably_cold_risk_lag_1yr'] = mergedf['sum_of_medium_high_unseasonably_cold_risk_lag_1yr'].fillna(0)
mergedf['sum_of_medium_high_heat_stress_risk_lag_1yr'] = mergedf['sum_of_medium_high_heat_stress_risk_lag_1yr'].fillna(0)

mergedf['diff_drought'] = mergedf['sum_of_medium_high_drought_risk'] - mergedf['sum_of_medium_high_drought_risk_lag_1yr']
mergedf['diff_excessprecip'] = mergedf['sum_of_medium_high_excess_precip_risk'] - mergedf['sum_of_medium_high_excess_precip_risk_lag_1yr']
mergedf['diff_cold'] = mergedf['sum_of_medium_high_unseasonably_cold_risk'] - mergedf['sum_of_medium_high_unseasonably_cold_risk_lag_1yr']
mergedf['diff_heatstress'] = mergedf['sum_of_medium_high_heat_stress_risk'] - mergedf['sum_of_medium_high_heat_stress_risk_lag_1yr']

mergedf.head()

Unnamed: 0,ID,crop_name,country_name,country_code,region_name,region_id,harvest_period,growing_season_year,date_on,climate_risk_cnt_locations_heat_stress_risk_low,...,sum_of_medium_high_unseasonably_cold_risk,sum_of_medium_high_heat_stress_risk,sum_of_medium_high_drought_risk_lag_1yr,sum_of_medium_high_excess_precip_risk_lag_1yr,sum_of_medium_high_unseasonably_cold_risk_lag_1yr,sum_of_medium_high_heat_stress_risk_lag_1yr,diff_drought,diff_excessprecip,diff_cold,diff_heatstress
0,8af42722-3f05-4ede-80fc-605e0e2b3b67,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-15,23,...,0,0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
1,54f4ddc5-e7ab-4bfb-ad6a-5649841af563,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-16,23,...,0,0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0
2,63a41fce-d371-4295-a58a-dc6491664020,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-17,23,...,9,0,0.0,0.0,0.0,0.0,9.0,0.0,9.0,0.0
3,cddfa440-e0eb-4735-beb1-1aca2afefe53,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-18,23,...,3,0,0.0,0.0,0.0,0.0,8.0,0.0,3.0,0.0
4,3eaacfe1-29be-4da9-b5c9-a9457d2d2b83,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-19,23,...,2,0,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0


In [9]:
category = ['drought', 'excessprecip', 'cold', 'heatstress']

mergedf['worse_off_indicator'] = 0
mergedf['better_off_indicator'] = 0

for disaster in category:
    x = mergedf.loc[:, f'diff_{disaster}']

    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]
    
    mergedf['worse_off_indicator'] += pd.Series(1, index=x.index).where(x > 1, 0)
    mergedf['better_off_indicator'] += pd.Series(1, index=x.index).where(x < -1, 0)

mergedf.head()

Unnamed: 0,ID,crop_name,country_name,country_code,region_name,region_id,harvest_period,growing_season_year,date_on,climate_risk_cnt_locations_heat_stress_risk_low,...,sum_of_medium_high_drought_risk_lag_1yr,sum_of_medium_high_excess_precip_risk_lag_1yr,sum_of_medium_high_unseasonably_cold_risk_lag_1yr,sum_of_medium_high_heat_stress_risk_lag_1yr,diff_drought,diff_excessprecip,diff_cold,diff_heatstress,worse_off_indicator,better_off_indicator
0,8af42722-3f05-4ede-80fc-605e0e2b3b67,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-15,23,...,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,1,0
1,54f4ddc5-e7ab-4bfb-ad6a-5649841af563,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-16,23,...,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,1,0
2,63a41fce-d371-4295-a58a-dc6491664020,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-17,23,...,0.0,0.0,0.0,0.0,9.0,0.0,9.0,0.0,2,0
3,cddfa440-e0eb-4735-beb1-1aca2afefe53,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-18,23,...,0.0,0.0,0.0,0.0,8.0,0.0,3.0,0.0,2,0
4,3eaacfe1-29be-4da9-b5c9-a9457d2d2b83,Corn: Commodity Tracked,Argentina,AR,Buenos Aires,bffad37a-7c60-432f-984a-8ea83a944311,Harvest,2017,2016-06-19,23,...,0.0,0.0,0.0,0.0,7.0,0.0,2.0,0.0,2,0


In [10]:
mergedf['forecasting_weighting_for_supply_shock'] = mergedf['worse_off_indicator'] * mergedf['adjusted_weightings']


print(mergedf['supply_chain_weightings'].isna().sum())
print(mergedf['adjusted_weightings'].isna().sum())
print(mergedf['forecasting_weighting_for_supply_shock'].isna().sum())

0
0
0


#### 1.1 Production-Weighted Risk Scores

In [11]:
risk_categories = ['heat_stress', 'unseasonably_cold', 'excess_precip', 'drought', \
                    'coldwave', 'flood']

for risk in risk_categories:
    #low = f'climate_risk_cnt_locations_{risk}_risk_low'
    medium = f'climate_risk_cnt_locations_{risk}_risk_medium'
    high = f'climate_risk_cnt_locations_{risk}_risk_high'
    
    risk_scores = (1*mergedf[medium]+2*mergedf[high])/\
                           (mergedf['total_location_by_region'])
    ## define regional daily risk score as normalized weighted sum of number of locations
    
    production_weighted_risk_scores = (risk_scores*mergedf['percent_country_production'])/100
    ## use marketshare data to get production-weighted regional daily risk scores
    
    mergedf[f'climate_risk_{risk}_score'] = risk_scores
    mergedf[f'climate_risk_{risk}_weighted_score'] = production_weighted_risk_scores
    ## iterate for all four climate risk types; total 8 new engieered features

#### 1.2 Composite Risk Indices

In [12]:
mergedf['climate_risk_temperature_stress'] = \
mergedf[[f'climate_risk_{risk}_score' for risk in risk_categories[:2]]].max(axis=1)
## maximum of temperature-related risk scores
mergedf['climate_risk_precipitation_stress'] = \
mergedf[[f'climate_risk_{risk}_score' for risk in risk_categories[2:]]].max(axis=1)
## maximum of precipitation-related risk scores
mergedf['climate_risk_overall_stress'] = \
mergedf[[f'climate_risk_{risk}_score' for risk in risk_categories]].max(axis=1)
## maximum of all risk scores
mergedf['climate_risk_avg_stress'] = \
mergedf[[f'climate_risk_{risk}_score' for risk in risk_categories]].mean(axis=1)
## average of all risk scores
## total 4 new engineered features

#### 1.3 Risk Temporal Summaries

In [13]:
mergedf = mergedf.sort_values(['region_name','date_on'])
window_period = [14,20,40,60,90,120,240]
## three periods to compute risk scores moving avg and maximum 
for window in window_period:
    for risk in risk_categories:
        mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
        mergedf.groupby(['region_name'])[f'climate_risk_{risk}_score']\
               .rolling(window=window,min_periods=1).mean().reset_index(level=0,drop=True)
## compute risk score moving avg with different windows for different risk types in each region

        mergedf[f'climate_risk_{risk}_max_{window}d'] = \
        mergedf.groupby(['region_name'])[f'climate_risk_{risk}_score']\
               .rolling(window=window,min_periods=1).max().reset_index(level=0,drop=True)
## compute maximum risk scores with different windows for different risk types in each region
## total 3*4*2 = 24 new features

  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_risk_{risk}_max_{window}d'] = \
  mergedf[f'climate_risk_{risk}_ma_{window}d'] = \
  mergedf[f'climate_ri

#### 1.4 Risk Momentum

In [14]:
features_change1d = mergedf.groupby('region_name')[[f'climate_risk_{risk}_score' for risk in risk_categories]]\
       .diff(periods=1)\
       .rename(columns=dict(zip([f'climate_risk_{risk}_score' for risk in risk_categories],\
                                [f'climate_risk_{risk}_change_1d' for risk in risk_categories])))
## Daily Change of risk scores for each risk type in each region 

features_acceleration = features_change1d.diff(periods=1)
## Acceleration of daily Change of risk scores for each risk type in each region

features_change1w = mergedf.groupby('region_name')[[f'climate_risk_{risk}_score' for risk in risk_categories]]\
       .diff(periods=7)\
       .rename(columns=dict(zip([f'climate_risk_{risk}_score' for risk in risk_categories],\
                                [f'climate_risk_{risk}_change_1d' for risk in risk_categories])))
## Weekly Change of risk scores for each risk type in each region 

mergedf = pd.concat([mergedf,\
           features_change1d,\
           features_change1w,\
           features_acceleration],axis=1)
## 12 new features in Risk Momentum category

#### 1.5 Cross-Regional features

In [26]:
feature_country = pd.concat([\
mergedf.groupby(['country_name', 'date_on'])\
[[f'climate_risk_{risk}_score' for risk in risk_categories]]\
.agg(['mean','max','std']),
## compute country-wide daily avg, max, and std risk scores
mergedf.groupby(['country_name', 'date_on'])\
[[f'climate_risk_{risk}_weighted_score' for risk in risk_categories]]\
.agg('sum')],axis=1)
## compute country-wide daily production-weighted sum risk scores
feature_country.columns = [f'climate_risk_{risk}_score_country_{metric}'\
                          for risk in risk_categories \
                          for metric in ['mean','max','std']]+\
                          [f'climate_risk_{risk}_weighted_score_country_sum'\
                          for risk in risk_categories]
## rename new features
mergedf = mergedf.merge(feature_country.reset_index(),\
              how='left',\
              on=['country_name','date_on'])
## add 4*4=16 new features

#### Non Linear Transformation
- Including Log1p, Squareroot without chaginging signs, Thresh Magnitude

In [16]:
import numpy as np

In [17]:
climate_risk_cols = [c for c in mergedf.columns if c.startswith('climate_risk_')]
for feature_name in climate_risk_cols:
    x = mergedf.loc[:, feature_name]

    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]
    
    mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
    mergedf[f'{feature_name}_log1p'] = mergedf[f'{feature_name}_log1p'].fillna(0)
    mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
    mergedf[f'{feature_name}_ssqrt'] = mergedf[f'{feature_name}_ssqrt'].fillna(0)
    mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
    mergedf[f'{feature_name}_thresh_mag'] = mergedf[f'{feature_name}_thresh_mag'].fillna(0)

  mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
  mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
  mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
  mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
  mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
  mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
  mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
  mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
  mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
  mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
  mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
  mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
  mergedf[f'{feature_name}_thresh_mag'] = x.where(x>1, 0)
  mergedf[f'{feature_name}_log1p'] = np.log1p(x.clip(lower=0))
  mergedf[f'{feature_name}_ssqrt'] = np.sign(x) * np.sqrt(np.abs(x))
  mergedf[f'{feature_name}_thresh_mag'] = x.where(

#### Supply-Chained Weightings Features

In [18]:
cols_to_drop = ['climate_risk_cnt_locations_heat_stress_risk_low',
 'climate_risk_cnt_locations_heat_stress_risk_medium',
 'climate_risk_cnt_locations_heat_stress_risk_high',
 'climate_risk_cnt_locations_unseasonably_cold_risk_low',
 'climate_risk_cnt_locations_unseasonably_cold_risk_medium',
 'climate_risk_cnt_locations_unseasonably_cold_risk_high',
 'climate_risk_cnt_locations_excess_precip_risk_low',
 'climate_risk_cnt_locations_excess_precip_risk_medium',
 'climate_risk_cnt_locations_excess_precip_risk_high',
 'climate_risk_cnt_locations_drought_risk_low',
 'climate_risk_cnt_locations_drought_risk_medium',
 'climate_risk_cnt_locations_drought_risk_high',
 'climate_risk_cnt_locations_coldwave_risk_high',
 'climate_risk_cnt_locations_coldwave_risk_medium',
 'climate_risk_cnt_locations_flood_risk_high',
 'climate_risk_cnt_locations_flood_risk_medium']

In [19]:
climate_risk_cols = [c for c in mergedf.columns if c.startswith('climate_risk_')]
climate_risk_selected_cols = [item for item in climate_risk_cols if item not in cols_to_drop]

for feature_name in climate_risk_selected_cols:
    x = mergedf.loc[:, feature_name]

    if isinstance(x, pd.DataFrame):
        x = x.iloc[:, 0]
    
    mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))

mergedf.head()

  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_shock'] + 1e-8))
  mergedf[f'{feature_name}_supply_weighting'] = (x * (mergedf['forecasting_weighting_for_supply_

Unnamed: 0,ID,crop_name,country_name,country_code,region_name,region_id,harvest_period,growing_season_year,date_on,climate_risk_cnt_locations_heat_stress_risk_low,...,climate_risk_excess_precip_weighted_score_country_sum_thresh_mag_supply_weighting,climate_risk_drought_weighted_score_country_sum_log1p_supply_weighting,climate_risk_drought_weighted_score_country_sum_ssqrt_supply_weighting,climate_risk_drought_weighted_score_country_sum_thresh_mag_supply_weighting,climate_risk_coldwave_weighted_score_country_sum_log1p_supply_weighting,climate_risk_coldwave_weighted_score_country_sum_ssqrt_supply_weighting,climate_risk_coldwave_weighted_score_country_sum_thresh_mag_supply_weighting,climate_risk_flood_weighted_score_country_sum_log1p_supply_weighting,climate_risk_flood_weighted_score_country_sum_ssqrt_supply_weighting,climate_risk_flood_weighted_score_country_sum_thresh_mag_supply_weighting
0,8308c20c-d655-4500-b41a-af7b2dd8b255,Corn: Commodity Tracked,Paraguay,PY,Alto Paraná,6a4dabfc-c598-44e0-9d99-ea9e31d110cb,Planting,2016,2016-01-01,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.086327e-09,1.015486e-08,1.031212e-08
1,c6e315ff-6f9e-436d-9cb9-a2ad08b50c78,Corn: Commodity Tracked,Paraguay,PY,Alto Paraná,6a4dabfc-c598-44e0-9d99-ea9e31d110cb,Planting,2016,2016-01-02,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.416217e-09,7.451357e-09,0.0
2,3e01ae6b-350a-49d5-b6d4-a6939034a280,Corn: Commodity Tracked,Paraguay,PY,Alto Paraná,6a4dabfc-c598-44e0-9d99-ea9e31d110cb,Planting,2016,2016-01-03,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.10089e-09,4.835162e-09,0.0
3,719fa75b-e4b6-4d0c-ade7-fc14c8a43c49,Corn: Commodity Tracked,Paraguay,PY,Alto Paraná,6a4dabfc-c598-44e0-9d99-ea9e31d110cb,Planting,2016,2016-01-04,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.120427e-09,3.443263e-09,0.0
4,06335569-7672-4ed5-b425-712d084826d0,Corn: Commodity Tracked,Paraguay,PY,Alto Paraná,6a4dabfc-c598-44e0-9d99-ea9e31d110cb,Planting,2016,2016-01-05,10,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 1.6 Baseline features CFCS score

In [20]:
corrtable = compute_partial_correlations(mergedf)
## compute cfcs score using backtesting functionalities
# 9m 31.9s for 1231 columns

In [21]:
sigcorr_report(corrtable).sort_values(by='avg_sig_corr', ascending=False).head(20)

Unnamed: 0_level_0,avg_sig_corr,max_sig_corr,sig_corr_count,sig_corr_ratio(%)
climate_variable,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
climate_risk_coldwave_max_40d_thresh_mag,0.7,0.782,13,0.814
climate_risk_coldwave_max_60d_thresh_mag,0.671,0.781,24,1.486
climate_risk_drought_ma_120d_thresh_mag,0.626,0.781,10,0.964
climate_risk_coldwave_ma_14d,0.622,0.632,2,0.11
climate_risk_coldwave_max_90d_thresh_mag,0.621,0.825,24,1.384
climate_risk_coldwave_ma_14d_log1p,0.619,0.629,2,0.11
climate_risk_coldwave_max_14d_thresh_mag,0.619,0.757,6,0.447
climate_risk_coldwave_max_40d_supply_weighting,0.614,0.67,2,0.101
climate_risk_excess_precip_max_240d_log1p,0.611,0.8,58,2.585
climate_risk_excess_precip_max_240d,0.61,0.788,54,2.406


In [22]:
cfcs(corrtable)

0.26% of all correlations are significant
Average significant correlation is 0.557
highest absolute correlation found is 0.825
final CFCS score is 52.66


{'cfcs_score': np.float64(52.657663784527955),
 'avg_sig_score': np.float64(55.685943572673956),
 'max_corr_score': np.float64(82.54),
 'sig_count_score': 0.2634599909549161}

In [23]:
features_sig = [
    'climate_risk_coldwave_max_40d_thresh_mag',
    'climate_risk_coldwave_max_60d_thresh_mag',
    'climate_risk_coldwave_max_90d_thresh_mag',
    'climate_risk_coldwave_max_40d_supply_weighting',
    'climate_risk_coldwave_max_90d_thresh_mag',
    'climate_risk_excess_precip_max_240d_log1p',
    'climate_risk_excess_precip_max_240d_ssqrt',
    'climate_risk_excess_precip_max_240d',
    'climate_risk_drought_ma_120d_thresh_mag',
    'climate_risk_drought_ma_90d_ssqrt',
    'climate_risk_drought_ma_60d_log1p',
    'climate_risk_drought_ma_90d',
    'climate_risk_drought_ma_60d',
]


submissiondf = mergedf.dropna()
submissiondf = submissiondf[submissiondf.columns[pd.Series(submissiondf.columns).apply(\
lambda x:( not x.startswith('climate_risk')) or (x in features_sig)
)]]

In [24]:
corrtable_1 = compute_partial_correlations(submissiondf)

In [25]:
cfcs(corrtable_1)

1.92% of all correlations are significant
Average significant correlation is 0.614
highest absolute correlation found is 0.825
final CFCS score is 55.85


{'cfcs_score': np.float64(55.85235037187289),
 'avg_sig_score': np.float64(61.4116),
 'max_corr_score': np.float64(82.54),
 'sig_count_score': 1.9227518593644355}