In [182]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, Counter

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

In [173]:
df_agri = pd.read_csv('/private/tmp/test_agged.csv')
# get uniq lat longs
uniq_latlngs = set(tuple(t) for t in df_agri[['lat', 'lng']].values)
lat_tick = np.median(sorted(np.diff(df_agri['lat'].unique())))
lng_tick = np.median(sorted(np.diff(df_agri['lng'].unique())))
uniq_argi_types = df_agri['type'].unique()

In [124]:
weather_aggs = defaultdict(Counter)
weather_vals = ['prcp', 'tmin', 'tmax', 'vp']
for val in weather_vals:
    df_weather = pd.read_csv('/private/tmp/{}.csv'.format(val), index_col=0)
    for lat, lng in uniq_latlngs:
        lat_mask = (df_weather['lat']<(lat+lat_tick)) & (df_weather['lat']>=(lat))
        lng_mask = (df_weather['lng']>=(lng+lng_tick)) & (df_weather['lng']<(lng))
        weather_aggs[(lat, lng)][val] += df_weather[lat_mask&lng_mask]['val'].mean()

In [146]:
df_weather_by_loc = pd.DataFrame(weather_aggs).T
df_weather_by_loc.index = df_weather_by_loc.index.set_names(['lat', 'lng'])
df_weather_by_loc = df_weather_by_loc.dropna()

In [187]:
df_outs = []
rfr = RandomForestRegressor(n_estimators=20)
for ag_type in uniq_argi_types:
    df_one_type = df_agri[df_agri['type']==ag_type].set_index(['lat', 'lng'])
    df_train = df_one_type.join(df_weather_by_loc, how='inner')
    print"type {}: {} training data".format(ag_type, len(df_train))
    
    # get CV error
    cv_ct_ = cross_val_predict(rfr, df_train[weather_vals], df_train['ct'])
    print("RMSE: {}, median: {}".format(
        np.sqrt(mean_squared_error(cv_ct_, df_train['ct'])), 
        np.median(df_train['ct'])
    ))
    
    # do the fitting and predicting
    rfr.fit(df_train[weather_vals], df_train['ct'])
    ct_ = rfr.predict(df_weather_by_loc[weather_vals])
    
    # format for export
    df_weather_by_loc['pred__'+str(ag_type)] = ct_
    df_out = df_weather_by_loc[['pred__'+str(ag_type)]].reset_index()
    df_out['type'] = ag_type
    df_out['ct'] = df_out['pred__'+str(ag_type)].astype(int)
    
    # append
    df_outs.append(df_out[['lat','lng','type','ct']])
    print("")

type 123: 309 training data
RMSE: 90044.2265688, median: 1155.0

type 131: 312 training data
RMSE: 229122.146608, median: 20333.5

type 69: 110 training data
RMSE: 150197.555762, median: 239.0

type 76: 74 training data
RMSE: 66199.3760305, median: 265.0

type 141: 220 training data
RMSE: 43350.9335878, median: 203.5

type 142: 301 training data
RMSE: 519937.893995, median: 98412.0

type 143: 200 training data
RMSE: 83358.1501164, median: 382.0

type 176: 305 training data
RMSE: 404754.223925, median: 49570.0

type 195: 288 training data
RMSE: 49346.5621895, median: 1009.0

type 152: 312 training data
RMSE: 756636.517214, median: 1910827.0

type 121: 312 training data
RMSE: 65150.6063544, median: 17340.0

type 122: 312 training data
RMSE: 55664.22876, median: 5590.5

type 111: 279 training data
RMSE: 68704.6641215, median: 6626.0

type 28: 181 training data
RMSE: 13317.5110461, median: 168.0

type 61: 269 training data
RMSE: 104356.353729, median: 801.0

type 190: 302 training data
RMS

ValueError: Cannot have number of splits n_splits=3 greater than the number of samples: 1.

In [176]:
df_out_fin = pd.concat(df_outs)

# per tile normalizations
Assume that no additional farmland can be made available. 

In [203]:
# get sum of used tiles
df_out_by_loc = df_out_fin.groupby(['lat', 'lng']).sum()[['ct']]
df_agri_by_loc = df_agri.groupby(['lat', 'lng']).sum()[['ct']]

In [205]:
# join and divide
df_tile_usage_norm = df_out_by_loc.join(df_agri_by_loc, rsuffix='_agri')
df_tile_usage_norm['multiplier'] = df_tile_usage_norm['ct_agri']/df_tile_usage_norm['ct']

In [211]:
# join and multiply with multiplier
df_out_fin = df_out_fin.set_index(['lat', 'lng']).join(df_tile_usage_norm[['multiplier']])
df_out_fin['ct_use_normed'] = df_out_fin['ct']*df_out_fin['multiplier']

In [215]:
df_out_fin = df_out_fin.reset_index()[['lat','lng','type','ct_use_normed']]

In [216]:
df_out_fin.to_csv('/private/tmp/test_agged_predictions.csv', index=False)

# Viz check

In [218]:
import gmaps
gmaps.configure(api_key="AIzaSyDUk70qd04kdHjWcAI0MyMbFv5N0dtMk5c") # Your Google API key

In [219]:
df_plot = df_out_fin[df_out_fin['type']==1]

In [220]:
m = gmaps.Map()
gmaps.configure(api_key="AIzaSyDUk70qd04kdHjWcAI0MyMbFv5N0dtMk5c") # Your Google API key
m.add_layer(
    gmaps.heatmap_layer(
        df_plot[['lat','lng']].values, 
        weights=df_plot['ct_use_normed'],
        point_radius=0.3,
        max_intensity=10000,
        dissipating=False
    )
)
m