In [221]:
%matplotlib inline
import os
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

In [222]:
# Load the aggregated agriculture table; the columns read in this notebook are
# 'lat', 'lng', 'type' and 'ct'.
df_agri = pd.read_csv('/private/tmp/test_agged.csv')

# Distinct (lat, lng) pairs present in the table.
uniq_latlngs = set(tuple(t) for t in df_agri[['lat', 'lng']].values)

# Tile size along each axis = median gap between neighbouring distinct
# coordinates. Series.unique() returns values in order of appearance, so the
# coordinates must be sorted *before* differencing; otherwise the gaps are
# arbitrary (and possibly negative) jumps rather than grid spacings.
lat_tick = np.median(np.diff(np.sort(df_agri['lat'].unique())))
lng_tick = np.median(np.diff(np.sort(df_agri['lng'].unique())))

# Distinct values of the 'type' column; one model is fitted per value later on.
uniq_argi_types = df_agri['type'].unique()

In [None]:
# Average every weather variable over each agriculture tile.
# Result layout: weather_aggs[(lat, lng)][variable] -> mean of the 'val' column
# over the weather rows that fall inside that tile (NaN when no row matches).
weather_aggs = defaultdict(Counter)
weather_vals = ['prcp', 'tmin', 'tmax', 'vp']
for val in weather_vals:
    # One CSV per variable; the columns read below are 'lat', 'lng' and 'val'.
    df_weather = pd.read_csv('/private/tmp/{}.csv'.format(val), index_col=0)
    weather_lat = df_weather['lat']
    weather_lng = df_weather['lng']
    for lat, lng in uniq_latlngs:
        # Half-open tile [lat, lat + lat_tick) x [lng, lng + lng_tick).
        # The longitude test previously had its bounds swapped
        # (>= lng + lng_tick AND < lng), which can never match for a positive
        # tick, so every tile mean came out as NaN.
        # NOTE(review): assumes lat_tick and lng_tick are positive - confirm.
        lat_mask = (weather_lat >= lat) & (weather_lat < (lat + lat_tick))
        lng_mask = (weather_lng >= lng) & (weather_lng < (lng + lng_tick))
        weather_aggs[(lat, lng)][val] += df_weather.loc[lat_mask & lng_mask, 'val'].mean()

In [None]:
# One row per (lat, lng) tile, one column per weather variable.
# The dict keys are (lat, lng) tuples, so they start out as columns and are
# transposed into the row index.
df_weather_by_loc = pd.DataFrame(weather_aggs).transpose()
df_weather_by_loc.index.names = ['lat', 'lng']
# Drop every tile that has a missing value in any column.
df_weather_by_loc = df_weather_by_loc.dropna(axis=0, how='any')

In [None]:
# Fit one random-forest regressor per agriculture type on the tile-level
# weather features, report a cross-validated error, then predict 'ct' for every
# tile in df_weather_by_loc. Per-type predictions are collected in df_outs.
df_outs = []
rfr = RandomForestRegressor(n_estimators=20)
for ag_type in uniq_argi_types:
    # Rows of this type, keyed by tile so they can be joined to the weather table.
    df_one_type = df_agri[df_agri['type']==ag_type].set_index(['lat', 'lng'])
    df_train = df_one_type.join(df_weather_by_loc, how='inner')
    # Was a Python 2 print statement (a SyntaxError on Python 3); the call form
    # below matches the other print(...) calls in this cell.
    print("type {}: {} training data".format(ag_type, len(df_train)))

    # get CV error (out-of-fold predictions for every training row)
    cv_ct_ = cross_val_predict(rfr, df_train[weather_vals], df_train['ct'])
    print("RMSE: {}, median: {}".format(
        # arguments in (y_true, y_pred) order; MSE is symmetric, value unchanged
        np.sqrt(mean_squared_error(df_train['ct'], cv_ct_)),
        np.median(df_train['ct'])
    ))

    # do the fitting and predicting on all tiles that have weather features
    rfr.fit(df_train[weather_vals], df_train['ct'])
    ct_ = rfr.predict(df_weather_by_loc[weather_vals])

    # format for export
    # The prediction is stored as a new column on df_weather_by_loc; only the
    # weather_vals columns are used as features, so columns added in earlier
    # iterations do not leak into later fits.
    df_weather_by_loc['pred__'+str(ag_type)] = ct_
    df_out = df_weather_by_loc[['pred__'+str(ag_type)]].reset_index()
    df_out['type'] = ag_type
    # astype(int) truncates the fractional part of the predicted count
    df_out['ct'] = df_out['pred__'+str(ag_type)].astype(int)

    # append
    df_outs.append(df_out[['lat','lng','type','ct']])
    print("")

In [226]:
# Stack the per-type prediction frames row-wise into one long table
# (columns: lat, lng, type, ct); the original row labels are kept as-is.
df_out_fin = pd.concat(df_outs, axis=0)

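# Per-tile normalization
Assume that no additional farmland can be made available. The predicted counts in each tile are therefore rescaled so that they sum to the total count observed for that tile in the input data.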

In [227]:
# Total 'ct' per (lat, lng) tile: predicted (df_out_by_loc) and observed
# (df_agri_by_loc). 'ct' is selected *before* aggregating so that unrelated
# columns such as 'type' are never summed; summing them was wasted work and
# fails outright when a column is not summable.
df_out_by_loc = df_out_fin.groupby(['lat', 'lng'])[['ct']].sum()
df_agri_by_loc = df_agri.groupby(['lat', 'lng'])[['ct']].sum()

In [228]:
# Put predicted and observed tile totals side by side (observed total becomes
# 'ct_agri'), then compute the per-tile scale factor observed / predicted.
# NOTE(review): a tile whose predicted total is 0 yields an inf or NaN
# multiplier here - confirm whether that can occur.
df_tile_usage_norm = df_out_by_loc.join(df_agri_by_loc, how='left', rsuffix='_agri')
df_tile_usage_norm['multiplier'] = (
    df_tile_usage_norm['ct_agri'] / df_tile_usage_norm['ct']
)

In [229]:
# Attach each tile's multiplier to the per-type predictions and rescale them.
# df_out_fin is re-indexed by (lat, lng) here, so this cell cannot be re-run
# without first re-running the cell that builds df_out_fin.
df_out_fin = df_out_fin.set_index(keys=['lat', 'lng']).join(
    df_tile_usage_norm[['multiplier']], how='left'
)
df_out_fin['ct_use_normed'] = df_out_fin['multiplier'] * df_out_fin['ct']

In [230]:
# Move lat/lng back from the index into columns and keep only the export columns.
df_out_fin = df_out_fin.reset_index(drop=False).loc[
    :, ['lat', 'lng', 'type', 'ct_use_normed']
]

In [231]:
# Write the normalized predictions to disk without the row index.
df_out_fin.to_csv(
    path_or_buf='/private/tmp/test_agged_predictions.csv',
    index=False,
)

# Visualization check

In [232]:
import gmaps

# The Google Maps API key is a credential and must not be stored in the
# notebook. It is read from the GOOGLE_API_KEY environment variable instead;
# the key that was previously committed here should be revoked.
_gmaps_api_key = os.environ.get("GOOGLE_API_KEY")
if not _gmaps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable to your Google Maps API key"
    )
gmaps.configure(api_key=_gmaps_api_key)

In [233]:
# Rows to visualize: only the predictions whose 'type' equals 1.
df_plot = df_out_fin.loc[df_out_fin['type'] == 1]

In [237]:
# Heatmap of the normalized predictions in df_plot, weighted by 'ct_use_normed'.
# gmaps is already configured with the API key in the earlier cell, so the
# redundant configure call (which repeated the hard-coded key, and ran after
# the map had already been created) is removed.
m = gmaps.Map()
m.add_layer(
    gmaps.heatmap_layer(
        df_plot[['lat','lng']].values,
        weights=df_plot['ct_use_normed'],
        point_radius=0.4,
        # NOTE(review): hard-coded intensity cap; presumably taken from the
        # data at the time of writing - confirm or derive it from df_plot.
        max_intensity=1.382335e+05,
        dissipating=False
    )
)
# Last expression of the cell: renders the map widget.
m