In [265]:
%matplotlib inline
import os
from collections import defaultdict, Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_predict

In [266]:
# Lookup table of the land-cover classes this notebook treats as crops.
# Layout: one class per line, integer code, a TAB, then the display name.
# The parsing cell turns this text into the `crop_key` dict, which is then used
# to keep only crop rows of the aggregated input data.
# NOTE(review): the codes look like USDA NASS Cropland Data Layer class codes —
# confirm the source/version before extending the list.
crop_types = """
1	Corn
2	Cotton
3	Rice
4	Sorghum
5	Soybeans
6	Sunflower
10	Peanuts
11	Tobacco
12	Sweet Corn
13	Pop or Orn Corn
14	Mint
21	Barley
22	Durum Wheat
23	Spring Wheat
24	Winter Wheat
25	Other Small Grains
26	Dbl Crop WinWht/Soybeans
27	Rye
28	Oats
29	Millet
30	Speltz
31	Canola
32	Flaxseed
33	Safflower
34	Rape Seed
35	Mustard
36	Alfalfa
38	Camelina
39	Buckwheat
41	Sugarbeets
42	Dry Beans
43	Potatoes
44	Other Crops
45	Sugarcane
46	Sweet Potatoes
47	Misc Vegs & Fruits
48	Watermelons
49	Onions
50	Cucumbers
51	Chick Peas
52	Lentils
53	Peas
54	Tomatoes
55	Caneberries
56	Hops
57	Herbs
58	Clover/Wildflowers
61	Fallow/Idle Cropland
66	Cherries
67	Peaches
68	Apples
69	Grapes
70	Christmas Trees
71	Other Tree Crops
72	Citrus
74	Pecans
75	Almonds
76	Walnuts
77	Pears
204	Pistachios
205	Triticale
206	Carrots
207	Asparagus
208	Garlic
209	Cantaloupes
210	Prunes
211	Olives
212	Oranges
213	Honeydew Melons
214	Broccoli
216	Peppers
217	Pomegranates
218	Nectarines
219	Greens
220	Plums
221	Strawberries
222	Squash
223	Apricots
224	Vetch
225	Dbl Crop WinWht/Corn
226	Dbl Crop Oats/Corn
227	Lettuce
229	Pumpkins
230	Dbl Crop Lettuce/Durum Wht
231	Dbl Crop Lettuce/Cantaloupe
232	Dbl Crop Lettuce/Cotton
233	Dbl Crop Lettuce/Barley
234	Dbl Crop Durum Wht/Sorghum
235	Dbl Crop Barley/Sorghum
236	Dbl Crop WinWht/Sorghum
237	Dbl Crop Barley/Corn
238	Dbl Crop WinWht/Cotton
239	Dbl Crop Soybeans/Cotton
240	Dbl Crop Soybeans/Oats
241	Dbl Crop Corn/Soybeans
242	Blueberries
243	Cabbage
244	Cauliflower
245	Celery
246	Radishes
247	Turnips
248	Eggplants
249	Gourds
250	Cranberries
254	Dbl Crop Barley/Soybeans
"""

# Companion table, same "code TAB name" layout, listing the classes that are not
# treated as crops. Rows are filtered by membership in `crop_key`, so in the
# visible code this table is informational only.
# Some names appear under two different codes (Barren: 65 and 131;
# Shrubland: 64 and 152).
# NOTE: the parsing cell rebinds this same name from the raw text to a dict.
non_crop_types = """
37	Other Hay/Non Alfalfa
59	Sod/Grass Seed
60	Switchgrass
63	Forest
64	Shrubland
65	Barren
81	Clouds/No Data
82	Developed
83	Water
87	Wetlands
88	Nonag/Undefined
92	Aquaculture
111	Open Water
112	Perennial Ice/Snow
121	Developed/Open Space
122	Developed/Low Intensity
123	Developed/Med Intensity
124	Developed/High Intensity
131	Barren
141	Deciduous Forest
142	Evergreen Forest
143	Mixed Forest
152	Shrubland
176	Grass/Pasture
190	Woody Wetlands
195	Herbaceous Wetlands
"""

In [267]:
def _parse_code_table(table):
    """Convert a ``code<TAB>name`` listing into a ``{int code: name}`` dict.

    Parameters
    ----------
    table : str or dict
        Multi-line text with one ``<integer code><TAB><name>`` entry per line;
        blank lines are ignored. A dict is also accepted and returned as a
        shallow copy, so this cell can be re-run after ``non_crop_types`` has
        already been rebound to its parsed form (previously that re-run failed
        with ``AttributeError: 'dict' object has no attribute 'split'``).

    Returns
    -------
    dict
        Mapping from integer class code to class name.

    Raises
    ------
    ValueError
        If the first field of a non-blank line is not an integer.
    IndexError
        If a non-blank line contains no TAB separator.
    """
    if isinstance(table, dict):
        return dict(table)

    parsed = {}
    for line in table.split("\n"):
        if not line.strip():
            continue
        # Split once per line and reuse both fields.
        fields = line.split("\t")
        parsed[int(fields[0])] = fields[1]
    return parsed


crop_key = _parse_code_table(crop_types)
# Kept under its original name (now holding the parsed dict) for compatibility.
non_crop_types = _parse_code_table(non_crop_types)

In [268]:
# Load the aggregated crop counts and keep only the rows whose class code is
# listed in `crop_key` (everything else is considered non-crop).
df_agri = pd.read_csv('/private/tmp/test_aggregated_crops.csv')
is_crop_row = df_agri['type'].isin(crop_key)
df_agri = df_agri[is_crop_row]

# Distinct (lat, lng) tile coordinates that carry at least one crop row.
uniq_latlngs = {tuple(pair) for pair in df_agri[['lat', 'lng']].values}

# Tile spacing, taken as the median step between consecutive distinct values.
# NOTE(review): `unique()` returns values in order of first appearance, not
# sorted, so both the magnitude and the SIGN of these ticks depend on the row
# order of the CSV. The weather-window cell relies on the resulting signs, so
# do not sort here without updating that cell as well.
lat_steps = np.diff(df_agri['lat'].unique())
lng_steps = np.diff(df_agri['lng'].unique())
lat_tick = np.median(lat_steps)
lng_tick = np.median(lng_steps)

# Crop class codes that actually occur in the data.
uniq_argi_types = df_agri['type'].unique()

In [270]:
%%time
# Average every weather variable over each crop tile.
# Result: weather_aggs[(lat, lng)][variable] -> mean of the 'val' column of the
# weather rows that fall inside the tile (NaN when no weather row falls inside).
weather_aggs = defaultdict(Counter)
weather_vals = ['prcp', 'tmin', 'tmax', 'vp']


def _half_open_bounds(origin, tick):
    """Return ``(low, high)`` for the interval spanned by origin and origin + tick.

    The tile window is ``low <= x < high`` regardless of the sign of ``tick``.
    The previous code hard-wired opposite orientations for the two axes
    (``[lat, lat + lat_tick)`` but ``[lng + lng_tick, lng)``), which selects
    nothing unless ``lat_tick > 0`` and ``lng_tick < 0``. For exactly that sign
    combination this helper yields the same windows as before.
    """
    far = origin + tick
    if origin <= far:
        return origin, far
    return far, origin


for val in weather_vals:
    df_weather = pd.read_csv('/private/tmp/{}.csv'.format(val), index_col=0)
    # Hoisted: the coordinate columns do not change inside the tile loop.
    weather_lat = df_weather['lat']
    weather_lng = df_weather['lng']
    for lat, lng in uniq_latlngs:
        # NOTE(review): the window extends from the tile coordinate in the
        # direction of each tick's sign — confirm which tile corner (lat, lng)
        # is meant to denote.
        lat_lo, lat_hi = _half_open_bounds(lat, lat_tick)
        lng_lo, lng_hi = _half_open_bounds(lng, lng_tick)
        in_tile = (
            (weather_lat >= lat_lo) & (weather_lat < lat_hi)
            & (weather_lng >= lng_lo) & (weather_lng < lng_hi)
        )
        weather_aggs[(lat, lng)][val] += df_weather.loc[in_tile, 'val'].mean()

CPU times: user 2min 9s, sys: 2.33 s, total: 2min 12s
Wall time: 2min 32s


In [271]:
# Reshape the nested {(lat, lng): {variable: mean}} mapping into a table with
# one row per tile and one column per weather variable, label the two index
# levels, and discard tiles that are missing any variable.
weather_table = pd.DataFrame(weather_aggs).T
weather_table.index = weather_table.index.set_names(['lat', 'lng'])
df_weather_by_loc = weather_table.dropna()

In [272]:
# Fixed seed so the forests, and therefore the exported predictions, are
# reproducible from one run of the notebook to the next.
RF_RANDOM_STATE = 0
# Cross-validated error is only reported for crops with more rows than this.
MIN_ROWS_FOR_CV = 50

# For every crop type: fit a random forest from tile weather to the observed
# count 'ct', then predict a count for every tile that has weather data.
# Each per-crop result (lat, lng, type, ct) is collected in df_outs.
df_outs = []
rfr = RandomForestRegressor(n_estimators=20, random_state=RF_RANDOM_STATE)
for ag_type in uniq_argi_types:
    # Training rows: tiles where this crop was observed, joined with weather.
    df_one_type = df_agri[df_agri['type']==ag_type].set_index(['lat', 'lng'])
    df_train = df_one_type.join(df_weather_by_loc[weather_vals], how='inner')
    # print(...) with a single argument works on both Python 2 and 3 and matches
    # the other print calls in this notebook.
    print("type {} ({}): {} training data".format(ag_type, crop_key[ag_type], len(df_train)))

    if len(df_train) == 0:
        # The estimator cannot be fitted on zero samples; skip this crop rather
        # than aborting the whole loop.
        print("no tiles with weather data; skipping")
        print("")
        continue

    # get CV error
    if len(df_train) > MIN_ROWS_FOR_CV:
        cv_ct_ = cross_val_predict(rfr, df_train[weather_vals], df_train['ct'])
        print("RMSE: {}, median: {}, mean: {}".format(
            np.sqrt(mean_squared_error(df_train['ct'], cv_ct_)),
            np.median(df_train['ct']),
            np.mean(df_train['ct'])
        ))

    # do the fitting and predicting
    rfr.fit(df_train[weather_vals], df_train['ct'])
    ct_ = rfr.predict(df_weather_by_loc[weather_vals])

    # format for export
    # The prediction is also stored as a 'pred__<type>' column on
    # df_weather_by_loc, as before; only weather_vals are ever used as features.
    pred_col = 'pred__'+str(ag_type)
    df_weather_by_loc[pred_col] = ct_
    df_out = df_weather_by_loc[[pred_col]].reset_index()
    df_out['type'] = ag_type
    # Truncate the fractional part to get integer counts.
    df_out['ct'] = df_out[pred_col].astype(int)

    # append
    df_outs.append(df_out[['lat','lng','type','ct']])
    print("")

type 61 (Fallow/Idle Cropland): 2446 training data
RMSE: 106594.879357, median: 2810.0, mean: 37279.0936222

type 69 (Grapes): 496 training data
RMSE: 51665.0228735, median: 21.0, mean: 10509.1229839

type 205 (Triticale): 1156 training data
RMSE: 3686.14988933, median: 56.0, mean: 978.269896194

type 14 (Mint): 74 training data
RMSE: 2291.1387357, median: 19.0, mean: 475.283783784

type 21 (Barley): 1318 training data
RMSE: 29673.534718, median: 153.0, mean: 6393.08801214

type 24 (Winter Wheat): 2061 training data
RMSE: 127206.639504, median: 1722.0, mean: 40708.6365842

type 28 (Oats): 1873 training data
RMSE: 11389.0831635, median: 254.0, mean: 2514.74319274

type 36 (Alfalfa): 2112 training data
RMSE: 73620.1603999, median: 5437.0, mean: 33278.1216856

type 58 (Clover/Wildflowers): 913 training data
RMSE: 5832.07098149, median: 37.0, mean: 829.530120482

type 76 (Walnuts): 123 training data
RMSE: 50768.7791746, median: 42.0, mean: 15719.6341463

type 75 (Almonds): 86 training data

RMSE: 38711.0604105, median: 1184.0, mean: 15561.2063492

type 240 (Dbl Crop Soybeans/Oats): 424 training data
RMSE: 684.748041378, median: 26.0, mean: 205.351415094

type 241 (Dbl Crop Corn/Soybeans): 101 training data
RMSE: 293.700246555, median: 3.0, mean: 95.1782178218

type 254 (Dbl Crop Barley/Soybeans): 284 training data
RMSE: 5264.05604244, median: 22.0, mean: 1070.02464789



In [273]:
# Stack the per-crop prediction frames into one long (lat, lng, type, ct) table.
df_out_fin = pd.concat(df_outs, axis=0)

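# Per-tile normalization
Assume that no additional farmland can be made available: within each tile, the predicted counts are rescaled so that they sum to the crop count currently observed in that tile.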

In [274]:
# Per-tile totals: predicted counts summed over all crop types, and observed
# crop counts from the input data.
# 'ct' is selected BEFORE aggregating so that only that column is summed;
# previously every column (including the 'type' codes) was summed and then
# thrown away.
df_out_by_loc = df_out_fin.groupby(['lat', 'lng'])[['ct']].sum()
df_agri_by_loc = df_agri.groupby(['lat', 'lng'])[['ct']].sum()

In [275]:
# Put predicted ('ct') and observed ('ct_agri') per-tile totals side by side,
# then derive the factor that scales the predicted total to the observed one.
# NOTE(review): a tile whose predicted total is 0 yields an inf/NaN multiplier.
df_tile_usage_norm = df_out_by_loc.join(df_agri_by_loc, rsuffix='_agri')
observed_total = df_tile_usage_norm['ct_agri']
predicted_total = df_tile_usage_norm['ct']
df_tile_usage_norm['multiplier'] = observed_total/predicted_total

In [276]:
# Attach each tile's multiplier to its per-crop predictions and scale them.
tile_multiplier = df_tile_usage_norm[['multiplier']]
df_out_fin = df_out_fin.set_index(['lat', 'lng']).join(tile_multiplier)
df_out_fin['ct_use_normed'] = df_out_fin['multiplier']*df_out_fin['ct']

In [277]:
# Back to flat columns, keeping only what is exported.
export_columns = ['lat','lng','type','ct_use_normed']
df_out_fin = df_out_fin.reset_index()[export_columns]

In [278]:
# Write the normalized predictions next to the input file (no index column).
predicted_csv_path = '/private/tmp/test_aggregated_crops_predicted.csv'
df_out_fin.to_csv(predicted_csv_path, index=False)

# Viz check

In [279]:
import gmaps

# The Google Maps API key is read from the environment so that it is never
# stored in the notebook. The key that used to be hard-coded here has been
# exposed and should be revoked and replaced.
_gmaps_api_key = os.environ.get("GOOGLE_MAPS_API_KEY")
if not _gmaps_api_key:
    raise RuntimeError(
        "Set the GOOGLE_MAPS_API_KEY environment variable before starting "
        "the notebook kernel."
    )
gmaps.configure(api_key=_gmaps_api_key)

In [289]:
# Pick a single crop class to inspect on the map and pull out its rows.
crop_type = 57
crop_name = crop_key[crop_type]
is_selected_crop = df_out_fin['type'] == crop_type
df_plot = df_out_fin.loc[is_selected_crop]
print(crop_name)

Herbs


In [290]:
# Heatmap of the normalized predictions for the selected crop.
# gmaps is configured once, in the cell that imports it, so the API key is not
# repeated here.
m = gmaps.Map()
m.add_layer(
    gmaps.heatmap_layer(
        df_plot[['lat','lng']].values,
        weights=df_plot['ct_use_normed'],
        point_radius=0.4,
        # Cap the colour scale at the 75th percentile of the weights.
        max_intensity=df_plot['ct_use_normed'].quantile(0.75),
        dissipating=False
    )
)
m