In [1]:
import pandas as pd, numpy as np, json, math
import datetime as dt

In [3]:
from obs.models import Observation

## This is for observations that are keyed to existing sites
see Geometry notebook for creating new sites from lat, long columns.

Data procedure: 
1. First get sites into db with Geometry notebook. 
2. Then get photos (see Retrieving photos notebook for grabbing them from urls, and so you can give photos filenames that can be associated with observations by id or something).
3. Lastly, format the data with JSON field. This can include URLs for photos.

In [2]:
df = pd.read_excel('../atlasdata/BELCJul2019samples.xls')
#watch out for missing or blank fields, do cleanup as needed
df.dtypes

site_id                    int64
observer_id                int64
type_id                    int64
parentobs_id               int64
sampleID                  object
num_composited             int64
date              datetime64[ns]
top_cm                     int64
bottom_cm                  int64
samplers                  object
description               object
dtype: object

In [10]:
df = df.rename(columns={'site_description':'description','date':'olddate'})

In [57]:
#add foreign key fields for atlasbiowork Observation model
df['observer_id'] = 1 # this is my observer_id
df.rename(columns={'pk':'site_id'}, inplace=True)
df['type_id'] = 19 #for infiltration

In [68]:
#Collect timings columns in format 'mm:ss, mm:ss, mm:ss'
#convert decimal minutes to mm:ss
def pad(n):
    if n > 0:
        return str(math.floor(n)) + ':' + str.zfill(str(math.floor((n % 1)*60)),2)
    else:
        return ''

timing_cols=['ring_1:inf_time1','ring_1:inf_time2','ring_1:inf_time3','ring_1:inf_time4']   
for i, row in df[timing_cols].iterrows():
    l = list(row)
    m = list(pad(value) for value in l if not math.isnan(value))
    df.loc[i,'timings_1'] = ', '.join(m)


In [3]:
#we need YYYY-MM-DD format as string for the JSON field
df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d')
df.dtypes

site_id            int64
observer_id        int64
type_id            int64
parentobs_id       int64
sampleID          object
num_composited     int64
date              object
top_cm             int64
bottom_cm          int64
samplers          object
description       object
dtype: object

this next part is tricky. Must use JSON functions, not to_dict().

In [4]:
site_fields = ['id', 'name', 'geometry', 'accuracy']
observation_fields = ['values','observer_id', 'site_id', 'type_id', 'parentobs_id']
samples_fields = ['top_cm', 'bottom_cm', 'description','num_composited','sampleID','date','samplers']
analysis_values=['sampleID','analysis', 'lab', 'date_analyzed', 'unit2', 'value2', 'unit1',
       'value1', 'unit3', 'value3', 'analysisNote']

In [5]:
my_values = samples_fields
#infil_values = ['timings_1','description','date']#columns for JSON values field for this datatype.
s = df[my_values].to_json(orient='records')
t = pd.Series(json.loads(s))
v = pd.DataFrame(t, columns=['values'])
#now we have a JSON values field that we can add to df

In [6]:
#reduce unneeded columns
df= df[['site_id',
       'observer_id', 'type_id','parentobs_id']]

In [7]:
#merge in the values columns. The indexes should correspond since they are from the same df
df = df.merge(v, left_index=True, right_index=True)

In [8]:
df

Unnamed: 0,site_id,observer_id,type_id,parentobs_id,values
0,520,1,31,2363,"{'top_cm': 0, 'bottom_cm': 10, 'description': ..."
1,520,1,31,2365,"{'top_cm': 10, 'bottom_cm': 20, 'description':..."
2,520,1,31,2367,"{'top_cm': 20, 'bottom_cm': 35, 'description':..."
3,521,1,31,2373,"{'top_cm': 20, 'bottom_cm': 35, 'description':..."
4,522,1,31,2375,"{'top_cm': 0, 'bottom_cm': 10, 'description': ..."
5,522,1,31,2377,"{'top_cm': 10, 'bottom_cm': 20, 'description':..."
6,522,1,31,2379,"{'top_cm': 20, 'bottom_cm': 35, 'description':..."
7,523,1,31,2381,"{'top_cm': 0, 'bottom_cm': 10, 'description': ..."
8,523,1,31,2383,"{'top_cm': 10, 'bottom_cm': 20, 'description':..."
9,523,1,31,2385,"{'top_cm': 20, 'bottom_cm': 35, 'description':..."


In [11]:
# df = df[['site_id', 'observer_id', 'type_id', 'parentobs_id', 'values']]
df['values'] = df['values'].str.replace("\"","'")

In [13]:
df.dtypes

site_id           int64
observer_id       int64
type_id           int64
parentobs_id      int64
values          float64
dtype: object

In [10]:
#save it for good measure
df.to_csv('../atlasdata/belc.csv', sep='\t',index=False)

<font color= red>WARNING: db operation ahead</font>
don't use bulk_create. Values field has to be a dict, not a str

site_id                                                       462
observer_id                                                    45
type_id                                                        33
parentobs_id                                                 4450
values          {'sampleID': 'TR1A', 'unit3': 'total organic c...
Name: 0, dtype: object

In [18]:
df['values'] = df['values'].astype(str)

In [26]:
#now this seems to work!!!
for row in df.itertuples():
    p = Observation(
        site_id=row[1],
        observer_id=row[2],
        type_id=row[3],
        parentobs_id=row[4],
        values=row[5])
    p.save()  #UNCOMMENT AT YOUR PERIL!!!! 
    
    