In [1]:
import os
import shutil
import subprocess
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import time
from utils.shapefiles import sampleShapefileLocations
from analysis.peaksdata import filterPeaksHaversineDist
from utils.divtree_gen import *
from utils.seqdata_gen import *
from utils.seq2demodst import *
# process each region (note: it takes a long time!)
regionShapesDir = 'data/regionShapes'
regionPeaksDir = 'data/regionPeaks'
regionSeqsDir = 'data/regionSeqs'
regionTreeSeqsDir = 'data/regionTreeSeqs'
diskRadius = 30

regionShapes = ['andes_peru.shp']

## For RNN, sequence representation of the tree

In [2]:


for region in regionShapes:
    st = time.time()
    # sample stats locations inside polygon, separated at least 1/2 radius distance
    sampleLocations = sampleShapefileLocations(os.path.join(regionShapesDir, region), diskRadius)
    print(region, ": ", len(sampleLocations), "samples")
    # region peaks DB
    df = pd.read_csv(os.path.join(regionPeaksDir, region.replace('.shp', '.csv')))

    allTrees = []
    # compute sequences
    for di,diskCenter in tqdm(enumerate(sampleLocations)):
        # filter peaks in disk using haversine distance
        peaks = filterPeaksHaversineDist(df, diskCenter, diskRadius)
        # skip if not enough peaks
        if peaks.shape[0] < 20:
            continue
        # build the divide tree
        rootNode = genDivideTree(peaks)
        seqOfTree = genFullSeq(rootNode, isDFS=True)
        allTrees.append(seqOfTree)
    fout = open(os.path.join(regionTreeSeqsDir, region.replace('.shp', '.txt')), 'w') 
    for seqTree in allTrees:
        fout.write(",".join( [";".join([str(i) for i in v] ) for v in s]))
        fout.write('\n')
    fout.close()

    print('%s: %3d samples, %3d sequences, %d s'%(region, len(sampleLocations), len(allSeqs), time.time() - st)) 

print('done!')

0it [00:00, ?it/s]

andes_peru.shp :  213 samples


213it [31:37,  8.91s/it]


NameError: name 'allSeqs' is not defined

## For Demo

In [None]:


for region in regionShapes:
    st = time.time()
    # sample stats locations inside polygon, separated at least 1/2 radius distance
    sampleLocations = sampleShapefileLocations(os.path.join(regionShapesDir, region), diskRadius)
    print(region, ": ", len(sampleLocations), "samples")
    # region peaks DB
    df = pd.read_csv(os.path.join(regionPeaksDir, region.replace('.shp', '.csv')))

    allSeqs = []
    # compute sequences
    for di,diskCenter in enumerate(sampleLocations):
        # filter peaks in disk using haversine distance
        peaks = filterPeaksHaversineDist(df, diskCenter, diskRadius)
        # skip if not enough peaks
        if peaks.shape[0] < 20:
            continue
        paths = genSeq(peaks)
        seqs = [p for p in paths if len(p) > 10]
        allSeqs += seqs
        # for debug, draw Seqs tree
        # drawSeq(peaks, seqs)
        print('%s: %3d/%3d samples '%(region, di+1, len(sampleLocations)), end='\r' if di+1 < len(sampleLocations) else '\n')
    fout = open(os.path.join(regionSeqsDir, region.replace('.shp', '.txt')), 'w') 
    for s in allSeqs:
        fout.write(" ".join([str(v) for v in s]))
        fout.write('\n')
    fout.close()

    print('%s: %3d samples, %3d sequences, %d s'%(region, len(sampleLocations), len(allSeqs), time.time() - st)) 

print('done!')