# Data Preprocessing

## Import Packages

In [293]:
import pandas as pd
import random
import json
import numpy as np

## Read Files

In [294]:
# Folder holding the four Ornitela GPS exports (one CSV per tagged bird).
DATA_DIR = r'C:\Users\29911\Downloads\bird\data\Data_Hu Dandan_multiple_20200629'

# Load the four trajectories in a fixed order so traj1..traj4 keep their meaning.
traj1, traj2, traj3, traj4 = (
    pd.read_csv(DATA_DIR + '\\' + file_name)
    for file_name in (
        'Ornitela#GPS#Orni17039#2018-08-03.csv',
        'Ornitela#GPS#Orni17499#2017-08-07.csv',
        'Ornitela#GPS#Orni180426#2018-07-13.csv',
        'Ornitela#GPS#Orni181425#2018-07-08.csv',
    )
)

# Preview the first rows of the first trajectory.
traj1.head()

Unnamed: 0,UUID,Timestamp,Longitude,Latitude,Altitude,HorAccuracy,Velocity,Azimuth,Satcount
0,Orni17039,2018-08-04T00:07:35Z,151.866577,70.704399,-181,1.4,0.0,29,6
1,Orni17039,2018-08-04T00:17:35Z,151.874908,70.703972,-96,1.4,0.277778,91,6
2,Orni17039,2018-08-04T00:27:58Z,151.878937,70.703163,-10,1.4,0.0,55,5
3,Orni17039,2018-08-04T00:37:34Z,151.880768,70.702072,-28,1.4,0.277778,180,5
4,Orni17039,2018-08-04T00:47:37Z,151.88118,70.701691,-43,1.4,0.277778,117,5


## Basic Statistics

In [295]:
# Summary statistics (count, mean, std, quartiles, min/max) for the first trajectory.
traj1.describe()

Unnamed: 0,Longitude,Latitude,Altitude,HorAccuracy,Velocity,Azimuth,Satcount
count,57089.0,57089.0,58820.0,58820.0,58820.0,58820.0,58820.0
mean,138.24131,52.388495,16.252771,1.289439,0.613605,176.473699,6.120826
std,11.548027,15.065494,104.907227,0.517321,2.910612,107.581909,2.244807
min,126.228523,36.553467,-1806.0,0.0,0.0,0.0,0.0
25%,126.816452,36.83889,-6.0,1.0,0.0,83.0,5.0
50%,132.69162,46.699409,3.0,1.3,0.0,177.0,6.0
75%,151.720505,70.678825,25.0,1.5,0.277778,270.0,7.0
max,153.753036,70.879745,8928.0,15.9,34.444444,359.0,18.0


## Generate other data based on four dataframes

I assume that the four files represent four different bird species. Because migrating birds travel in flocks, I generate another 249 trajectories for each species based on the known route (250 trajectories per group, four groups in total).

#### Bird velocities are concentrated in a few distinct values, so I build a list of these values for generating the synthetic data later

In [296]:
def v_list(df):
    '''Collect the distinct values of the 'Velocity' column.

    Returns a tuple (values, count): the array of unique velocities in
    order of first appearance, and how many of them there are.
    '''
    distinct_speeds = df['Velocity'].unique()
    return distinct_speeds, len(distinct_speeds)

#### The idea behind the random generation of the simulated data is to assume that the birds stay very close to each other during migration, so the 250 birds of a group are spread within roughly 100 metres of the known trajectory (1 degree ≈ 111.1 km, so 0.001° ≈ 111 m)

In [297]:
def gene_250(df):
    '''Return the original track plus 249 simulated copies of it.

    df : one bird's GPS track; must contain the columns 'UUID',
         'Longitude', 'Latitude' and 'Velocity'. It is not modified.

    Each simulated track
      * gets the ID '<original UUID>-<k>' for k = 1..249,
      * is shifted as a whole by one random longitude offset and one random
        latitude offset, each drawn from [-0.001, 0.001] degrees,
      * has every row's velocity redrawn at random from the distinct
        velocity values observed in df.

    Returns a new DataFrame (index reset) with the original rows first,
    followed by the simulated tracks in order of k.
    '''
    # Distinct velocity values observed for this bird; simulated rows sample from these.
    velocities = df['Velocity'].unique()
    # Positional access so the function also works for single-row frames
    # and frames whose index does not contain the label 1.
    base_id = df['UUID'].iloc[0]

    tracks = [df.copy()]
    for k in range(1, 250):
        traj_new = df.copy()
        # Create a unique ID for each route
        traj_new['UUID'] = base_id + '-' + str(k)
        # Suppose the simulated bird flies within ~100 meters of the known trajectory:
        # one constant offset per track and per axis.
        traj_new['Longitude'] = traj_new['Longitude'] + random.uniform(-1, 1)/1000
        traj_new['Latitude'] = traj_new['Latitude'] + random.uniform(-1, 1)/1000
        # For each row, randomly choose a value from the velocity value list
        # (one draw per row, done in a single call instead of a per-row .loc write).
        traj_new['Velocity'] = random.choices(velocities, k=len(traj_new))
        tracks.append(traj_new)

    # Concatenate once at the end; concatenating inside the loop re-copies
    # all previously generated rows on every iteration.
    return pd.concat(tracks, ignore_index=True)

In [298]:
# Apply the function for each csv file
# Expand every source trajectory into its group of 250 (original + 249 simulated),
# processing the files in their original order.
result1, result2, result3, result4 = (
    gene_250(track) for track in (traj1, traj2, traj3, traj4)
)
# Stack the four groups into a single frame holding all 1000 trajectories
result = pd.concat([result1, result2, result3, result4], ignore_index=True)

In [299]:
# (rows, columns) of the combined frame of original and simulated trajectories.
result.shape

(69685000, 9)

## Trajectory Generalization
#### GPS tracks are typically pretty messy (one fix is recorded about every ten minutes). To tidy the visualization, I generalize each track to one location per day (the Ramer–Douglas–Peucker algorithm is not suitable in our case) and then filter the points by distance (keeping only the starting and ending points and the points more than 100 km away from the previous one), which works well at the display scale.

In [301]:
# Inspect the column dtypes before deriving a calendar date from Timestamp.
result.dtypes

UUID            object
Timestamp       object
Longitude      float64
Latitude       float64
Altitude         int64
HorAccuracy    float64
Velocity       float64
Azimuth          int64
Satcount         int64
dtype: object

#### Filter by date

In [302]:
# Extract the date from Timestamp column
# Parse the Timestamp strings and keep only the calendar day in a new 'date' column
result['date'] = result['Timestamp'].pipe(pd.to_datetime).dt.date
result

Unnamed: 0,UUID,Timestamp,Longitude,Latitude,Altitude,HorAccuracy,Velocity,Azimuth,Satcount,date
0,Orni17039,2018-08-04T00:07:35Z,151.866577,70.704399,-181,1.4,0.000000,29,6,2018-08-04
1,Orni17039,2018-08-04T00:17:35Z,151.874908,70.703972,-96,1.4,0.277778,91,6,2018-08-04
2,Orni17039,2018-08-04T00:27:58Z,151.878937,70.703163,-10,1.4,0.000000,55,5,2018-08-04
3,Orni17039,2018-08-04T00:37:34Z,151.880768,70.702072,-28,1.4,0.277778,180,5,2018-08-04
4,Orni17039,2018-08-04T00:47:37Z,151.881180,70.701691,-43,1.4,0.277778,117,5,2018-08-04
...,...,...,...,...,...,...,...,...,...,...
69684995,Orni181425-249,2019-12-06T13:09:29Z,91.296734,29.881055,3735,1.3,24.444444,138,7,2019-12-06
69684996,Orni181425-249,2019-12-06T13:19:28Z,91.296894,29.880942,3716,1.4,16.666667,299,7,2019-12-06
69684997,Orni181425-249,2019-12-06T13:29:49Z,91.296894,29.880948,3722,1.0,36.666667,247,9,2019-12-06
69684998,Orni181425-249,2019-12-06T13:39:28Z,91.296840,29.880956,3732,0.9,27.777778,232,11,2019-12-06


In [327]:
# Calculate the midpoint of each bird's position throughout the day and use it as the final visualization 
# Average each bird's position, altitude and velocity over every day; this daily
# mean point is what the final visualization uses.
# The numeric columns are selected BEFORE aggregating: calling .mean() on the whole
# group would also try to average the string 'Timestamp' column, which raises a
# TypeError on pandas >= 2.0 and is wasted work on older versions.
gene = (
    result
    .groupby(['UUID', 'date'])[['Latitude', 'Longitude', 'Altitude', 'Velocity']]
    .mean()
    .reset_index()
)
gene

Unnamed: 0,UUID,date,Latitude,Longitude,Altitude,Velocity
0,Orni17039,2018-08-04,70.692198,151.912495,-8.847222,0.059799
1,Orni17039,2018-08-05,70.694873,151.906805,-4.458333,0.054012
2,Orni17039,2018-08-06,70.705443,151.894010,-2.125000,0.044367
3,Orni17039,2018-08-07,70.705096,151.911464,-9.416667,0.040509
4,Orni17039,2018-08-08,70.706498,151.924265,-6.208333,0.032793
...,...,...,...,...,...,...
527745,Orni181425-99,2019-12-02,29.883648,91.311764,3742.388889,15.918210
527746,Orni181425-99,2019-12-03,29.880203,91.271444,3750.104167,16.400463
527747,Orni181425-99,2019-12-04,29.891185,91.287189,3747.958333,18.406636
527748,Orni181425-99,2019-12-05,29.879446,91.301970,3745.847222,17.256944


In [328]:
# Sanity check: number of distinct trajectory IDs left after the daily aggregation
a = pd.unique(gene['UUID'])
a.size

1000

#### Calculate distance between points

In [329]:
# The function to calculate the distance between two points (lat/lng to km)

from math import radians, cos, sin, asin, sqrt

def haversine(df):
    """
    Great-circle distance (km) between each row and the row before it.

    df : dataframe with 'Longitude' and 'Latitude' columns in decimal degrees;
         rows are compared in their current order.

    Returns a Series aligned with df's index. The first row has no predecessor,
    so its distance is NaN; rows with a missing coordinate (or following one)
    are NaN as well.
    """
    # convert decimal degrees to radians; shift() pairs every row with its predecessor.
    # Rows are paired by position, so this works for any index (the previous
    # label-based `df.loc[1:]` slice required an integer index).
    lon_prev = np.radians(df['Longitude'].shift())
    lat_prev = np.radians(df['Latitude'].shift())
    lon_curr = np.radians(df['Longitude'])
    lat_curr = np.radians(df['Latitude'])

    # haversine formula
    dlon = lon_curr - lon_prev
    dlat = lat_curr - lat_prev
    a = np.sin(dlat/2.0)**2 + np.cos(lat_prev) * np.cos(lat_curr) * np.sin(dlon/2.0)**2
    # Clip to 1 so floating-point rounding for near-antipodal points cannot push
    # the arcsin argument out of its domain (NaN values pass through unchanged).
    c = 2 * np.arcsin(np.sqrt(a).clip(upper=1.0))
    # Radius of earth in kilometers is 6371
    km = 6371 * c
    return km

In [340]:
# Distance (km) between each daily point and the previous row.
dist_km = haversine(gene)
# haversine() compares consecutive rows of the whole frame, so the first row of
# every bird would otherwise get the distance to the previous bird's last point.
# Blank those rows: a track's starting point has no previous position.
is_track_start = gene['UUID'].ne(gene['UUID'].shift())
gene['Dist'] = dist_km.mask(is_track_start)

# Keep each track's starting point (Dist is NaN) and every daily point that lies
# more than 100 km from the previous day's point.
# .copy() plus plain assignment replaces fillna(inplace=True) on a filtered slice,
# which is chained assignment: it warns, and under copy-on-write it changes nothing.
gene = gene[(gene['Dist'] > 100) | (gene['Dist'].isnull())].copy()
gene['Dist'] = gene['Dist'].fillna(0)
gene

Unnamed: 0,UUID,date,Latitude,Longitude,Altitude,Velocity,Dist
0,Orni17039,2018-08-04,70.692198,151.912495,-8.847222,0.059799,0.000000
58,Orni17039,2018-10-01,68.468535,151.391314,10.666667,0.023148,248.082969
62,Orni17039,2018-10-05,64.451274,150.033179,247.681818,3.939394,450.724750
63,Orni17039,2018-10-06,55.418043,141.931739,217.083333,7.326389,1099.154638
64,Orni17039,2018-10-07,44.067327,132.166910,360.192308,9.679487,1440.929785
...,...,...,...,...,...,...,...
527672,Orni181425-99,2019-09-20,35.778128,99.032639,4179.937500,17.173997,273.564905
527676,Orni181425-99,2019-09-24,34.721175,97.763942,4258.555556,16.286651,164.572621
527710,Orni181425-99,2019-10-28,33.930302,95.775606,4664.430556,16.915509,202.656968
527711,Orni181425-99,2019-10-29,32.146813,92.486415,4719.090278,17.631173,365.115034


In [341]:
# Save the simplified track points as CSV, without writing the dataframe index.
gene.to_csv(r'C:\Users\29911\Downloads\bird\data\Data_Hu Dandan_multiple_20200629\Ornitela_simplify1000.csv', index = False)

## Convert Dataframe to Geojson for visualization

In [342]:
def df_to_geojson(df, properties, lat='Latitude', lon='Longitude'):
    """
    Turn a dataframe containing point data into a geojson formatted python dictionary

    df : the dataframe to convert to geojson
    properties : a list of columns in the dataframe to turn into geojson feature properties
    lat : the name of the column in the dataframe that contains latitude data
    lon : the name of the column in the dataframe that contains longitude data

    Returns a dict of the form {'type': 'FeatureCollection', 'features': [...]}
    with one Point feature per row, in row order. Coordinates are stored as
    [longitude, latitude]. Rows whose latitude or longitude is missing are
    skipped: a NaN coordinate would be serialized as the bare token NaN,
    which is not valid JSON/GeoJSON.
    """
    features = []

    # loop through each row in the dataframe and convert each row to geojson format
    for _, row in df.iterrows():
        lon_value, lat_value = row[lon], row[lat]
        if pd.isna(lon_value) or pd.isna(lat_value):
            # no usable position for this row
            continue

        features.append({
            'type': 'Feature',
            # for each requested column, add the row's value as a feature property
            'properties': {prop: row[prop] for prop in properties},
            'geometry': {'type': 'Point',
                         'coordinates': [lon_value, lat_value]},
        })

    # wrap the converted rows in a geojson FeatureCollection
    return {'type': 'FeatureCollection', 'features': features}

In [346]:
# Columns exported as properties of every GeoJSON feature
useful_columns = ['UUID', 'date', 'Altitude', 'Velocity']
geojson_dict = df_to_geojson(df=gene, properties=useful_columns)
# default=str stringifies values json cannot encode natively (e.g. the date objects)
geojson_str = json.dumps(geojson_dict, default=str, indent=2)

In [349]:
# save the geojson result to a file
# save the geojson result to a file
output_filename = './data/Ornitela_simplify1000.geojson'
# Explicit UTF-8 so the written file does not depend on the platform's default encoding;
# geojson_str is already a string, so it is written as-is.
with open(output_filename, 'w', encoding='utf-8') as output_file:
    output_file.write(geojson_str)

# how many features did we save to the geojson file?
print('{} geotagged features saved to file'.format(len(geojson_dict['features'])))

35000 geotagged features saved to file
