<a href="https://colab.research.google.com/github/WookwonShim/data-analysis/blob/main/2018_central_park_squirrel_census.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading and Preparation

In [5]:
from google.colab import drive
import pandas as pd
import numpy as np

# Mount Google Drive
drive.mount('/content/drive')

# Load data
squirrel = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/2018-central-park-squirrel-census-squirrel-data.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Exploration: Squirrel Data

In [6]:
# Take a look at .info()
squirrel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3023 entries, 0 to 3022
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   X                                           3023 non-null   float64
 1   Y                                           3023 non-null   float64
 2   Unique Squirrel ID                          3023 non-null   object 
 3   Hectare                                     3023 non-null   object 
 4   Shift                                       3023 non-null   object 
 5   Date                                        3023 non-null   int64  
 6   Hectare Squirrel Number                     3023 non-null   int64  
 7   Age                                         2902 non-null   object 
 8   Primary Fur Color                           2968 non-null   object 
 9   Highlight Fur Color                         1937 non-null   object 
 10  Combination 

In [7]:
# Take a look at a sample
squirrel.sample(n=5, random_state=28).T

Unnamed: 0,1945,1174,984,663,1757
X,-73.974718,-73.955326,-73.968172,-73.956366,-73.960156
Y,40.769282,40.797559,40.78356,40.796682,40.790051
Unique Squirrel ID,5E-PM-1012-05,40D-PM-1013-04,22B-PM-1014-07,39D-AM-1008-03,31E-PM-1006-08
Hectare,05E,40D,22B,39D,31E
Shift,PM,PM,PM,AM,PM
Date,10122018,10132018,10142018,10082018,10062018
Hectare Squirrel Number,5,4,7,3,8
Age,Adult,Adult,Adult,Adult,Adult
Primary Fur Color,Cinnamon,Gray,Gray,Gray,Gray
Highlight Fur Color,"Gray, White",Cinnamon,Cinnamon,,Cinnamon


In [8]:
# CONVERT DATA TYPES
# Convert the date from %m%d%Y to pd.datetime.
squirrel['Date'] = pd.to_datetime(squirrel['Date'], format='%m%d%Y')

squirrel['Hectare'].nunique() #339/3023 unique values
squirrel['Hectare'] = squirrel['Hectare'].astype('category') # Convert object to category.

squirrel['Shift'].unique() # AM or PM
squirrel['Shift'] = squirrel['Shift'].astype('category') # Convert object to category.

squirrel['Hectare Squirrel Number'].max() # max value 23
squirrel['Hectare Squirrel Number'].min() # min value 1
squirrel['Hectare Squirrel Number'] = squirrel['Hectare Squirrel Number'].astype('int8') # Convert int64 to int8.

squirrel.Age.unique() # [nan, 'Adult', 'Juvenile', '?']
squirrel['Age'] = squirrel['Age'].astype('category') # Convert object to category.
squirrel['Age'] = squirrel['Age'].replace('?', np.nan) # treat ? as a missing value, NaN

squirrel['Primary Fur Color'].unique() # [nan, 'Gray', 'Cinnamon', 'Black']
squirrel['Primary Fur Color'] = squirrel['Primary Fur Color'].astype('category') # Convert object to category.

squirrel['Highlight Fur Color'].unique() # has multiple categories. e.g., 'Gray', 'Cinnamon, White', 'Gray, White'

# Step 1: Split the multiple categories into a list of categories
squirrel['Highlight Fur Color'] = squirrel['Highlight Fur Color'].str.split(', ')

# Step 2: Handle NaN values by filling them with an empty list
squirrel['Highlight Fur Color'] = squirrel['Highlight Fur Color'].apply(lambda x: x if isinstance(x, list) else [])

# Step 3: Explode the list to get each category in its own row
exploded = squirrel.explode('Highlight Fur Color')

# Step 4: Create dummy variables for each unique category
dummies = pd.get_dummies(exploded['Highlight Fur Color'], prefix='highlight_color')

# Step 5: Group by the original index and sum to collapse back into the original rows
dummies = dummies.groupby(exploded.index).sum()

# Step 6: Combine the original DataFrame with the dummies DataFrame
squirrel = pd.concat([squirrel, dummies], axis=1)

squirrel['Location'].unique() # [nan, 'Above Ground', 'Ground Plane']
squirrel['Location'] = squirrel['Location'].astype('category') # Convert object to category.

# After converting the data types.
squirrel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3023 entries, 0 to 3022
Data columns (total 35 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   X                                           3023 non-null   float64       
 1   Y                                           3023 non-null   float64       
 2   Unique Squirrel ID                          3023 non-null   object        
 3   Hectare                                     3023 non-null   category      
 4   Shift                                       3023 non-null   category      
 5   Date                                        3023 non-null   datetime64[ns]
 6   Hectare Squirrel Number                     3023 non-null   int8          
 7   Age                                         2898 non-null   category      
 8   Primary Fur Color                           2968 non-null   category      
 9   Highligh

In [9]:
# ORDER AND RENAME COLUMNS
# Renaming columns in the DataFrame
renamed_columns = {
    'Unique Squirrel ID': 'unique_squirrel_id',
    'Hectare': 'hectare',
    'Hectare Squirrel Number': 'hectare_squirrel_number',
    'X': 'x',
    'Y': 'y',
    'Lat/Long': 'lat_long',
    'Date': 'date',
    'Shift': 'shift',
    'Age': 'age',
    'Primary Fur Color': 'primary_fur_color',
    'Highlight Fur Color': 'highlight_fur_color',
    'Combination of Primary and Highlight Color': 'combination_fur_color',
    'Color notes': 'color_notes',
    'Running': 'running',
    'Chasing': 'chasing',
    'Climbing': 'climbing',
    'Eating': 'eating',
    'Foraging': 'foraging',
    'Other Activities': 'other_activities',
    'Kuks': 'kuks',
    'Quaas': 'quaas',
    'Moans': 'moans',
    'Tail flags': 'tail_flags',
    'Tail twitches': 'tail_twitches',
    'Approaches': 'approaches',
    'Indifferent': 'indifferent',
    'Runs from': 'runs_from',
    'Other Interactions': 'other_interactions',
    'Location': 'location',
    'Above Ground Sighter Measurement': 'above_ground_sighter_measurement',
    'Specific Location': 'specific_location'
}

# Identification Columns
id_cols = [
    'unique_squirrel_id', 'hectare', 'hectare_squirrel_number'
]

# Location Columns
loc_cols = [
    'x', 'y', 'lat_long'
]

# Time Columns
time_cols = [
    'date', 'shift'
]

# Physical Characteristics
physical_char_cols = [
    'age', 'primary_fur_color', 'highlight_fur_color',
    'combination_fur_color', 'color_notes'
]

# Behavior Columns
behavior_cols = [
    'running', 'chasing', 'climbing', 'eating',
    'foraging', 'other_activities', 'kuks', 'quaas',
    'moans', 'tail_flags', 'tail_twitches', 'approaches',
    'indifferent', 'runs_from', 'other_interactions'
]

# Additional Location Details
other_loc_cols = [
    'location', 'above_ground_sighter_measurement', 'specific_location'
]

# Combined Ordered Columns
ordered_columns = (
    id_cols
    + loc_cols
    + time_cols
    + physical_char_cols
    + behavior_cols
    + other_loc_cols
)

# Applying the renaming and ordering to the DataFrame
squirrel = squirrel.rename(columns=renamed_columns)[ordered_columns]

In [10]:
# Take another look at a sample after data type conversion and updating column names
squirrel.sample(n=5, random_state=28).T

Unnamed: 0,1945,1174,984,663,1757
unique_squirrel_id,5E-PM-1012-05,40D-PM-1013-04,22B-PM-1014-07,39D-AM-1008-03,31E-PM-1006-08
hectare,05E,40D,22B,39D,31E
hectare_squirrel_number,5,4,7,3,8
x,-73.974718,-73.955326,-73.968172,-73.956366,-73.960156
y,40.769282,40.797559,40.78356,40.796682,40.790051
lat_long,POINT (-73.9747175419546 40.7692820030878),POINT (-73.9553263156025 40.7975592060572),POINT (-73.9681722244801 40.783559826281),POINT (-73.9563661256875 40.7966823114508),POINT (-73.9601561064548 40.7900513933531)
date,2018-10-12 00:00:00,2018-10-13 00:00:00,2018-10-14 00:00:00,2018-10-08 00:00:00,2018-10-06 00:00:00
shift,PM,PM,PM,AM,PM
age,Adult,Adult,Adult,Adult,Adult
primary_fur_color,Cinnamon,Gray,Gray,Gray,Gray


In [11]:
squirrel.describe(include=['object', 'category']).T # there seems to be a duplicate in unique_squirrel_id. why?

Unnamed: 0,count,unique,top,freq
unique_squirrel_id,3023,3018,7D-PM-1010-01,2
hectare,3023,339,14D,32
lat_long,3023,3023,POINT (-73.9561344937861 40.7940823884086),1
shift,3023,2,PM,1676
age,2898,2,Adult,2568
primary_fur_color,2968,3,Gray,2473
highlight_fur_color,3023,11,[],1086
combination_fur_color,3023,22,Gray+,895
color_notes,182,135,Gray & Cinnamon selected as Primary. White sel...,9
other_activities,437,307,digging,19


In [12]:
squirrel.describe(include=['object', 'category']).T

Unnamed: 0,count,unique,top,freq
unique_squirrel_id,3023,3018,7D-PM-1010-01,2
hectare,3023,339,14D,32
lat_long,3023,3023,POINT (-73.9561344937861 40.7940823884086),1
shift,3023,2,PM,1676
age,2898,2,Adult,2568
primary_fur_color,2968,3,Gray,2473
highlight_fur_color,3023,11,[],1086
combination_fur_color,3023,22,Gray+,895
color_notes,182,135,Gray & Cinnamon selected as Primary. White sel...,9
other_activities,437,307,digging,19


In [13]:
squirrel.describe(include=['number']) # Avg 4.12, median 3, Max 23

Unnamed: 0,hectare_squirrel_number,x,y
count,3023.0,3023.0,3023.0
mean,4.123718,-73.967184,40.780853
std,3.096492,0.007726,0.010285
min,1.0,-73.981159,40.764911
25%,2.0,-73.973102,40.771676
50%,3.0,-73.968594,40.778166
75%,6.0,-73.960189,40.791219
max,23.0,-73.949722,40.800119


In [28]:
squirrel[squirrel.hectare_squirrel_number > 20] # Hectare 14E has 3 sightings reported on the same day, same shift, for over 20 squirrels. are these the same group of squirrels reported multiple times? why?

Unnamed: 0,unique_squirrel_id,hectare,hectare_squirrel_number,x,y,lat_long,date,shift,age,primary_fur_color,...,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,location,above_ground_sighter_measurement,specific_location
16,14E-AM-1008-23,14E,23,-73.970393,40.776503,POINT (-73.9703925210471 40.7765032004992),2018-10-08,AM,Adult,Gray,...,False,False,False,False,True,False,,Ground Plane,False,
1322,14E-AM-1008-21,14E,21,-73.970182,40.77637,POINT (-73.9701824376506 40.7763703306736),2018-10-08,AM,Adult,Gray,...,False,False,False,True,False,False,"curious,but not expecting food",Ground Plane,False,
1398,14E-AM-1008-22,14E,22,-73.970443,40.776376,POINT (-73.970442878377 40.7763755560359),2018-10-08,AM,Adult,Gray,...,False,False,False,False,True,False,,Ground Plane,False,


In [35]:
squirrel[squirrel.hectare == '14E'][['unique_squirrel_id', 'hectare', 'hectare_squirrel_number', 'date', 'shift']].sort_values(by=['date', 'hectare_squirrel_number'])
# after examining the hectare 14E, it turns out it's the squirrel id for the hectare at the time. There were 23 squirrels.
# maybe the count was done across the park at the same time, by different individuals assigned to each hectare to avoid double count.

# 14E seems to be very popular, a complete outlier. what's the reason? maybe people giving food? check other_animal in hectare_data

Unnamed: 0,unique_squirrel_id,hectare,hectare_squirrel_number,date,shift
263,14E-AM-1008-01,14E,1,2018-10-08,AM
90,14E-AM-1008-02,14E,2,2018-10-08,AM
2005,14E-AM-1008-03,14E,3,2018-10-08,AM
887,14E-AM-1008-04,14E,4,2018-10-08,AM
135,14E-AM-1008-05,14E,5,2018-10-08,AM
540,14E-AM-1008-06,14E,6,2018-10-08,AM
556,14E-AM-1008-07,14E,7,2018-10-08,AM
943,14E-AM-1008-08,14E,8,2018-10-08,AM
79,14E-AM-1008-09,14E,9,2018-10-08,AM
2920,14E-AM-1008-10,14E,10,2018-10-08,AM


In [14]:
squirrel.describe(include=['bool']).T

Unnamed: 0,count,unique,top,freq
running,3023,2,False,2293
chasing,3023,2,False,2744
climbing,3023,2,False,2365
eating,3023,2,False,2263
foraging,3023,2,False,1588
kuks,3023,2,False,2921
quaas,3023,2,False,2973
moans,3023,2,False,3020
tail_flags,3023,2,False,2868
tail_twitches,3023,2,False,2589


In [15]:
top_5_hectare = squirrel.groupby('hectare')['unique_squirrel_id'].count().sort_values(ascending=False).head(5)
top_5_hectare # further check what correlates with the squirrel sighting frequency in a hectare.

hectare
14D    32
32E    30
14E    28
01B    27
07H    26
Name: unique_squirrel_id, dtype: int64

In [16]:
top_primary_fur_color = squirrel.groupby('primary_fur_color')['unique_squirrel_id'].count().sort_values(ascending=False)
top_primary_fur_color # mostly gray. what's the temperature outside? is it the season that correlates with the color?

primary_fur_color
Gray        2473
Cinnamon     392
Black        103
Name: unique_squirrel_id, dtype: int64

In [17]:
behavior = squirrel[behavior_cols].select_dtypes('bool')
behavior.sum(axis=0).sort_values(ascending=False)

indifferent      1454
foraging         1435
eating            760
running           730
runs_from         678
climbing          658
tail_twitches     434
chasing           279
approaches        178
tail_flags        155
kuks              102
quaas              50
moans               3
dtype: int64

In [18]:
squirrel[id_cols + loc_cols].sort_values(by='hectare')

Unnamed: 0,unique_squirrel_id,hectare,hectare_squirrel_number,x,y,lat_long
1175,1A-PM-1014-04,01A,4,-73.980790,40.768216,POINT (-73.9807898224726 40.768215817511)
2935,1A-PM-1014-05,01A,5,-73.980988,40.768322,POINT (-73.9809876538513 40.7683223242143)
2490,1A-AM-1007-03,01A,3,-73.980691,40.768115,POINT (-73.9806905259671 40.7681152210858)
1054,1A-AM-1007-04,01A,4,-73.981108,40.767516,POINT (-73.9811078442462 40.767515940567)
2386,1A-PM-1014-01,01A,1,-73.980852,40.768315,POINT (-73.9808517324721 40.7683154647217)
...,...,...,...,...,...,...
3007,42H-PM-1014-04,42H,4,-73.950353,40.797654,POINT (-73.9503533343658 40.7976543603953)
2909,42H-PM-1014-03,42H,3,-73.950606,40.797619,POINT (-73.9506062997507 40.7976189496674)
574,42I-PM-1014-03,42I,3,-73.949722,40.796517,POINT (-73.9497217674555 40.796517007214)
2695,42I-PM-1014-01,42I,1,-73.950146,40.797094,POINT (-73.9501457233643 40.797094356558)


In [19]:
squirrel_count_per_hectare = squirrel.groupby('hectare')['unique_squirrel_id'].count().sort_values(ascending=False)
squirrel_count_per_hectare

hectare
14D    32
32E    30
14E    28
01B    27
07H    26
       ..
26B     1
18E     1
24I     1
24B     1
23D     1
Name: unique_squirrel_id, Length: 339, dtype: int64

In [20]:
squirrel_count_per_date = squirrel.groupby('date')['unique_squirrel_id'].count()
squirrel_count_per_date

date
2018-10-06    337
2018-10-07    405
2018-10-08    285
2018-10-10    335
2018-10-12    218
2018-10-13    434
2018-10-14    368
2018-10-17    216
2018-10-18    200
2018-10-19    158
2018-10-20     67
Name: unique_squirrel_id, dtype: int64

In [26]:
# Group by date and hectare, then count the unique squirrel IDs
squirrel_count_per_date_hectare = squirrel.groupby(['hectare', 'date'])['unique_squirrel_id'].count()

# Sort each group in descending order based on the count of unique squirrel IDs
squirrel_count_per_date_hectare_sorted = squirrel_count_per_date_hectare.groupby(level=0, group_keys=False).apply(lambda x: x.sort_values(ascending=False))


# Display the sorted counts per date and hectare
#print(squirrel_count_per_date_hectare_sorted)

squirrel_count_per_date_hectare_sorted

hectare  date      
01A      2018-10-14    7
         2018-10-07    4
         2018-10-06    0
         2018-10-08    0
         2018-10-10    0
                      ..
42I      2018-10-13    0
         2018-10-17    0
         2018-10-18    0
         2018-10-19    0
         2018-10-20    0
Name: unique_squirrel_id, Length: 3729, dtype: int64

# Data Exploration: Hectare Data

In [160]:
hectare = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/2018-central-park-squirrel-census-hectare-data.csv')

hectare.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 13 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Hectare                        700 non-null    object 
 1   Shift                          700 non-null    object 
 2   Date                           700 non-null    int64  
 3   Anonymized Sighter             700 non-null    float64
 4   Sighter Observed Weather Data  681 non-null    object 
 5   Litter                         319 non-null    object 
 6   Litter Notes                   6 non-null      object 
 7   Other Animal Sightings         668 non-null    object 
 8   Hectare Conditions             660 non-null    object 
 9   Hectare Conditions Notes       74 non-null     object 
 10  Number of sighters             700 non-null    int64  
 11  Number of Squirrels            700 non-null    int64  
 12  Total Time of Sighting         680 non-null    flo

In [39]:
hectare.head()

# take a look at sighters
# take a look at the time. the data dictionary wasn't available for hectare-data.
# see the num of squir
# litter is tracked? does it attrac squirrels?

Unnamed: 0,Hectare,Shift,Date,Anonymized Sighter,Sighter Observed Weather Data,Litter,Litter Notes,Other Animal Sightings,Hectare Conditions,Hectare Conditions Notes,Number of sighters,Number of Squirrels,Total Time of Sighting
0,01A,AM,10072018,110.0,"70º F, Foggy",Some,,"Humans, Pigeons",Busy,,1,4,22.0
1,01A,PM,10142018,177.0,"54º F, overcast",Abundant,,"Humans, Pigeons",Busy,,1,7,26.0
2,01B,AM,10122018,11.0,"60º F, sunny",Some,,"Humans, Dogs, Pigeons, Horses",Busy,,1,17,23.0
3,01B,PM,10192018,109.0,"59.8º F, Sun, Cool",Some,,"Humans, Dogs, Pigeons, Sparrow, Blue jay",Busy,,1,10,35.0
4,01C,PM,10132018,241.0,"55° F, Partly Cloudy",,,"Humans, Dogs, Pigeons, Birds",Busy,,1,10,25.0


In [41]:
hectare.columns.tolist()

['Hectare',
 'Shift',
 'Date',
 'Anonymized Sighter',
 'Sighter Observed Weather Data',
 'Litter',
 'Litter Notes',
 'Other Animal Sightings',
 'Hectare Conditions',
 'Hectare Conditions Notes',
 'Number of sighters',
 'Number of Squirrels',
 'Total Time of Sighting']

In [161]:
col_map = {
    'Hectare'               : 'hectare',
    'Shift'                 : 'shift',
    'Date'                  : 'date',
    'Anonymized Sighter'    : 'sighter',
    'Sighter Observed Weather Data' : 'weather',
    'Litter'                : 'litter',
    'Litter Notes'          : 'litter_note',
    'Other Animal Sightings': 'other_animals',
    'Hectare Conditions'    : 'hectare_condition',
    'Hectare Conditions Notes': 'hectare_condition_note',
    'Number of sighters'    : 'num_of_sighters',
    'Number of Squirrels'   : 'hectare_squirrel_number',
    'Total Time of Sighting': 'total_time'
}

hectare = hectare.rename(columns=col_map)
hectare.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   hectare                  700 non-null    object 
 1   shift                    700 non-null    object 
 2   date                     700 non-null    int64  
 3   sighter                  700 non-null    float64
 4   weather                  681 non-null    object 
 5   litter                   319 non-null    object 
 6   litter_note              6 non-null      object 
 7   other_animals            668 non-null    object 
 8   hectare_condition        660 non-null    object 
 9   hectare_condition_note   74 non-null     object 
 10  num_of_sighters          700 non-null    int64  
 11  hectare_squirrel_number  700 non-null    int64  
 12  total_time               680 non-null    float64
dtypes: float64(2), int64(3), object(8)
memory usage: 71.2+ KB


In [82]:
num_of_sighters = hectare.sighter.nunique()

hectare[hectare.sighter == 189190] # num of sighters 2. probably sighter id 189 and 190 did it together

Unnamed: 0,hectare,shift,date,sighter,weather,litter,litter_note,other_animals,hectare_condition,hectare_condition_note,num_of_sighters,num_of_squirrels,total_time
626,07C,PM,10202018,189190.0,"62º F, Sunny",,,"Humans, Dogs, Sparrows, Other Birds",Calm,,2,6,23.0


In [87]:
hectare.num_of_sighters.value_counts()

num_of_sighters
1    610
2     80
3     10
Name: count, dtype: int64

In [163]:
hectare['sighter'] = hectare['sighter'].astype('str').replace('\.0$', '', regex=True).replace('\.', '', regex=True)
hectare[hectare.num_of_sighters >= 2] # want to find a way to split sighter ids when grouped of 2 or 3.

Unnamed: 0,hectare,shift,date,sighter,weather,litter,litter_note,other_animals,hectare_condition,hectare_condition_note,num_of_sighters,hectare_squirrel_number,total_time
601,19H,PM,10082018,164165,cloudy,,,,Calm,,2,1,5.0
611,01C,AM,10122018,225226,"cool, sunny",Some,,"Humans, Dogs, Cats",Calm,,2,2,30.0
612,01D,PM,10072018,3334,"80º F, Cloudy",,,"Humans, Dogs, Pigeons, Sparrows",,,2,7,25.0
613,01E,PM,10072018,3334,"80º F, Sunny",,,"Humans, Dogs, Pigeons, Robins. Blue jays",Busy,,2,3,20.0
614,01I,PM,10122018,225226,"cold, sunny",Some,,"Humans, Hawks, Dogs, Pigeons, Cats",Busy,,2,4,70.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,09A,AM,10132018,179180181,,Some,,"Humans, Dogs, Pigeons, Rats, Blue jays",Busy,,3,12,20.0
696,11F,PM,10062018,168,,Some,,"Humans, Dogs, Pigeons",Busy,,3,3,26.0
697,12G,PM,10072018,106107108,80º F,,,"Humans, Cat bird",Busy,,3,7,29.0
698,12H,PM,10072018,106107108,79º F,,,"Humans, Sparrows",Busy,,3,8,30.0


In [162]:
set(squirrel.columns) - (set(squirrel.columns) - set(hectare.columns)) # common columns between the two sets.

{'date', 'hectare', 'hectare_squirrel_number', 'shift'}

In [164]:
hectare.hectare_condition.unique() # why is there 'Calm, Busy', it's contradictory.

hectare[hectare.hectare_condition == 'Calm, Busy'] # all the 'Calm, Busy' reporting is done by a group of two. Maybe it was concatnated after reporting individually in the same hectare?
# no, after reading the notes, it seems like it's when there's a separate path/area. e.g., 'Calm (in Park), Busy (on Road)' will assume the latter is true.

Unnamed: 0,hectare,shift,date,sighter,weather,litter,litter_note,other_animals,hectare_condition,hectare_condition_note,num_of_sighters,hectare_squirrel_number,total_time
183,13C,AM,10102018,83,71º F Cloudy and wicked humid,Some,,"Humans, Dogs, Blue jay, Cardinal, Warbler","Calm, Busy",Sporadically punctuated busy,1,4,20.0
199,14B,PM,10142018,222,"57º F, overcast",Some,,"Humans, Dogs, Rat, Ducks","Calm, Busy","Busy, calm off walkways",1,8,22.0
212,14I,AM,10132018,240,"~60º F, rainy",,,"Humans, Dogs, Pigeons","Calm, Busy","Calm (in Park), Busy (on Road)",1,1,21.0
213,15A,PM,10172018,224,"52º F, cloudy, light rain drops, chilly",,,"Humans, Birds","Calm, Busy",Busy at sidewalk,1,1,20.0
318,21I,AM,10182018,37,"44º F, Sunny",Abundant,,"Humans, Dogs, Pigeons","Calm, Busy","Busy near street, near empty on the playground.",1,0,20.0
324,22D,PM,10122018,16,"61º F, sunny",,,"Humans, Pigeons, Sparrows","Calm, Busy",,1,7,26.0
345,23H,PM,10192018,45,"60º F, Partly Cloudy",Some,,"Humans, Dogs","Calm, Busy",Busy on Running Path,1,1,17.0
501,36C,PM,10132018,45,"56° F, Partly Cloudy",,,"Humans, Dogs, Dogs (on path), Ducks & Geese (i...","Calm, Busy",Busy on West Dr,1,6,25.0
502,36C,AM,10142018,243,"50º F, Calm, Fair, Sunny",,,"Humans, Birds","Calm, Busy","Calm in woods, busy outside with fund walk",1,4,40.0
542,38F,PM,10132018,222,"56º F, partly cloudy",,,"Humans, Small birds, Chipmunks","Calm, Busy","Only small chunk near road was busy, the rest ...",1,6,22.0


In [145]:
hectare.date = pd.to_datetime(hectare.date, format='%m%d%Y')

squirrel.merge(hectare, on=['date', 'hectare', 'shift'])

squirrel.shape

(3023, 31)

In [156]:
squirrel.merge(hectare, on=['date', 'hectare', 'shift'], how='left').head(3)

Unnamed: 0,unique_squirrel_id,hectare,hectare_squirrel_number,x,y,lat_long,date,shift,age,primary_fur_color,...,sighter,weather,litter,litter_note,other_animals,hectare_condition,hectare_condition_note,num_of_sighters,num_of_squirrels,total_time
0,37F-PM-1014-03,37F,3,-73.956134,40.794082,POINT (-73.9561344937861 40.7940823884086),2018-10-14,PM,,,...,243.0,"59º F, overcast, SW 6 mph",,,Humans,Busy,,1.0,3.0,24.0
1,21B-AM-1019-04,21B,4,-73.968857,40.783783,POINT (-73.9688574691102 40.7837825208444),2018-10-19,AM,,,...,213214.0,"Mid 40s, Clear, No Wind",Some,,"Humans, Dogs, Pigeons, Birds",Calm,,2.0,7.0,25.0
2,11B-PM-1014-08,11B,8,-73.974281,40.775534,POINT (-73.97428114848522 40.775533619083),2018-10-14,PM,,Gray,...,234.0,"Cool, Cloudy",Some,,"Humans, Dogs, Horses",Busy,,1.0,8.0,18.0


In [152]:
squirrel[squirrel.hectare=='37F']

Unnamed: 0,unique_squirrel_id,hectare,hectare_squirrel_number,x,y,lat_long,date,shift,age,primary_fur_color,...,moans,tail_flags,tail_twitches,approaches,indifferent,runs_from,other_interactions,location,above_ground_sighter_measurement,specific_location
0,37F-PM-1014-03,37F,3,-73.956134,40.794082,POINT (-73.9561344937861 40.7940823884086),2018-10-14,PM,,,...,False,False,False,False,False,False,,,,
173,37F-PM-1014-01,37F,1,-73.955443,40.794567,POINT (-73.9554432393295 40.7945669054449),2018-10-14,PM,,Gray,...,False,False,False,False,False,True,,Ground Plane,FALSE,
825,37F-PM-1014-02,37F,2,-73.955726,40.793951,POINT (-73.95572635334891 40.7939505791448),2018-10-14,PM,Juvenile,Gray,...,False,False,False,False,True,False,,Ground Plane,FALSE,LAWN
2568,37F-AM-1010-01,37F,1,-73.955966,40.794583,POINT (-73.9559663734409 40.7945828261377),2018-10-10,AM,Adult,Gray,...,False,True,False,False,True,False,,Above Ground,4,


In [168]:
hectare.other_animals.unique() # are there too many categories to explode this into columns? Also, how to see how many categories in here.

array(['Humans, Pigeons', 'Humans, Dogs, Pigeons, Horses',
       'Humans, Dogs, Pigeons, Sparrow, Blue jay',
       'Humans, Dogs, Pigeons, Birds', 'Humans', 'Humans, Dogs, Ducks',
       'Humans, Dogs, Cats, Raccoons',
       'Humans, Dogs, Pigeons, Ducks, Bluejay, Sparrow, Starling',
       'Humans, Pigeons, Small birds', 'Humans, Dogs',
       'Humans, Dogs, Pigeons, Downy Woodpecker, Sparrows',
       'Humans, Small birds', 'Humans, Hawks, Dogs',
       'Humans, Pigeons, Mice', 'Humans, Pigeons, Ducks, Geese', nan,
       'Humans, Dogs, Birds', 'Humans, Dogs (off-leash)', 'Humans, Birds',
       'Humans, Dogs, Pigeons',
       'Humans, Dogs, Ducks, Cardinal, Small birds, Robin',
       'Humans, Dogs, Birds, but not pigeons', 'Humans, Dogs, Sparrows',
       'Humans, Dogs, Blue Jays', 'Humans, Dogs, Small birds, Dogs',
       'Humans, Dogs, Birds, Song birds and Crows',
       'Humans, Dogs, Pigeons, Rat',
       'Humans, Dogs, Sparrows, Starlings, American robins',
       'Humans,