# Central Park Squirrels Exploratory Data Analysis

## Imports

In [None]:
import pandas as pd
import altair as alt
# Select Altair's 'vegafusion' data transformer for every chart built below.
# NOTE(review): presumably needs the optional vegafusion package installed - confirm.
alt.data_transformers.enable('vegafusion')

DataTransformerRegistry.enable('vegafusion')

## Loading & Cleaning

In [None]:
# Load the raw census export.
# 'Date' is parsed below with the MMDDYYYY format, so read it as text: if
# pandas infers an integer column, a leading zero is dropped (01052018 becomes
# 1052018) and '%m%d%Y' can then misread the value instead of rejecting it.
squirrels = pd.read_csv(
    '../data/raw/2018_Central_Park_Squirrel_Census.csv',
    dtype={'Date': str},
)

squirrels['Date'] = pd.to_datetime(squirrels['Date'], format='%m%d%Y')

# Normalise headers: lower-case, with spaces replaced by underscores
# (e.g. 'Date' -> 'date'), so later cells can use snake_case column names.
squirrels.columns = squirrels.columns.str.lower().str.replace(' ', '_')

## Dataset Overview

In [None]:
# Print column names, non-null counts and dtypes to see which columns have gaps.
squirrels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3023 entries, 0 to 3022
Data columns (total 31 columns):
 #   Column                                      Non-Null Count  Dtype         
---  ------                                      --------------  -----         
 0   x                                           3023 non-null   float64       
 1   y                                           3023 non-null   float64       
 2   unique_squirrel_id                          3023 non-null   object        
 3   hectare                                     3023 non-null   object        
 4   shift                                       3023 non-null   object        
 5   date                                        3023 non-null   datetime64[ns]
 6   hectare_squirrel_number                     3023 non-null   int64         
 7   age                                         2902 non-null   object        
 8   primary_fur_color                           2968 non-null   object        
 9   highligh

## EDA

In [None]:
# Palette shared by the fur-colour charts: each hex value in `colours` is
# paired with the fur colour at the same position in `order`.
order = ['Gray', 'Cinnamon', 'Black']
colours = ['#B2BEB5', '#D2691E', '#000000']

# Bar chart of sighting counts per primary fur colour, bars sorted by
# descending count. Rows without a recorded fur colour are excluded.
colour_sightings = (
    alt.Chart(squirrels.dropna(subset=['primary_fur_color']))
    .mark_bar()
    .encode(
        x=alt.X('primary_fur_color:N', sort='-y', title='Primary Fur Colour'),
        y=alt.Y('count():Q', title='Number of Sightings'),
        # The x axis already names each colour, so the legend is suppressed.
        color=alt.Color(
            'primary_fur_color:N',
            scale=alt.Scale(domain=order, range=colours),
            legend=None,
        ),
    )
    .properties(
        title=alt.Title(text='Most Common Fur Colours'),
        width=400,
        height=250,
    )
)

colour_sightings

In [None]:
# Scatter plot of every sighting's coordinates, coloured by primary fur colour.
# Rows with no recorded fur colour are dropped as well as rows with missing
# coordinates: the colour scale has an explicit domain (`order`), so such
# points would have no matching colour or legend entry. This also matches how
# the other fur-colour charts in this notebook filter their data.
location_sightings = alt.Chart(
    squirrels.dropna(subset = ['x', 'y', 'primary_fur_color']),
    title = alt.Title(text = 'Squirrel Sightings by Location and Colour')
    ).mark_circle(size = 15, opacity = 0.5).encode(
        # zero = False keeps the axes tight around the observed coordinates
        # instead of stretching them to include 0.
        x = alt.X('x:Q').title('Longitude (x)').scale(zero = False),
        y = alt.Y('y:Q').title('Latitude (y)').scale(zero = False),
        color = alt.Color('primary_fur_color:N').scale(domain = order, range = colours).title('Primary Fur Colour'),
    ).properties(width = 400, height = 250)

location_sightings

In [None]:
# Sighting counts per primary fur colour, one panel per value of `shift`.
# Rows missing either the fur colour or the shift are excluded.
day_night_sightings = alt.Chart(
    squirrels.dropna(subset = ['primary_fur_color', 'shift'])
    ).mark_bar().encode(
        # No x-axis title: the bars are already labelled with the colour names.
        x = alt.X('primary_fur_color:N').sort('-y').title(None),
        y = alt.Y('count():Q').title('Number of Sightings'),
        color = alt.Color('primary_fur_color:N').scale(domain = order, range = colours).legend(None)
    ).facet(
        column = alt.Column('shift:N', title = 'Time of Day')
    ).properties(
        # The title is set on the faceted chart so it spans all panels; every
        # other chart in this notebook carries a title, so this one does too.
        title = alt.Title(text = 'Squirrel Sightings by Fur Colour and Time of Day')
    )

day_night_sightings

In [None]:
# Line chart of cumulative sightings over time, one line per primary fur colour.
# Rows missing the fur colour or the date are excluded.
cumulative_sightings = (
    alt.Chart(squirrels.dropna(subset=['primary_fur_color', 'date']))
    # Step 1: number of sightings for each (date, fur colour) pair.
    .transform_aggregate(
        groupby=['date', 'primary_fur_color'],
        count='count()',
    )
    # Step 2: within each fur colour, sum those counts in date order.
    .transform_window(
        groupby=['primary_fur_color'],
        sort=[alt.SortField('date')],
        cumulative_count='sum(count)',
    )
    .mark_line()
    .encode(
        x=alt.X('date:T').title('Date'),
        y=alt.Y('cumulative_count:Q').title('Cumulative Sightings'),
        color=alt.Color(
            'primary_fur_color:N',
            scale=alt.Scale(domain=order, range=colours),
            title='Primary Fur Colour',
        ),
    )
    .properties(
        title=alt.Title(text='Cumulative Sightings of Different Fur Colours Over Time'),
        width=400,
        height=250,
    )
)

cumulative_sightings

In [None]:
# Columns that each flag one kind of vocalisation.
vocal_cols = ['kuks', 'quaas', 'moans']

# One row per sighting with a known fur colour, plus `any_vocal`: True when at
# least one of the vocalisation columns is set for that sighting.
squirrel_vocals = (
    squirrels
    .dropna(subset=['primary_fur_color'])
    .assign(
        # Missing flags are treated as "no vocalisation" before combining.
        any_vocal=lambda frame: (
            frame[vocal_cols].fillna(False).astype(bool).any(axis=1)
        )
    )
    .loc[:, ['primary_fur_color', 'any_vocal']]
)

# Bar chart of the mean of `any_vocal` per fur colour, i.e. the share of
# sightings with any vocalisation, sorted by descending bar height.
colour_noise = (
    alt.Chart(squirrel_vocals)
    .mark_bar()
    .encode(
        x=alt.X('primary_fur_color:N', sort='-y', title='Primary Fur Colour'),
        y=alt.Y('mean(any_vocal):Q').title('Proportion of Squirrels Making Vocalisations'),
        color=alt.Color(
            'primary_fur_color:N',
            scale=alt.Scale(domain=order, range=colours),
            legend=None,
        ),
    )
    .properties(
        title=alt.Title(text='Proportion of Squirrels Making Vocalisations by Fur Colour'),
        width=400,
        height=250,
    )
)

colour_noise

In [None]:
# Bar chart of the mean of `runs_from` per primary fur colour, sorted by
# descending bar height. Rows missing either column are excluded.
colour_run = (
    alt.Chart(squirrels.dropna(subset=['primary_fur_color', 'runs_from']))
    .mark_bar()
    .encode(
        x=alt.X('primary_fur_color:N', sort='-y', title='Primary Fur Colour'),
        y=alt.Y('mean(runs_from):Q').title('Proportion of Squirrels That Run From Humans'),
        # The x axis already names each colour, so the legend is suppressed.
        color=alt.Color(
            'primary_fur_color:N',
            scale=alt.Scale(domain=order, range=colours),
            legend=None,
        ),
    )
    .properties(
        title=alt.Title(text='Proportion of Squirrels That Run From Humans by Fur Colour'),
        width=400,
        height=250,
    )
)

colour_run

In [None]:
# Bar chart of the mean of `eating` per primary fur colour, sorted by
# descending bar height. Rows missing either column are excluded.
colour_eat = (
    alt.Chart(squirrels.dropna(subset=['primary_fur_color', 'eating']))
    .mark_bar()
    .encode(
        x=alt.X('primary_fur_color:N', sort='-y', title='Primary Fur Colour'),
        y=alt.Y('mean(eating):Q').title('Proportion of Squirrels That Are Eating'),
        # The x axis already names each colour, so the legend is suppressed.
        color=alt.Color(
            'primary_fur_color:N',
            scale=alt.Scale(domain=order, range=colours),
            legend=None,
        ),
    )
    .properties(
        title=alt.Title(text='Proportion of Squirrels Eating When Sighted by Fur Colour'),
        width=400,
        height=250,
    )
)

colour_eat