This notebook contains code and output of descriptive analyses for the 2000-2017 CC dataset after cleaning

In [1]:
import pandas as pd
import numpy as np
import os,glob

import plotly
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go

plotly.tools.set_config_file(world_readable=True)

# increase print limit
pd.options.display.max_rows = 99999
pd.options.display.max_columns = 50

Run the following chunk if running from local folder

In [2]:
# Source Data
sourceDataPers = 'C:/Users/Christopher/Google Drive/TailDemography/outputFiles'
sourceDataBig = 'S:/Chris/TailDemography/data'

#Output Data paths
outputPers = 'C:/Users/Christopher/Google Drive/TailDemography/outputFiles'
outputBig = 'S:/Chris/TailDemography/data'

In [3]:
os.chdir(sourceDataBig)
mysourcefile = glob.glob('cleaned CC data 2000-2017*')
mysourcefile

['cleaned CC data 2000-2017_2018-09-07 22_47_08.525360.csv',
 'cleaned CC data 2000-2017_2018-09-08 16_41_11.234222.csv',
 'cleaned CC data 2000-2017_2018-09-08 23_41_22.661706.csv',
 'cleaned CC data 2000-2017_2018-09-11 21_19_30.768558.csv']

In [4]:
df=pd.read_csv(mysourcefile[-1])
df.head()

Unnamed: 0,species,toes_orig,date,sex,svl,tl,rtl_orig,mass,paint.mark,location,meters,newRecap,painted,misc,vial,year,rtl,autotomized,new.recap_orig,sighting,review_sex,review_species,review_painted,review_new.recap,review_rtl,toes,toe_pattern,tl_svl,mass_svl,year_diff,svl_diff,initialCaptureDate,liznumber,daysSinceCapture,capture
0,j,1-13-19,2000-03-17,f,52.0,74.0,0.0,4.2,r1c,1falls,,new,,,,2000.0,0.0,intact,new,,True,False,False,False,False,1-13-19,,1.423077,0.080769,0,0.0,2000-03-17,37,0,1
1,j,1-13-20,2000-03-17,m,56.0,77.0,0.0,5.6,r2c,1falls,,new,,,,2000.0,0.0,intact,new,,True,False,False,False,False,1-13-20,,1.375,0.1,0,0.0,2000-03-17,512,0,1
2,j,1-14-19,2000-03-17,f,57.0,81.0,0.0,6.6,r3c,wall on rt side v wall at pine xing,,new,,,,2000.0,0.0,intact,new,,True,False,False,False,False,1-14-19,,1.421053,0.115789,0,0.0,2000-03-17,44,0,1
3,j,1-14-20,2000-03-17,f,57.0,79.0,0.0,5.5,r4c,wall on rt side v wall at pine xing,,new,,,,2000.0,0.0,intact,new,,True,False,False,False,False,1-14-20,,1.385965,0.096491,0,0.0,2000-03-17,45,0,1
4,j,3-8,2000-03-17,f,82.0,89.0,27.0,17.0,r5c,oak across from bottom wall at pine xing,,recap,,shed since last recapture,,2000.0,27.0,autotomized,recap,,True,False,False,False,False,3-8,,1.085366,0.207317,0,0.0,2000-03-17,273,0,1


In [5]:
df.groupby('liznumber').capture.max().value_counts()

1    947
2    272
3    116
4     45
5     23
6      9
7      4
8      3
Name: capture, dtype: int64

## Analyze the data

## Reducing the analyses sample by date range and capture

In [6]:
# convert date to pandas datetime
df.date=pd.to_datetime(df.date)
# limiting months to between May and August
# df = df.loc[(df.date.dt.month>=5) & (df.date.dt.month<=8)]
# limit to first captures
df_first = df.sort_values(by=['liznumber','date'])
df_first = df_first.loc[~df_first.duplicated(subset='liznumber')]

In [7]:
df.groupby('liznumber').capture.max().value_counts()

1    947
2    272
3    116
4     45
5     23
6      9
7      4
8      3
Name: capture, dtype: int64

### Reducing data to species and sex of interest

In [8]:
species2keep=['j']
df_first = df_first.loc[df_first.species.isin(species2keep)]
print ("\n{} of the original data set are entries belonging to a species of interest {}"\
       .format(df_first.shape[0],species2keep))
sex2keep=['m','f']
df = df_first.loc[df_first.sex.isin(sex2keep)]
print ("\n{} of the original data set are entries belonging to a sex categories of interest {}"\
       .format(df_first.shape[0],sex2keep))


912 of the original data set are entries belonging to a species of interest ['j']

912 of the original data set are entries belonging to a sex categories of interest ['m', 'f']


## Number of lizards (_Sj_) by year and sex

In [9]:
df.to_csv('Cleaned Sj data.csv')

In [10]:
# pull out all individuals that we've recaught for Sj and writes to csv
multicapToes=df.loc[(df.species=='j')& (df.toes!="")& (df.toes!='NA')]\
.toes.value_counts()[df.loc[df.species=='j']\
                     .toes.value_counts()>1].index.tolist()
df.loc[df.toes.isin(multicapToes)].sort_values(by=['toes','date']).to_csv('multicaps.csv')

## Maximum Number of Captures

In [11]:
data = [go.Histogram(x = df.groupby('liznumber').capture.max())]
layout = go.Layout(
    title = 'Maximum Number of Captures per Individual 2000-2017',
    titlefont = dict(
        size = 20),
    xaxis = dict(
        dtick = 1,
        title = 'Maximum Number of Captures',
        titlefont = dict(
            size = 18)),
    yaxis = dict(
        title = 'Number of Lizards',
        titlefont = dict(
            size = 18)))

fig = go.Figure(
        data = data,
        layout = layout)
py.iplot(fig, filename = 'Histogram of Maximum Captures per Individual in Crystal Creek 2000 - 2017')

In [12]:
dfF = df.loc[df.sex =='f']
dfM = df.loc[df.sex =='m']

In [13]:
females = go.Histogram(x = dfF.groupby('liznumber').capture.max(),name='females')
males = go.Histogram(x = dfM.groupby('liznumber').capture.max(),name='males')

data = [males,females]
layout = go.Layout(
    title = 'Maximum Number of Captures per Individual 2000-2017',
    titlefont = dict(
        size = 20),
    xaxis = dict(
        dtick = 1,
        title = 'Maximum Number of Captures',
        titlefont = dict(
            size = 18)),
    yaxis = dict(
        title = 'Number of Lizards',
        titlefont = dict(
            size = 18)))

fig = go.Figure(
        data = data,
        layout = layout)
py.iplot(fig, filename = 'Histogram of Maximum Captures per Individual in Crystal Creek 2000 - 2017')

In [14]:
data = [go.Violin(y = df.groupby('liznumber').capture.max(),name = ' ')]
layout = go.Layout(
    title = 'Maximum Number of Captures per Individual 2000-2017',
    titlefont = dict(
        size = 20),
    xaxis = dict(
        title = 'Maximum Number of Captures',
        titlefont = dict(
            size = 18)),
    yaxis = dict(
        dtick = 1,
        title = 'Number of Lizards',
        titlefont = dict(
            size = 18)))

fig = go.Figure(
        data = data,
        layout = layout)
py.iplot(fig, filename = 'Violin Plot of Maximum Captures per Individual in Crystal Creek 2000 - 2017')

In [15]:
females = go.Violin(y = dfF.groupby('liznumber').capture.max(),name='females')
males = go.Violin(y = dfM.groupby('liznumber').capture.max(),name='males')

data = [males,females]
layout = go.Layout(
    title = 'Maximum Number of Captures per Individual 2000-2017',
    titlefont = dict(
        size = 20),
    xaxis = dict(
        dtick = 1,
        title = 'Maximum Number of Captures',
        titlefont = dict(
            size = 18)),
    yaxis = dict(
        dtick = 1,
        title = 'Number of Lizards',
        titlefont = dict(
            size = 18)))

fig = go.Figure(
        data = data,
        layout = layout)
py.iplot(fig, filename = 'Violin Plot of Maximum Captures per Individual in Crystal Creek 2000 - 2017')

In [19]:
lizards = [go.Scatter(x = df.liznumber,
                   y = df.loc[df.daysSinceCapture>0].groupby('liznumber').daysSinceCapture.max(), 
                     mode = 'markers')]
# year1 = [go.scatter.Line(y = 365)]
# year2 = [go.scatter.Line(y = 365*2)]
# year3 = [go.scatter.Line(y = 365*3)]
# year4 = [go.scatter.Line(y = 365*4)]
# year5 = [go.scatter.Line(y = 365*5)]
# year6 = [go.scatter.Line(y = 365*6)]
# year7 = [go.scatter.Line(y = 365*7)]
# year8 = [go.scatter.Line(y = 365*8)]

# data = [lizards, year1, year2, year3, year4, year5, year6, year7, year8]
data = lizards
layout = go.Layout(
    title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017',
        titlefont = dict(
            size = 20),
    xaxis = dict(
            title='Lizard Number',
            titlefont=dict(
                size=18)),
    yaxis = dict(
            title='Greatest Number of Days Since<br> Initial Capture',
            titlefont=dict(
                size=18)))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename = 'Days Since Initial Capture in Crystal Creek 2000 - 2017')

In [17]:
lizards = [go.Box(y = df.loc[df.daysSinceCapture>0].groupby('liznumber').daysSinceCapture.max(), name = ' ')]

data = lizards
layout = go.Layout(
    title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017',
        titlefont = dict(
            size = 20),
    xaxis = dict(
            title='Lizard Number',
            titlefont=dict(
                size=18)),
    yaxis = dict(
            title='Greatest Number of Days Since<br> Initial Capture',
            titlefont=dict(
                size=18)))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename = 'Days Since Initial Capture in Crystal Creek 2000 - 2017')

In [18]:
females = [go.Box(y = dfF.loc[dfF.daysSinceCapture>0].groupby('liznumber').daysSinceCapture.max(),name= 'females')]
males = [go.Box(y = dfM.loc[dfM.daysSinceCapture>0].groupby('liznumber').daysSinceCapture.max(),name = 'males')]

data = [females, males]
layout = go.Layout(
    title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex',
        titlefont = dict(
            size = 20),
    xaxis = dict(
            title='Sex',
            titlefont=dict(
                size=18)),
    yaxis = dict(
            title='Greatest Number of Days Since<br> Initial Capture',
            titlefont=dict(
                size=18)))

fig = go.Figure(
        data = data,
        layout = layout)
py.iplot(fig, filename = 'Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex')

ValueError: 
    Invalid element(s) received for the 'data' property of 
        Invalid elements include: [[Box({
    'name': 'females', 'y': array([], dtype=float64)
})], [Box({
    'name': 'males', 'y': array([], dtype=float64)
})]]

    The 'data' property is a tuple of trace instances
    that may be specified as:
      - A list or tuple of trace instances
        (e.g. [Scatter(...), Bar(...)])
      - A list or tuple of dicts of string/value properties where:
        - The 'type' property specifies the trace type
            One of: ['area', 'bar', 'box', 'candlestick', 'carpet',
                     'choropleth', 'cone', 'contour',
                     'contourcarpet', 'heatmap', 'heatmapgl',
                     'histogram', 'histogram2d',
                     'histogram2dcontour', 'mesh3d', 'ohlc',
                     'parcoords', 'pie', 'pointcloud', 'sankey',
                     'scatter', 'scatter3d', 'scattercarpet',
                     'scattergeo', 'scattergl', 'scattermapbox',
                     'scatterpolar', 'scatterpolargl',
                     'scatterternary', 'splom', 'streamtube',
                     'surface', 'table', 'violin']

        - All remaining properties are passed to the constructor of
          the specified trace type

        (e.g. [{'type': 'scatter', ...}, {'type': 'bar, ...}])

In [None]:
females = go.Scatter(
    x = dfF.liznumber,
    y = dfF.groupby('liznumber').daysSinceCapture.max(),
    name = 'females',
    mode = 'markers',
    marker = dict(
        opacity = 0.75,
        line = dict(
            width = .5,
            color = 'rgb(0, 0, 0)'
        )
    )
)

males = go.Scatter(
    x = dfM.liznumber,
    y = dfM.groupby('liznumber').daysSinceCapture.max(),
    name = 'males',
    mode = 'markers',
    marker = dict(
        opacity = 0.75,
        line = dict(
            width = .5,
        )
    )
)

data = [females, males]

layout = dict(title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex',
              yaxis = dict(
                  title='Greatest Number of Days Since<br> Initial Capture',
                  titlefont=dict(
                      size=18)
              ),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex')

In [None]:
males = go.Histogram(x = dfM.capture,name='males')
females = go.Histogram(x = dfF.capture, name = 'females')
data = [males,females]
py.iplot(data, filename = 'Frequency of Captures by Sex in Crystal Creek 2000 - 2017')

In [None]:
df.capture.value_counts()