This notebook contains code and output of descriptive analyses for the 2000-2017 CC dataset after cleaning

In [1]:
import pandas as pd
import numpy as np
from ggplot import *
#Magic to make plots display

%matplotlib notebook

You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


Run the following chunk if running from local folder

In [2]:
df=pd.read_csv('C:\\Users\\Christopher\\Google Drive\\TailDemography\\outputFiles\\cleaned CC data 2000-2017_.csv')

Run the following chunk if running from google sheets

import gspread
from oauth2client.service_account import ServiceAccountCredentials
#use creds to create a client to interact with the Google Drive API
scope = ['https://spreadsheets.google.com/feeds']
creds = ServiceAccountCredentials.from_json_keyfile_name('TD_client.json', scope)
client = gspread.authorize(creds)

data = client.open("cleaned CC data 2000-2017_.csv").sheet1
df=pd.DataFrame(data.get_all_records())

Now we read the data in from Google Sheets

## Analyze the data

## Reducing the analyses sample by date range and capture

In [3]:
# convert date to pandas datetime
df.date=pd.to_datetime(df.date)
# limiting months to between May and August
df = df.loc[(df.date.dt.month>=5) & (df.date.dt.month<=8)]
# limit to first captures
df_first = df.sort_values(by=['lizardNumber','date'])
df_first = df_first.loc[~df_first.duplicated(subset='lizardNumber')]

### Reducing data to species and sex of interest

In [4]:
species2keep=['j']
df_first = df_first.loc[df_first.species.isin(species2keep)]
print ("\n{} of the original data set are entries belonging to a species of interest {}"\
       .format(df_first.shape[0],species2keep))
sex2keep=['m','f']
df = df_first.loc[df_first.sex.isin(sex2keep)]
print ("\n{} of the original data set are entries belonging to a sex categories of interest {}"\
       .format(df_first.shape[0],sex2keep))


941 of the original data set are entries belonging to a species of interest ['j']

941 of the original data set are entries belonging to a sex categories of interest ['m', 'f']


## Number of lizards (_Sj_) by year and sex

In [7]:
df.loc[:,'year'] = pd.to_numeric(df.loc[:,'year'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


In [9]:
svlTl = ggplot(aes(x = 'svl',y = 'tl',fill = 'sex'), data=df)
svlTl + geom_point() 


<IPython.core.display.Javascript object>

<ggplot: (-9223371918955767727)>

def mySummary (df, plotType = 'bar' , figsize = None, title = None , ylabel= None , percFormat = False , rot = 0, \
               xticks = None, x = None , y = None,legend = False, ax_other = None, hline = None, hlcolor = 'r',\
               hlstyle = '-', vline = None, vlcolor = 'r',ylim = None,vlstyle = '--'):
    import pandas as pd
    import matplotlib as plt
    %matplotlib notebook
    assert (isinstance(df,pd.DataFrame)|isinstance(df,pd.Series)|isinstance(df,pd.core.groupby.DataFrameGroupBy)\
           |isinstance(df,pd.core.groupby.SeriesGroupBy))
    assert ((figsize is None) | (isinstance (figsize, tuple)))
    assert isinstance (percFormat,bool)
    ax = df.plot(x=x , y=y , figsize = figsize, kind = plotType , rot = rot , title = title,\
                 legend = legend, ax = ax_other,ylim = ylim)
    ax.set_ylabel(ylabel)
    ax.axhline(y = hline, color = hlcolor, linestyle = hlstyle)
    ax.axvline(x = vline,color = vlcolor, linestyle = vlstyle)
    if percFormat == False:
        res = ax
    else:
        vals = ax.get_yticks()
        ax.set_yticklabels(['{:3.2f}%'.format(x*100) for x in vals])
        res = ax
    return res

maleSjXYr = pd.Series(df.loc[df.sex=='m'].groupby('year',as_index = False).\
                      lizardNumber.nunique())
femaleSjXYr = pd.Series(df.loc[df.sex=='f'].groupby('year',as_index = False).\
                        lizardNumber.nunique())
years = df.year.astype('int').unique()
SjSexXYr = pd.concat([maleSjXYr,femaleSjXYr],axis=1,keys = ['males','females'])
SjSexXYr['year'] = str(SjSexXYr.index+2000)
SjSexXYr.head()

## Number of recaptured lizards by sex and year

In [15]:
newRecapKeep = ['recap', 'new', 'r', 'n']
df.loc[~df['new.recap'].isin(newRecapKeep),'new.recap'] = np.nan
df['new.recap'].unique()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


array([nan, 'recap', 'new', 'r', 'n'], dtype=object)

In [17]:
capStatusXyear = ggplot(aes(x = 'year', y='new.recap'),data = df)
capStatusXyear + geom_line()

<IPython.core.display.Javascript object>

AttributeError: 'Series' object has no attribute 'find'

In [None]:
# clean new.recap
new = ['new', 'n', 'new?']
recap = ['recap', 'r']
df.loc[df['new.recap'].isin(new),'new.recap'] = 'n'
df.loc[df['new.recap'].isin(recap),'new.recap'] = 'r'

In [None]:
#overall new.recap
SjNewXYr = pd.Series(df.loc[df['new.recap']=='n'].groupby(['year'],as_index = False).\
                     lizardNumber.nunique())
SjRecapXYr = pd.Series(df.loc[df['new.recap']=='r'].groupby(['year'],as_index = False).\
                       lizardNumber.nunique())
SjCapXYr = pd.concat([SjNewXYr,SjRecapXYr],axis=1,keys = ['new','recap'])
SjCapXYr.plot(kind = 'bar', title = 'S. jarrovii Capture status')

In [None]:
#Males
MSjNewXYr = pd.Series(df.loc[(df['new.recap']=='n') & (df.sex == 'm') ].groupby(['year'],as_index = False).lizardNumber.nunique())
MSjRecapXYr = pd.Series(df.loc[(df['new.recap']=='r')& (df.sex == 'm')].groupby(['year'],as_index = False).lizardNumber.nunique())
MSjCapXYr = pd.concat([MSjNewXYr,MSjRecapXYr],axis=1,keys = ['new','recap'])
MSjCapXYr.plot(kind = 'bar', title = 'Distribution of recaptures by year among male _S.jarrovii_')

In [None]:
#Female
FSjNewXYr = pd.Series(df.loc[(df['new.recap']=='n') & (df.sex == 'f') ].groupby(['year'],as_index = False).lizardNumber.nunique())
FSjRecapXYr = pd.Series(df.loc[(df['new.recap']=='r')& (df.sex == 'f')].groupby(['year'],as_index = False).lizardNumber.nuniqugcefe())
FSjCapXYr = pd.concat([FSjNewXYr,FSjRecapXYr],axis=1,keys = ['new','recap'])
FSjCapXYr.plot(kind = 'bar', title = 'Distribution of recaptures by year among female _S.jarrovii_')

### Notes:
- limit range of summary analysis to captures in May-August
- only use last/first capture of an individual (based on lizardNumber) in that period

#### Analyses
(For _Sj_ only)
- By year X sex
    - N animals (unique)
        - N recaptured
            - N  years each animal appears
            - \* Growth over period between recaptures (first SVL as a covariate)


## Number of years captured

In [None]:
print("\nThere are {} entries and {} unique lizards.".format(df.shape[0],df.lizardNumber.unique()))

In [None]:

#yearsCap = []
#for liz in df.loc['lizardNumber'].unique():
#    print(df.loc[df.lizardNumber == liz,'year'].unique())


In [None]:
df.year.unique()

In [None]:
print(df.columns)

Here we remove unnecessary columns from df.

keepCols = ['autotomized','date','location','mass','meters','misc','new.recap','paint.mark'\
         ,'painted','rtl','sex','sighting','species','svl','tl','toes','vial','year']
print(df[keepCols].columns)

In order to plot the data, we need to force svl, tl, rtl and mass to numeric variables.

In [None]:
print("\nThere are {} data points in our data set.".format(df.shape[0]))
print("\nThe columns in the data have the following data types:\n{}".format(df.dtypes))

The following tables exclude non-ideal values for the variables in question, but once the columns are cleaned in the source file, this won't be an issue.

Drop data for species other than 'j' and 'v'.

In [None]:
species2keep=['j','v']
df = df.loc[df.species.isin(species2keep)]
print ("\n{} of the original data set are entries belonging to a species of interest {}".format(df.shape[0],species2keep))

Dropping non-ideal data for sex.

In [None]:
print(df.shape[0])
sex2keep=['m','f']
df = df.loc[df.sex.isin(sex2keep)]
print ("\n{} of the original data set are entries belonging to a sex categories of interest {}".format(df.shape[0],sex2keep))

In [None]:
df['tl_svl']=(df.tl/df.svl)
df['mass_svl']=(df.mass/df.svl)

Create age_class index
Let's change the terminology: 
    - we know nothing about reproductive capability
- S. jarrovii: Ballinger
- S. virgatus: Rose 1981 although Smith, Ballinger and Rose 1985 seems makes a case for moving female demarcation closer to 50mm
**NOTE:** confirm cut point for size at maturity

Not implementing the chunk below b/c of discussion 

df['age_class']=pd.np.nan
df['age_class'].loc[(df.svl>=56)&(df.species=='j'),]='adult'
df['age_class'].loc[(df.svl<56)&(df.species=='j'),]='juvenile'
#Rose 1981 although Smith, Ballinger and Rose 1985 seems makes a case for moving female demarcation closer to 50mm
df['age_class'].loc[(df.svl>=51)&(df.species=='v')&(df.sex=='m'),]='adult'
df['age_class'].loc[(df.svl>=56)&(df.species=='v')&(df.sex=='f'),]='adult'
df['age_class'].loc[(df.svl<51)&(df.species=='v')&(df.sex=='m'),]='juvenile'
df['age_class'].loc[(df.svl<56)&(df.species=='v')&(df.sex=='f'),]='juvenile'

Create boolean flag to drop data from analyses

In [None]:
df['myDrop']= pd.np.nan
df['dropReason']= pd.np.nan

Populate 'myDrop' column bsed on outliers in data

df.loc[((df.svl>75)& (df.species=='v')),'myDrop']=True
df.loc[((df.svl>75)& (df.species=='v')),'dropReason']='svl;species'
df.loc[((df.species=='j')&(df.mass>40)),'myDrop']=True
df.loc[((df.species=='j')&(df.mass>40)),'dropReason']='svl;mass'
df.loc[((df.species=='v')&(df.mass>25)),'myDrop']=True
df.loc[((df.species=='v')&(df.mass>25)),'dropReason']='svl;mass'
df.loc[df.meters< -50,'myDrop']=True
df.loc[df.meters< -50,'dropReason']='meters'
df.myDrop.value_counts()

In [None]:
df.loc[df.myDrop==True].reset_index()

data = client.open("Outliers to check(2000-2017).csv").sheet1
dfChecked=pd.DataFrame(data.get_all_records())

dfChecked.columns=['index',dfChecked.columns[1:len(dfChecked.columns)]]

#This chunk merges in changes to data made in "Outliers to check (2000-2017).csv"
print (df.shape)
df.merge(copy=dfChecked,how='inner',right_on=)

Create a dataframe of values based on myDrop==True and export to csv.

In [None]:
df2run=df.loc[df.myDrop!=True]
df2Check=df.loc[df.myDrop==True,]
df2Check.head()

In [None]:
df2Check.to_csv("Outliers to check(2000-2017).csv")

In [None]:
df.to_csv("Descriptive Analyses of CC Data (2000-2017).csv")

## Summary Analyses

### Tables

**NOTE:**  We need to format these into laTex tables or something else that has borders.
try pd.DataFrame.crosstab

In [None]:
df.loc[df.myDrop!=True].groupby(['species','sex'])['autotomized'].count()

In [None]:
df.loc[df.myDrop!=True].groupby(['species','sex','autotomized'])['new.recap'].count()

**NOTE:  The plots below need to be edited to better label the axes.**

**NOTE** exclude outliers and rerun (include if mass_svl<0.6)

## Adult lizards

### S. jarrovii
#### SVL

In [None]:
species=['S. jarrovii', 'S. virgatus']
print(species)
sex=['female','male']
print(sex)

In [None]:
# pull out all individuals that we've recaught for Sj and writes to csv
multicapToes=df2run.loc[(df2run.species=='j')& (df2run.toes!="")& (df2run.toes!='NA')].toes.value_counts()[df2run.loc[df2run.species=='j'].toes.value_counts()>1].index.tolist()
df2run.loc[df2run.toes.isin(multicapToes)].sort_values(by=['toes','date']).to_csv('multicaps.csv')

# Create function to generate lizardNumber 
 lizard number is a numeric identifier of unique animals in the data set
function takes the following arguments:
    - *x*: series object on which function acts on
    - *sortCriteria*: list of strings of column names on which to sort data.  data are sorted by columns from left to right
    - *validationCriteria*: dictionary of dictionaries that identify columns to validate and validation expression of the form {{'column':'column_2 >= column_1'},{'otherColumn':'column_2 >= column_1'}}
    - *result*: dictionary of of dictionaries detailing the value *x* takes if validations are True or False of the form: {{'True':x=x[i]},{'False':x=x[i]+1},{errors: 'raise'}}, errors may be 'raise' *default* (terminates function and returns an error) or 'ignore' (returns 'NA')
Function action:
- first sort data by species, toes, then date

- for time points 1 and , with 2 being later: 
    - toes2 == toes1
    - svl2-svl1 >=-2
    - year2-year1 <=7
    - for species ==j:
        - if svl >=56:
            - if sex2==sex1:
                lizardNumber[i+1]=lizardNumber[i]
          else:
            - lizardNumber[i+1]=lizardNumber[i]+1
    

In [None]:
df.sort_values(by='date').groupby('toes').toes.value_counts()

In [None]:
#This is the function
#First, sort by date and groupby toes
tmp=df.sort_values(by='date').groupby('toes')
tmp['lizardNumber']=pd.nan
for i in 1:len(tmp.count.max()) in tmp:
    try:
        if tmp.loc[obs,'toes'] == tmp.loc[(obs-1),'toes']:
            tmp.lizardNumber=


In [None]:
ax=df2run.loc[(df.species=='j')&(df.sex=='m')].svl.plot(kind='kde',color='b',xlim=(0,120))
jSvlKde=df2run.loc[(df.species=='j')&(df.sex=='f')].svl.plot(kind='kde',color='r',xlim=(0,120),ax=ax)

In [None]:
jSvl=df2run.loc[df2run.species=='j'].boxplot(column='svl',by=['sex','age_class'])
jSvl.set_title(species[0])
jSvl.set_ylabel('SVL (mm)')
jSvl.set_ylim(0,110)
plt.suptitle("")

May want to consider running growth by month and see where these outliers end up

In [None]:
vSvl=df2run.loc[df2run.species=='v'].boxplot(column='svl',by=['sex','age_class'])                                
vSvl.set_title(species[1])
vSvl.set_ylabel('SVL (mm)')
vSvl.set_ylim(0,110)
plt.suptitle("")

#### TL

There are some hi juveniles values here

In [None]:
jTL=df2run.loc[df2run.species=='j'].boxplot(column=['tl'],by=['sex', 'age_class'])
jTL.set_title(species[0])
jTL.set_ylabel('TL (mm)')
jTL.set_ylim(0,140)
plt.suptitle("")

In [None]:
df2runa=df2run[df2run.age_class=='adult']
jTL=df2runa.loc[df2runa.species=='j'].boxplot(column=['tl'],by=['sex', 'autotomized'])
jTL.set_title(species[0])
jTL.set_ylabel('TL (mm)')
jTL.set_ylim(0,140)
plt.suptitle("")

Investigate f and m sj  intact, but <40; recalculate autotomized label

In [None]:
vTL=df2run.loc[df2run.species=='v'].boxplot(column=['tl'],by=['sex', 'age_class'])
vTL.set_title(species[1])
vTL.set_ylabel('TL (mm)')
vTL.set_ylim(0,140)
plt.suptitle("")

Why is it that outliers for both species start at around 45mm?

In [None]:
vTL=df2runa.loc[df2runa.species=='v'].boxplot(column=['tl'],by=['sex', 'autotomized'])
vTL.set_title(species[1])
vTL.set_ylabel('TL (mm)')
vTL.set_ylim(0,140)
plt.suptitle("")

investigate m intact <40 and female autotomized ~10

#### mass

**NOTE:** There are still some low mass Sj adults that we need to investigate

In [None]:
jMass=df2run.loc[df2run.species=='j'].boxplot(column=['mass'],by=['sex', 'age_class'])
jMass.set_title(species[0])
jMass.set_ylabel('Mass (g)')
jMass.set_ylim(0,35)
plt.suptitle("")

In [None]:
vMass=df2run.loc[df2run.species=='v'].boxplot(column=['mass'],by=['sex', 'age_class'])
vMass.set_title(species[1])
vMass.set_ylabel('Mass (g)')
vMass.set_ylim(0,35)
plt.suptitle("")

#### TL/SVL Ratio
**NOTE:** There are still some low SVL/TL ratios here for adults that we need to look into

In [None]:
jTlSvl=df2run.loc[df2run.species=='j'].boxplot(column=['tl_svl'],by=['sex', 'age_class'])
jTlSvl.set_title(species[0])
jTlSvl.set_ylabel('TL/SVL')
jTlSvl.set_ylim(0,2.25)
plt.suptitle("")

In [None]:
vTlSvl=df2run.loc[df2run.species=='v'].boxplot(column=['tl_svl'],by=['sex', 'age_class'])
vTlSvl.set_title(species[1])
vTlSvl.set_ylabel('TL/SVL')
vTlSvl.set_ylim(0,2.25)
plt.suptitle("")

In [None]:
jMassSvl=df2run.loc[df2run.species=='j'].boxplot(column=['mass_svl'],by=['sex', 'age_class'])
jMassSvl.set_title(species[0])
jMassSvl.set_ylabel('Mass/SVL (g/mm)')
jMassSvl.set_ylim(0,.8)
plt.suptitle("")

In [None]:
jMaleMassSvl=df2run.loc[(df2run.species=='j')&(df2run.sex=='m')].plot.scatter(x='svl',y='mass',edgecolors='red',facecolors='none')
jFeMassSvl=df2run.loc[(df2run.species=='j')&(df2run.sex=='f')].plot.scatter(x='svl',y='mass',edgecolors='blue', facecolors='none',ax=jMaleMassSvl)
jFeMassSvl.set_title(species[0])
jFeMassSvl.set_xlabel('SVL (mm)')
jFeMassSvl.set_ylabel('Mass (g)')
#vFeMassSvl.set_ylim(0,.5)
plt.suptitle("")

In [None]:
vMassSvl=df2run.loc[df2run.species=='v'].boxplot(column=['mass_svl'],by=['sex'])
vMassSvl.set_title(species[1])
vMassSvl.set_ylabel('Mass/SVL (g/mm)')
vMassSvl.set_ylim(0,.5)
plt.suptitle("")

In [None]:
vMaleMassSvl=df2run.loc[(df2run.species=='v')&(df2run.sex=='m')].plot.scatter(x='svl',y='mass',edgecolors='red',facecolors='none')
vFeMassSvl=df2run.loc[(df2run.species=='v')&(df2run.sex=='f')].plot.scatter(x='svl',y='mass',edgecolors='blue', facecolors='none',ax=jMaleMassSvl)
vFeMassSvl.set_title(species[1])
vFeMassSvl.set_xlabel('SVL (mm)')
vFeMassSvl.set_ylabel('Mass (g)')
vFeMassSvl.set_xlim(left=0,right=100)
plt.suptitle("")

In [None]:
df2run.loc[(df2run.sex=='m')].groupby(['species','sex']).plot.scatter(x='svl',y='tl',edgecolors=['red','blue'],facecolors='none')
#MassSVLdf2run.loc[(df2run.sex=='m')].groupby('species').plot.scatter(x='svl',y='mass',edgecolors='red',facecolors='none')

In [None]:
juvMassSVL=df.loc[(df.age_class=='juvenile')&(df.myDrop!=True),].groupby('species').boxplot(column=['mass_svl'],by=['sex'])

In [None]:
adultMassSVL=df2run.loc[(df2run.sex=='m')].groupby(['species','sex']).plot.scatter(x='svl',y='mass',edgecolors=['red','blue'],facecolors='none')
#MassSVLdf2run.loc[(df2run.sex=='m')].groupby('species').plot.scatter(x='svl',y='mass',edgecolors='red',facecolors='none')
#adultMassSVL.set_title(species)
#adultMassSVL.set_ylabel('Mass/SVL (g/mm)')
#adultMassSVL.set_ylim(0,.5)
#plt.suptitle("")

Add groupby arguments that include species ageclass and sex for all summaries
    - consider adding year
Types of visualizations:
- tables (autotomy, new/recap (1st sightings only)
- boxplots (svl, tl, rtl, mass)
- histograms (age class (svl), meters (location))

For inferential stats
- differences:
    - between seasons within years 
    - between years (weather and fire)
    - population density (revist how to calculate this)
        - ran study until flatline
        - do we need to account for person-hours still?

The following histograms show the distribution of animals linearly along the site.  The x-axis is location in meters and the y axis in the number of animals.  The graphs are separated by sex and species.

The differences between the adults and juvenile are interesting, no?

Adults

In [None]:
#Scale the figures so that the y axes are the same
df2run.loc[(df2run.myDrop!=True)&(df2run.species=='v')].groupby('year').hist(column='svl',by=['sex'])
#df.loc[(df.myDrop!=True)].groupby(df.date.mo).hist(column='svl',by=['sex'])

Juveniles

In [None]:
df.loc[(df.myDrop!=True)&(df.age_class=='juvenile')].groupby('species').hist(column='svl',by=['sex'])

TL - Adults

In [None]:
#Standardize x and y axes
df.loc[(df.myDrop!=True)&(df.age_class=='adult')].groupby('species').hist(column='tl',by=['sex'])

Juveniles

In [None]:
df.loc[(df.myDrop!=True)&(df.age_class=='juvenile')].groupby('species').hist(column='tl',by=['sex'])

Overall view of tail loss
**NOTE:** The autotomized==True argumetn is throwing an error here for some reason and rtl!=0 may exclude autotomized individuals which haven't regrown tail. Have to chase this down later

In [None]:
#df.loc[(df.myDrop!=True)&(df.age_class=='adult')&(df.autotomized==True)].hist(column='rtl',by=['species','sex'])
#df.loc[(df.myDrop!=True)&(df.age_class=='juvenile')&(df.autotomized==True)].hist(column='rtl',by=['species','sex'])
#df.loc[df.rtl!=0].hist(column='rtl',by=['species','sex'])

Adults

In [None]:
df.loc[((df.age_class=='adult')&(df.myDrop!=True)),].groupby('species').hist(column='tl',by=['sex'])

Juveniles

In [None]:
df.loc[(df.myDrop!=True)&(df.age_class=='juvenile')].groupby('species').hist(column='svl',by=['sex'])

In [None]:
df.loc[((df.myDrop!=True)&(df.age_class=='adult')),].groupby('species').hist(column='mass',by=['sex'])