# Cleaning CC data

This Python notebook operates on a CSV file that was exported after editing in OpenRefine. It finishes cleaning the columns of interest that were easier to clean in Python.

## Setting up Python

Here we import necessary packages. 
This chunk may take a while.

In [1]:
import pandas as pd
import numpy as np
import os
import liznumber as ln
import liztoes as lt

import plotly
import plotly.plotly as py
import plotly.graph_objs as go

plotly.tools.set_config_file(world_readable=True)

# increase print limit
pd.options.display.max_rows = 99999
pd.options.display.max_columns = 50

ModuleNotFoundError: No module named 'liznumber'

## Functions necessary for this notebook

# import pandas as pd
def report_pattern (x , pattern , col,return_type):
    """Print how many entries of ``x[col]`` match the regular expression ``pattern``.

    :param x: pandas DataFrame holding the column to inspect
    :param pattern: regular expression handed to ``Series.str.match``
    :param col: name of the string column in ``x`` that is searched
    :param return_type: free-text label printed as the first line of the report
    :return: None -- the report is only printed
    """
    # Missing entries produce NaN from str.match; comparing with True counts them as non-matches.
    n_hits = (x[col].str.match(pattern)==True).sum()
    summary = '{}:\ntoe pattern {}:{}'.format(return_type,str(pattern),n_hits)

    # print() yields None, which is what this function hands back to the caller.
    return print(summary)

# import pandas as pd
#needs to capture when an entires fits multiple patterns and which patterns those are
def label_pattern (x , pat_num , pattern, pat_col = 'toe_pattern' , col = 'toes'):
    """Label the not-yet-labelled rows of ``x`` whose ``col`` value matches ``pattern``.

    Rows that already carry a label in ``pat_col`` are left alone, so when this is called
    for several patterns in turn the first matching pattern wins.

    :param x: pandas DataFrame; it is modified in place and also returned
    :param pat_num: label to store (converted with ``str``)
    :param pattern: regular expression handed to ``Series.str.match``
    :param pat_col: name of the existing column that receives the label
    :param col: name of the string column that is searched
    :return: ``x`` with ``pat_col`` updated
    """
    # A label column created as all-NaN is float64.  Writing string labels into a float
    # column is deprecated in recent pandas, so hold the labels as objects.
    if x[pat_col].dtype != object:
        x[pat_col] = x[pat_col].astype(object)

    # Missing entries produce NaN from str.match; comparing with True treats them as non-matches.
    is_match = x[col].str.match(pattern)==True
    is_unlabelled = x[pat_col].isnull()
    x.loc[is_match & is_unlabelled,pat_col] = str(pat_num)
    return x
    

# import pandas as pd
def make_str(x):
    """Return ``x`` converted to strings, with one-character values left-padded by ``'0'``.

    :param x: pandas Series (checked with an assertion)
    :return: new Series of strings on the same index, e.g. 1 -> '01', 12 -> '12'
    """
    assert isinstance(x,pd.Series)
    as_text = x.astype(str)
    # values shorter than two characters receive a leading zero
    too_short = as_text.str.len()<2
    return as_text.mask(too_short, '0' + as_text)

# import pandas as pd
def replace_pattern (x , pattern , pattern_b , source_col , replacement, regex = True):
    """Return corrected values for the rows of ``x`` whose ``source_col`` matches ``pattern``.

    :param x: pandas DataFrame holding ``source_col``
    :param pattern: regular expression (``Series.str.match``) selecting the rows to correct
    :param pattern_b: pattern that is replaced inside the selected values
    :param source_col: name of the string column to read
    :param replacement: text substituted for every occurrence of ``pattern_b``
    :param regex: treat ``pattern_b`` as a regular expression (default) or as literal text
    :return: Series containing only the selected rows, with the replacement applied;
        ``x`` itself is not modified
    """
    selected = x.loc[x[source_col].str.match(pattern)==True,source_col]
    # Be explicit: the pandas default for ``regex`` changed from True to False in 2.0, which
    # would silently turn patterns such as "^0" or "[()]" into literal text.
    return selected.str.replace(pattern_b,replacement,regex=regex)


### Use this chunk to read data from local folder on Chris' machine

In [None]:
# Source Data
sourceDataPers = 'C:/Users/Christopher/Google Drive/TailDemography/outputFiles'
sourceDataBig = 'S:/Chris/TailDemography/combined data'

#Output Data paths
outputPers = 'C:/Users/Christopher/Google Drive/TailDemography/outputFiles'
outputBig = 'S:/Chris/TailDemography/data'


In [None]:
os.chdir(sourceDataBig)
df=pd.read_csv('mapped-data-all_18-01-08_post_openrefine.csv')

os.chdir(outputBig)
df.head()


Let's take a look at the data

In [None]:
print("\nThere are {} data points in our data set.".format(df.shape[0]))

In [None]:
print("\nThe columns in the data have the following data types:\n{}".format(df.dtypes))

## Correcting class of columns

In [None]:
#We need to add real error handling into these conversion chunks

##Convert integer columns to int
# errors='ignore' leaves the data unchanged when a conversion fails
intCols = ['meters','year']
df[intCols]=df[intCols].astype(int,errors='ignore')

##Convert numeric columns to numeric
# values that cannot be parsed become NaN
numCols = ['svl','tl','rtl','rtl_orig','mass']
df[numCols]=df[numCols].apply(pd.to_numeric,errors='coerce')

##Convert string columns to str
# note: missing values become the literal string 'nan'
strCols = ['toes','sex','species','vial']
df[strCols]=df[strCols].astype(str, errors='ignore')

#Convert date to datetime
# Blank only the date cell: indexing with the row mask alone would overwrite every
# column of the matching rows with NaN.
df.loc[df.date=="NA",'date']=np.nan
df.date = pd.to_datetime(df.date,errors='coerce')

##Convert bool columns to bool
# NOTE(review): astype(bool) maps NaN and every non-empty string to True -- confirm these
# columns contain only real booleans before relying on the result.
boolCols = ['review_sex','review_species','review_painted','review_new.recap',\
            'review_rtl','forceMale','forceFemale','forceRecap','forceNew',\
            'forceSighting','drop_species','drop_morphometrics','autotomized']
df[boolCols]=df[boolCols].astype(bool, errors='ignore')

In [None]:
print("\nAfter applying the above changes, the data types are as follows:\n{}".format(df.dtypes))

## Remove leading and trailing whitespaces

# Iterating over a DataFrame yields the column *names*, so the cell values have to be
# stripped column by column.  Only text columns are touched, and non-string cells inside
# them (NaN, numbers, booleans) are passed through unchanged.
for col in df.columns:
    if pd.api.types.is_object_dtype(df[col]) or pd.api.types.is_string_dtype(df[col]):
        df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

## Cleaning toes column

First we will rename "toes" to "toes_orig"

In [None]:
df = df.rename(columns = {'toes':'toes_orig'},index = str)

Next we create a new column, "toes"  for the renamed toes

In [None]:
df['toes'] = df.toes_orig

Now we attempt to identify problematic toe names and either correct them or export them for review.

In [None]:
# Regular expressions describing known problems in the "toes" column.  They are applied with
# Series.str.match, which only anchors at the START of each entry.  Raw strings are used so
# that backslashes reach the regex engine without relying on invalid string escapes.
pattern1 = r".( {1,}-.|.- {1,}.)" # toes entries with any number of spaces on either side of a hyphen
pattern2 = r".( {,}\w{,} {1,})." # toes entries with space around or between numbers <- the spaces here should be deleted
pattern3 = r".(')." # entries with an apostrophe as the second character
pattern4 = r"./."  # entries with '/' <-- need to replace these with '-'
pattern5 = r"(\?{1,})" # entries starting with one or more '?' <-- these need to be investigated
pattern6 = r"^\d{3,}$" # entries consist of only a single number comprised of at least three digits
#<-- these need to be investigated by checking raw field notes
pattern7 = r".(-{2,})." # entries which have at least 2 consecutive '-' <- these should be investigated
pattern8 = r"^0" # entries that start with "0" <-- Check raw field notes on this too
# patterns 9 and 10: an 'a' or 'b' followed by any character in the set [a-zA-Z0-9_]
pattern9 = r"a\w" #<--handled hyphens should be inserted  between the [ab] and \w
pattern10 = r"b\w" #<--handled hyphens should be inserted  between the [ab] and \w
# patterns 11 and 12: an 'a' or 'b' preceded by any character in the set [a-zA-Z0-9_]
pattern11 = r"\wa"
pattern12 = r"\wb"
pattern13 = r"[()]" # entries that start with a parenthesis
# Still to do:
# remove space before 'a' at end of toes
#investigate '\d-', 
#'-(*)-', 
#' (16) ', 
#'---', <- may not exist in raw data
#'\d- ', 
#'- \d', 
#transcription errors from excel (toes in date format,
#'-\d\d\d\d' <- may not be in the data set

We'll have to change this block if we add or remove toe patterns.
This is not ideal and needs to be fixed

In [None]:
# label_pattern is defined in this notebook, so no external module is needed to show its help
help(label_pattern)

In [None]:
# All detection patterns, in the order in which they are applied.  Add or remove patterns
# here; the two-character labels below are derived from the length of this list.
toe_pattern_list = [pattern1,pattern2,pattern3,pattern4
                    ,pattern5,pattern6,pattern7,pattern8
                    ,pattern9,pattern10,pattern11,pattern12,pattern13]

# Zero-padded labels '01', '02', ... -- one per pattern
toe_pattern = pd.Series([*range(1,len(toe_pattern_list)+1)])
toe_pattern = make_str(toe_pattern)
print(toe_pattern)

toe_pattern_descr = pd.Series(toe_pattern_list)
toe_pattern_descr = toe_pattern_descr.astype(str)
print(toe_pattern_descr)

# Column order is pinned explicitly so that positional access elsewhere stays valid.
toe_pattern_reference = pd.DataFrame({'toe_pattern': toe_pattern,'description':toe_pattern_descr}
                                     ,columns = ['toe_pattern','description'])
toe_pattern_reference

We first replace the string 'nan' with a null value

In [None]:
df.loc[df.toes=='nan','toes'] = np.nan

Let's see how many of these patterns we need to correct

In [None]:
df['toe_pattern'] = np.nan

Here we use a for-loop to label the patterns 
(there's probably a better way to do this with pandas map or apply, but I'll have to figure this out, for now this is fast enough, but it could make a difference with a larger data set or with more patterns)

In [None]:
# Apply the patterns in table order; label_pattern only labels rows that are still
# unlabelled, so each entry keeps the first pattern it matches.
# Columns are read by name rather than by position so the loop does not depend on the
# column order of the reference table.
for reference_row in toe_pattern_reference.itertuples(index=False):
    df = label_pattern(df,reference_row.toe_pattern,reference_row.description,'toe_pattern','toes')

A quick summary of the number of observations for each pattern in the data set

In [None]:
# Count rows per pattern label (NaN = no pattern matched).  rename_axis/reset_index(name=...)
# names both output columns explicitly, which gives the same result whichever names
# value_counts assigns (these changed in pandas 2.0).
toe_errors = df.toe_pattern.value_counts(dropna=False)\
.rename_axis('toe_pattern').reset_index(name='observations')
toe_errors.loc[toe_errors.toe_pattern.isnull(),'toe_pattern'] = 'Not covered by current patterns'
# attach the regular expression that belongs to each label
toe_errors_desc = toe_errors.merge(toe_pattern_reference,'left',on='toe_pattern')
toe_errors_desc

Now let's make sure we've accounted for every row in the data set

In [None]:
accountedRows = toe_errors.observations.sum()
totalRows = df.shape[0]
notAccountedRows = df.shape[0] - toe_errors.observations.sum()
print("\nThere are {} rows accounted for in the patterns (including null values) and there {} rows in the full data set.\
  There are {} rows unaccounted for.".format(accountedRows,totalRows,notAccountedRows))

### And now we correct these patterns
We'll preserve the original toe data in a column called "toes_orig" just in case.  We can drop this later, if we are comfortable with the changes.  The new toes will be labeled "toes".

In [None]:
# What to do with the rows labelled with each toe pattern:
#   action      'replace' -> substitute pattern_b with replacement in the toes value
#               'save'    -> write the rows to a csv file for manual review
#   pattern_b   the text / regular expression that is substituted
#   replacement the text put in its place ('' deletes the match)
# NOTE(review): pattern 09 detects an 'a' followed by a character but its replacement puts the
# hyphen in front of the 'a' (and vice versa for 11); the same holds for 10/12 -- confirm.
corrections_config = {'01':{'action':'replace','pattern_b':" ",'replacement':""},
            '02':{'action':'replace','pattern_b':" ",'replacement':"-"},
            '03':{'action':'replace','pattern_b':"'",'replacement':""},
            '04':{'action':'replace','pattern_b':"/",'replacement':"-"},
            '05':{'action':'save','pattern_b':np.nan,'replacement':np.nan},
            '06':{'action':'save','pattern_b':np.nan,'replacement':np.nan},
            '07':{'action':'save','pattern_b':np.nan,'replacement':np.nan},
            '08':{'action':'replace','pattern_b':"^0",'replacement':""},
            '09':{'action':'replace','pattern_b':'a','replacement':'-a'},
            '10':{'action':'replace','pattern_b':'b','replacement':'-b'},          
            '11':{'action':'replace','pattern_b':"a",'replacement':"a-"},
            '12':{'action':'replace','pattern_b':"b",'replacement':"b-"},
            '13':{'action':'replace','pattern_b':"[()]",'replacement':""}}

In [None]:
toe_errors_desc['action'] = toe_errors_desc.loc[toe_errors_desc.toe_pattern.str.len()==2].toe_pattern\
.map(lambda x: corrections_config[x]['action'],na_action='ignore')

toe_errors_desc['replacement'] = toe_errors_desc.loc[toe_errors_desc.toe_pattern.str.len()==2].toe_pattern\
.map(lambda x: corrections_config[x]['replacement'],na_action='ignore')

toe_errors_desc = toe_errors_desc.sort_values('toe_pattern').reset_index(drop=True)
toe_errors_desc

In [None]:
# Walk through every pattern label found in the data and carry out the configured action.
for summary_row in toe_errors_desc.itertuples(index=False):
    tmp_pat_num = summary_row.toe_pattern
    tmp_pattern = summary_row.description   # regular expression that detected these rows
    # labels without an entry (e.g. 'Not covered by current patterns') get an empty config
    tmp_config = corrections_config.get(tmp_pat_num,{})
    action = tmp_config.get('action')
    is_pattern = df.toe_pattern==tmp_pat_num
    tmp_x = df.loc[is_pattern,:]

    if action =='save':
        tmp_filename = 'pattern'+tmp_pat_num+'.csv'
        tmp_x.to_csv(tmp_filename)
        print("Pattern {} successfully saved to {}.".format(tmp_pattern,tmp_filename))
    elif action =='replace':
        tmp_replacement = tmp_config['replacement']
        # Select rows with the detection regex and substitute the configured pattern_b;
        # the pattern number itself is only a label, not a regular expression.
        df.loc[is_pattern,'toes'] = replace_pattern(x=tmp_x
                                                    ,pattern = tmp_pattern
                                                    ,pattern_b = tmp_config['pattern_b']
                                                    ,source_col = 'toes'
                                                    ,replacement = tmp_replacement)
        print("Pattern {} successfully replaced with {}.".format(tmp_pattern,tmp_replacement))
    else:
        print("No direction provided for pattern {}.  No action was taken.".format(tmp_pattern))

### Now we confirm that the patterns we expect to have eliminated have indeed been eliminated from the data set

In [None]:
# Re-count the matches of every detection pattern in the corrected "toes" column.
# The regular expressions are read by column name rather than by position.
for tmp_pattern in toe_pattern_reference['description'].astype(str):
    report_pattern(df,tmp_pattern,'toes','Post-Correction')

## Cleaning Sex column
Next we move on to cleaning the "sex" column.

First we want to get an idea of the types of problems in the sex column.  We start by stripping leading and trailing whitespace.  You can see here that there was none in the data set.

In [None]:
print(df.sex.str.len().unique())# returns unique lengths of sex
df.sex=df.sex.str.strip()
print(df.sex.str.len().unique())

### Identify non "m" or "f" values and their frequencies

In [None]:
patterns_sex="m|f|NA"
# str.match only anchors at the start of each value, so without an end anchor values such
# as "male" or "f?" would count as matching "m" / "f".
exact_sex = "(?:{})$".format(patterns_sex)
non_matches=df.sex.loc[df.sex.str.match(exact_sex)!=True]
print("\nThere are {} entries for sex which do not match the patterns {}:"\
      .format(non_matches.shape[0],patterns_sex.split("|")))
non_matches.value_counts()

### Identify values to convert to NA, m, or f

In [None]:
sex2NA=['adult','juv','nan']
sex2m=['unm']
df.loc[df.sex.isin(sex2NA)==True]
print(df.sex.loc[df.sex.isin(sex2NA)==True].count())
print(df.sex.loc[df.sex.isin(sex2m)==True].count())

### Convert the values to NA or m, respectively.

In [None]:
df.loc[df.sex.isin(sex2m)]

In [None]:
df.loc[df.sex.isin(sex2NA),'sex']=np.nan
df.loc[df.sex.isin(sex2m),'sex']='m'
print(df.sex.loc[df.sex.isin(sex2NA)==True].count())
print(df.sex.loc[df.sex.isin(sex2m)==True].count())

# Set all remaining species and sex with "?" to NaN

In [None]:
# na=False makes missing values count as "does not contain '?'", giving a clean boolean
# mask; the raw string keeps the backslash for the regex engine.
df.loc[df.species.str.contains(r'\?',na=False),'species'] = np.nan
df.loc[df.sex.str.contains(r'\?',na=False),'sex'] = np.nan

# Cleaning autotomized column

In [None]:
autotomyDict = {False:'intact',True:'autotomized'}

df.loc[:,'autotomized'] = df.loc[:,'autotomized'].map(autotomyDict)
df.autotomized.unique()

# Cleaning new.recap column

In [None]:
df.head()

In [None]:
#try using a dict to do thing more efficiently
newRecapKeep = ['recap', 'new', 'r', 'n']
new = ['new','n']
recap = ['recap','r']
df.loc[~df['new.recap'].isin(newRecapKeep),'new.recap'] = np.nan
df.loc[df['new.recap'].isin(new),'new.recap'] = 'new'
df.loc[df['new.recap'].isin(recap),'new.recap'] = 'recap'

## Add Columns

In [None]:
# tl_svl and mass_svl
df['tl_svl']=(df.tl/df.svl)
df['mass_svl']=(df.mass/df.svl)

## Create function to generate lizardNumber 
 lizard number is a numeric identifier of unique animals in the data set
function takes the following arguments:
    - *x*: series object on which function acts on
    - *sortCriteria*: list of strings of column names on which to sort data.  data are sorted by columns from left to right
    - *validationCriteria*: dictionary of dictionaries that identify columns to validate and validation expression of the form {{'column':'column_2 >= column_1'},{'otherColumn':'column_2 >= column_1'}}
    - *result*: dictionary of dictionaries detailing the value *x* takes if validations are True or False, of the form: {{'True':x=x[i]},{'False':x=x[i]+1},{errors: 'raise'}}; errors may be 'raise' *default* (terminates function and returns an error) or 'ignore' (returns 'NA')
Function action:
- first sort data by species, toes, then date

- for time points 1 and 2, with 2 being later: 
    - toes2 == toes1
    - svl2-svl1 >=-2
    - year2-year1 <=7
    - for species ==j:
        - if svl >=56:
            - if sex2==sex1:
                lizardNumber[i+1]=lizardNumber[i]
          else:
            - lizardNumber[i+1]=lizardNumber[i]+1

In [None]:
import pandas as pd
import os

sortCriteria = ['species','toes', 'sex']
validation = ['date','svl']


def lizsort(x,path:str,sortCriteria = ['species','toes', 'sex'], validation = ['date','svl'],\
            unsortablefile ='unsortable.csv'):
    """
    Split a data frame into rows that can be evaluated and rows that cannot.

    A row can be evaluated when none of the columns named in ``sortCriteria`` and
    ``validation`` is null.  All other rows are written to ``unsortablefile`` and a summary
    is printed.  Side effect: the working directory is changed to ``path``.

    :param x: pandas DataFrame containing every column in ``sortCriteria`` and ``validation``
    :param path: directory that becomes the working directory and receives the csv file
    :param sortCriteria: column names identifying an animal
    :param validation: column names needed for the later validation step
    :param unsortablefile: file name for the rows that cannot be evaluated
    :return: the rows of ``x`` with no null value in any of the critical columns
    """
    required = sortCriteria +validation
    # one mask serves both halves of the split
    has_gap = x.loc[:,required].isnull().any(axis=1)
    rejected = x.loc[has_gap]
    kept = x.loc[~has_gap]

    # set aside the incomplete rows for later evaluation
    os.chdir(path)
    rejected.to_csv(unsortablefile)
    print("\nThere were {} entries for which values for one of the critical criteria, ({}), were null.  \
    These entries could not be evaluated and were written out to the file {} for evaluation."\
          .format(rejected.shape[0],required,unsortablefile))
    return kept

def mindate(x, sortCriteria = ['species','toes', 'sex']): # finds date of the initial capture of an animal
    """
    Add the date of first capture and the years elapsed since then to every row.

    For each unique combination of ``sortCriteria`` the earliest ``date`` is stored in a new
    column ``initialCaptureDate``; ``year_diff`` is the calendar-year difference between
    ``date`` and that first date.  An existing ``initialCaptureDate`` column is recomputed.
    [Requires that the source dataframe, x, has a datetime column labeled 'date'.]

    :param x: pandas DataFrame with the ``sortCriteria`` columns and a 'date' column
    :param sortCriteria: column names identifying an animal
    :return: new DataFrame with 'initialCaptureDate' and 'year_diff' added
    """
    # Recompute from the frame that was passed in, not from a notebook-level variable.
    if 'initialCaptureDate' in x.columns:
        x = x.drop(columns='initialCaptureDate')
    # The group keys keep their original dtype so that the merge below can match them.
    first_capture = x.groupby(sortCriteria).date.min().rename('initialCaptureDate').reset_index()
    x = x.merge(first_capture,how = 'left', on = sortCriteria)
    x['year_diff'] = x.date.dt.year - x.initialCaptureDate.dt.year
    return x

def smallest(x, svlGroup = ['species','toes', 'sex','initialCaptureDate']):#needs to be moved out of function
    """
    Add the smallest svl recorded within each group and each row's difference from it.

    :param x: pandas DataFrame with the ``svlGroup`` columns and an 'svl' column
    :param svlGroup: column names defining the groups over which the minimum svl is taken
    :return: new DataFrame with 'smallest_svl' (group minimum of svl) and
        'svl_diff' (svl - smallest_svl) added; an existing 'smallest_svl' is recomputed
    """
    if 'smallest_svl' in x.columns:
        # keyword form: the positional ``axis`` argument of drop was removed in pandas 2.0
        x = x.drop(columns='smallest_svl')
    group_minimum = x.groupby(svlGroup).svl.min().rename('smallest_svl').reset_index()
    x = x.merge(group_minimum,how = 'left', on = svlGroup)
    x['svl_diff'] = x.svl - x.smallest_svl
    return x

def validate (x, sortCriteria = ['species','toes', 'sex'],validation = ['date','svl']):
    """
    Assign a lizard number to every (species, sex, toes) group that has passing rows.

    A row passes when ``year_diff <= 7`` and ``svl_diff >= -2``.  Groups containing at least
    one passing row are numbered 1, 2, 3, ... in sorted group order; the number is attached
    to every row of the group as 'liznumber' (NaN for groups without a passing row).

    :param x: pandas DataFrame with columns 'species', 'sex', 'toes', 'year_diff',
        'svl_diff' and 'smallest_svl'; it is not modified
    :param sortCriteria: only used in the printed summary
    :param validation: only used in the printed summary
    :return: dict with
        'val_data'    -- all rows plus 'liznumber', without 'tmp' and 'smallest_svl'
        'n_val_data'  -- the rows that PASS the check (including the helper column 'tmp')
        'n_validated' -- number of rows that do not pass
    """
    # helper column of ones; assign() works on a copy so the caller's frame stays untouched
    x = x.assign(tmp = 1)
    passes = (x.year_diff<=7) & (x.svl_diff>=-2)
    passing_rows = x.loc[passes,:]

    # min() of the ones gives 1 per group; the running sum turns that into 1, 2, 3, ...
    numbers = passing_rows.groupby(['species','sex','toes']).tmp.min().cumsum().reset_index()
    numbers = numbers.rename(columns={'tmp':'liznumber'}) # rename last column to liznumber
    validated = passing_rows.shape[0]
    not_validated = int((~passes).sum())
    # NOTE(review): despite its key name, 'n_val_data' has always held the rows that pass the
    # check; downstream cells re-process it, so it is kept as is -- confirm the intent.
    not_val_data = passing_rows

    # Merge the numbers onto the full data so every record of an animal carries its number,
    # then drop the helper columns (keyword form: positional ``axis`` was removed in pandas 2.0).
    x = x.merge(numbers,'left', on = ['species','sex','toes']).drop(columns=['tmp','smallest_svl'])
    print("\nOf those entries we can handle, there are {} individuals as defined by {} which pass validataion based\
    on {} and {} which do not pass validation."\
          .format(validated,sortCriteria,validation,not_validated))
    return {'val_data':x,'n_val_data':not_val_data,'n_validated':not_validated}

# def genliznum2(df, path:str, errors:str= 'raise'):
#     """
#     calls functions to generate a unique identifier for each lizard
    
#     Lizard number is a numeric identifier of unique animals in the data set function takes the following arguments:
#     :param df:  series object on which function acts on
#     :param sortCriteria: list of strings of column names on which to sort data.  data are sorted by columns from left \
#     to right
#     :param validation: dictionary of dictionaries that identify columns to validate and validation expression of the form:\
#      {{'column':'column_2 >= column_1'},{'otherColumn':'column_2 >= column_1'}}
#     :param errors: str , errors may be 'raise' *default* (terminates function and returns an error) or 'ignore' (returns 'NA')
#     :return: dataframe
#     #dictionary  of dictionaries detailing the value *x* takes if validations are True or False of the form: \
#     #{{'True':x=x[i]},{'False':x=x[i]+1},{errors: 'raise'}}
#     """
#     sortable = lizsort(df, path = path)
#     sortable = mindate(sortable)
#     sortable = smallest(sortable)
#     tmp_sort = validate(sortable)
#     sortable = tmp_sort['val_data']
#     n_val = mindate(tmp_sort['n_val_data'])
#     n_val = smallest(n_val)
#     n_val = validate(n_val)['val_data']
 
#     res = n_val
#     return res


# genliznum2 exists only as commented-out code, so calling it raises a NameError on a fresh
# kernel.  The call is kept for reference; restore it together with the function definition.
# genliznum2(df, path = 'C:\\Users\\Christopher\\Documents\\GitHub\\tailDemography\\data')

### Initial attempt to assign lizard numbers

In [None]:
sortable = lizsort(df, path = 'S:\\Chris\\TailDemography\\data')
    
sortable = mindate(sortable)
sortable = smallest(sortable)
tmp_sort = validate(sortable)
sortable = tmp_sort['val_data']

### Second attempt to assign lizard numbers

In [None]:
n_val = mindate(tmp_sort['n_val_data'])
n_val = smallest(n_val)
df_numbered = validate(n_val)['val_data']

### Displaying the output data frame

In [None]:
df_numbered

### QC of lizard numbers

Identify individuals that have same species and toes, but different sex for review

In [None]:
df = df.merge(df.groupby(['species','toes']).sex.nunique().reset_index().rename(columns = {'sex':'sex_count'})\
         ,how = 'inner', on = ['species','toes'])
print(df.loc[df.sex_count>1,:].shape[0])
df.loc[df.sex_count>1,:].to_csv('entries flagged with same species and toes diff sex.csv')
df.head()

In [None]:
df.groupby(['species','toes']).sex.nunique()

In [None]:
print("Lizard Numbers in the sample range from {} to {}."\
      .format(df_numbered.liznumber.min(),df_numbered.liznumber.max()))

In [None]:
possibleLizNum = set(range(int(df_numbered.liznumber.min()),int(df_numbered.liznumber.max())))
actualLizNum = set(pd.Series(df_numbered.liznumber.unique()).dropna().apply(int))
print("\nThere are {} entries.  There are {} unique lizard numbers.\
\n\nThe liznumber ranges from {} to {}."\
  .format(df_numbered.shape[0],len(df_numbered.liznumber.unique())\
          ,df_numbered.liznumber.min(),df_numbered.liznumber.max()))

missingLizNum = possibleLizNum - actualLizNum
if len(missingLizNum)>0:
    print("\n\nThe following numbers are not assigned to a lizard:\n{}"\
      .format(missingLizNum))
else:
    print("\n\nThere are no numbers which were not assigned.")

## Add additional columns
- *daysSinceCapture* [int]:identifies the number of days since the animal was captured
- *capture* [int]: identifies the number of times an animal has been captured prior to an entry

In [None]:
df_numbered.loc[:,'daysSinceCapture'] = (df_numbered.date - df_numbered.initialCaptureDate).dt.days


In [None]:
# need to QC this
df_numbered['capture'] = df_numbered.sort_values(['liznumber','date'])\
.groupby(['liznumber']).daysSinceCapture.cumcount()+1

In [None]:
df_numbered.species.unique()

In [None]:
print(df_numbered.loc[df_numbered.species.isin(['j','v'])].groupby('capture').capture.count())

In [None]:
data = [go.Histogram(x = df_numbered.groupby('liznumber').capture.max())]
py.iplot(data, filename = 'Frequency of Captures in Crystal Creek 2000 - 2017')

In [None]:
# One point per lizard: x and y are both taken from the same per-lizard aggregate so that
# they have equal length and every number is paired with its own maximum.
max_days_by_lizard = df_numbered.groupby('liznumber').daysSinceCapture.max()
lizards = [go.Scatter(x = max_days_by_lizard.index,
                   y = max_days_by_lizard.values,
                     mode = 'markers')]
# year1 = [go.scatter.Line(y = 365)]
# year2 = [go.scatter.Line(y = 365*2)]
# year3 = [go.scatter.Line(y = 365*3)]
# year4 = [go.scatter.Line(y = 365*4)]
# year5 = [go.scatter.Line(y = 365*5)]
# year6 = [go.scatter.Line(y = 365*6)]
# year7 = [go.scatter.Line(y = 365*7)]
# year8 = [go.scatter.Line(y = 365*8)]

# data = [lizards, year1, year2, year3, year4, year5, year6, year7, year8]
data = lizards
layout = go.Layout(
    title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017',
        titlefont = dict(
            size = 20),
    xaxis = dict(
            title='Lizard Number',
            titlefont=dict(
                size=18)),
    yaxis = dict(
            title='Greatest Number of Days Since<br> Initial Capture',
            titlefont=dict(
                size=18)))

fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename = 'Days Since Initial Capture in Crystal Creek 2000 - 2017')

In [None]:
dfF = df_numbered.loc[df_numbered.sex =='f']
dfM = df_numbered.loc[df_numbered.sex =='m']

In [None]:
# Per-lizard maxima; x and y of each trace come from the same aggregate so that every
# lizard number is paired with its own value (one point per lizard).
female_max_days = dfF.groupby('liznumber').daysSinceCapture.max()
male_max_days = dfM.groupby('liznumber').daysSinceCapture.max()

females = go.Scatter(
    x = female_max_days.index,
    y = female_max_days.values,
    name = 'females',
    mode = 'markers',
    marker = dict(
        color = 'rgba(152, 0, 0, .8)',
        opacity = 0.75,
        line = dict(
            width = 2,
            color = 'rgb(0, 0, 0)'
        )
    )
)

males = go.Scatter(
    x = male_max_days.index,
    y = male_max_days.values,
    name = 'males',
    mode = 'markers',
    marker = dict(
        color = 'rgba(255, 182, 193, .9)',
        opacity = 0.75,
        line = dict(
            width = 2,
        )
    )
)

data = [females, males]

layout = dict(title = 'Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex',
              yaxis = dict(
                  title='Greatest Number of Days Since<br> Initial Capture',
                  titlefont=dict(
                      size=18)
              ),
              xaxis = dict(zeroline = False)
             )

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='Days Since Initial Capture in Crystal Creek 2000 - 2017 By Sex')

In [None]:
# The row filter must come from df_numbered itself: df has a different index and row set,
# so a mask built from df.sex does not line up with df_numbered.
males = go.Histogram(x = df_numbered.loc[df_numbered.sex == 'm','capture'],opacity= 0.75,name='males')
females = go.Histogram(x = df_numbered.loc[df_numbered.sex == 'f','capture'], opacity= 0.75, name = 'females')
data = [males,females]
py.iplot(data, filename = 'Frequency of Captures by Sex in Crystal Creek 2000 - 2017')

## QC of Capture number and Recap status

In [None]:
df_numbered.columns

In [None]:
recapQuestion=df_numbered.loc[(df_numbered.capture==1 )& (df_numbered["new.recap"]=='recap'),:]
print("There are {} instances in rows for which a lizard appears to have only one capture, but is listed as a recap.\
The distribution of these across years in the sample is as follows:\n{}."\
      .format(recapQuestion.shape[0],recapQuestion.year.value_counts()))
recapQuestion.to_csv("Questionable recaptures.csv")#These individuals need to be rechecked in the raw notes
recapQuestion.head()

In [None]:
recapQuestion.loc[recapQuestion.svl<54,:]

Now we export the cleaned data to a csv

In [None]:
df_numbered = df_numbered.rename(index = str, columns = {'new.recap':'newRecap'})
# QC helper columns (names containing 'force' or 'drop') are not part of the export
qc_drop_cols = df_numbered.columns[df_numbered.columns.str.contains('force|drop')]
# keyword form: the positional ``axis`` argument of drop was removed in pandas 2.0
df_full = df_numbered.drop(columns=qc_drop_cols)

In [None]:
timestamp = (pd.to_datetime('now')-pd.Timedelta(hours=4))
timestamp = str(timestamp).replace(':','_')
#path=''C:\\Users\\Christopher\\Google Drive\\TailDemography\\outputFiles\\''
# path=outputBig
filename = 'cleaned CC data 2000-2017_' + timestamp+ '.csv'
# filename = path + '/cleaned CC data 2000-2017' + '.csv'
df_full.to_csv(filename,index = False)
filename