In [26]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

# ACQUIRE
I'll begin by acquiring the data from a local CSV file.

In [2]:
# Read the shelter outcome records from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='aac_shelter_outcomes.csv')

# Preview the first five rows.
df.head(5)

Unnamed: 0,age_upon_outcome,animal_id,animal_type,breed,color,date_of_birth,datetime,monthyear,name,outcome_subtype,outcome_type,sex_upon_outcome
0,2 weeks,A684346,Cat,Domestic Shorthair Mix,Orange Tabby,2014-07-07T00:00:00,2014-07-22T16:04:00,2014-07-22T16:04:00,,Partner,Transfer,Intact Male
1,1 year,A666430,Dog,Beagle Mix,White/Brown,2012-11-06T00:00:00,2013-11-07T11:47:00,2013-11-07T11:47:00,Lucy,Partner,Transfer,Spayed Female
2,1 year,A675708,Dog,Pit Bull,Blue/White,2013-03-31T00:00:00,2014-06-03T14:20:00,2014-06-03T14:20:00,*Johnny,,Adoption,Neutered Male
3,9 years,A680386,Dog,Miniature Schnauzer Mix,White,2005-06-02T00:00:00,2014-06-15T15:50:00,2014-06-15T15:50:00,Monday,Partner,Transfer,Neutered Male
4,5 months,A683115,Other,Bat Mix,Brown,2014-01-07T00:00:00,2014-07-07T14:04:00,2014-07-07T14:04:00,,Rabies Risk,Euthanasia,Unknown


# PREPARE
I'm now going to prepare the data for exploration.
***

In [3]:
# Summarise column dtypes and non-null counts to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78256 entries, 0 to 78255
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   age_upon_outcome  78248 non-null  object
 1   animal_id         78256 non-null  object
 2   animal_type       78256 non-null  object
 3   breed             78256 non-null  object
 4   color             78256 non-null  object
 5   date_of_birth     78256 non-null  object
 6   datetime          78256 non-null  object
 7   monthyear         78256 non-null  object
 8   name              54370 non-null  object
 9   outcome_subtype   35963 non-null  object
 10  outcome_type      78244 non-null  object
 11  sex_upon_outcome  78254 non-null  object
dtypes: object(12)
memory usage: 7.2+ MB


After reviewing the data and my goals for this project, I'll need to make the following changes:
- Drop the following columns
    - animal_id
        - Unique IDs won't be useful in predicting outcomes
    - breed
        - Highly variable, categorical column. 
        - At a later time I'd like to bin these values and explore them for statistical significance
    - color
        - Highly variable, categorical column. 
        - At a later time I'd like to bin these values and explore them for statistical significance
    - date_of_birth
        - Since I have the age of each animal at the time of outcome, I won't need to calculate their age using this variable
    - datetime, monthyear
        - Duplicate columns that presumably reflect when a record was entered and are thus not useful for predicting outcomes
    - outcome_subtype
        - Gives details about outcome (example: why was an animal euthanised?)
        - May prove useful but appears to only be known when the outcome is known (example: (outcome_subtype partner) (outcome_type: transfer) could only be known when an animal is set for transfer to a partner)
        - The value of this project is to create a model that will predict if an animal will be adopted, so outcome_subtype probably wouldn't be set until outcome is set, so I'm going to drop it for now
    - Any columns that are split into categorical columns and no longer needed
- Rename columns to better reflect the data they represent
- Bin age_upon_outcome into categorical columns 
    - < 1 year
    - 1-2 years old
    - 2 or more yrs old
- Convert animal type to categorical columns
    - is_cat
    - is_dog
- Transform outcome_type to binary categorical column 'adopted' (1 = yes, 0 = no)
- Split sex_upon_outcome into categorical columns for neutered/spayed and sex
    - is_neutered_or_spayed
    - is_male
    - is_female
    - sex_unknown
- Drop null values
    - Very few nulls so dropping won't have a meaningful impact

### Dropping columns and nulls

In [4]:
# Keep only the columns used for modelling, then drop rows with any missing
# value. Chaining returns a fresh frame instead of calling an in-place method
# on a column subset of the original, so later column assignments on `df`
# are unambiguous.
df = df[['age_upon_outcome', 'animal_type', 'sex_upon_outcome', 'outcome_type']].dropna()

### Binning age data

In [5]:
# Frequency of each raw age string, including any missing values.
df['age_upon_outcome'].value_counts(dropna=False)

1 year       14352
2 years      11194
2 months      9213
3 years       5157
3 months      3442
1 month       3341
4 years       2990
5 years       2691
4 months      2425
5 months      1951
6 months      1897
6 years       1810
8 years       1554
7 years       1536
3 weeks       1467
2 weeks       1330
10 months     1203
4 weeks       1193
8 months      1178
10 years      1158
7 months       963
9 years        822
9 months       673
12 years       609
1 weeks        513
11 months      490
11 years       429
1 week         427
13 years       389
14 years       253
3 days         235
2 days         217
15 years       208
1 day          153
6 days         152
4 days         136
5 days         116
16 years       101
0 years         94
5 weeks         61
17 years        58
18 years        26
19 years        13
20 years        12
22 years         4
25 years         1
Name: age_upon_outcome, dtype: int64

In [6]:
# Split each age string (e.g. "2 weeks", "1 year", "11 years") into its number
# and unit so the bins are decided on parsed values rather than substrings.
# Substring matching mislabels several values that occur in this column:
#   - "1 day" does not contain "days", so it landed in no bin at all
#   - "11 years" contains "1 year", so it was flagged as 1-2 years old
#   - "0 years" contains "years", so it was flagged as 2 or more years old
age_parts = df.age_upon_outcome.str.extract(r'^\s*(\d+)\s*(day|week|month|year)s?\s*$')
age_count = pd.to_numeric(age_parts[0], errors='coerce')
age_unit = age_parts[1]

# Rows whose age string does not match the pattern get NaN here, which makes
# every comparison below False, so they end up with 0 in all three bins.
in_years = age_unit == 'year'
in_sub_year_unit = age_unit.isin(['day', 'week', 'month'])

# Under one year: any age expressed in days/weeks/months, plus "0 years".
df['age_under_1'] = np.where(in_sub_year_unit | (in_years & (age_count < 1)), 1, 0)

# Between one and two years: exactly "1 year".
df['age_between_1_and_2'] = np.where(in_years & (age_count == 1), 1, 0)

# Two or more years.
df['age_2_or_more'] = np.where(in_years & (age_count >= 2), 1, 0)

### animal type

In [7]:
# How many records there are for each animal type.
df['animal_type'].value_counts()

Dog          44234
Cat          29418
Other         4243
Bird           333
Livestock        9
Name: animal_type, dtype: int64

In [8]:
# Fold the few 'Livestock' rows into 'Other', then lowercase the labels so the
# generated indicator columns are named is_bird / is_cat / is_dog / is_other.
df['animal_type'] = df['animal_type'].replace({'Livestock': 'Other'}).str.lower()

# One indicator column per animal type, stored as 0/1 integers so the dtype
# matches the other engineered flags regardless of pandas version.
a_type = pd.get_dummies(df['animal_type'], prefix='is', dtype=int)

# Drop any indicator columns left over from a previous run of this cell before
# attaching the new ones, so re-running does not create duplicate columns.
df = pd.concat([df.drop(columns=a_type.columns, errors='ignore'), a_type], axis=1)

### sex upon outcome

In [9]:
# Distribution of the combined sex / sterilisation labels.
df['sex_upon_outcome'].value_counts()

Neutered Male    27782
Spayed Female    25203
Intact Male       9544
Intact Female     9139
Unknown           6569
Name: sex_upon_outcome, dtype: int64

In [10]:
# Build one 0/1 flag per keyword found in the sex label. Matching is
# case-sensitive, so 'Male' does not match inside 'Female'.
sex_label = df['sex_upon_outcome']
for flag_column, keyword in (('is_male', 'Male'),
                             ('is_female', 'Female'),
                             ('sex_unknown', 'Unknown')):
    df[flag_column] = np.where(sex_label.str.contains(keyword), 1, 0)


In [11]:
# 1 when the label mentions either sterilisation term, 0 otherwise.
mentions_neutered = df['sex_upon_outcome'].str.contains('Neutered')
mentions_spayed = df['sex_upon_outcome'].str.contains('Spayed')
df['is_neutered_or_spayed'] = np.where(mentions_neutered | mentions_spayed, 1, 0)

### outcome_type (target variable)

In [12]:
# Distribution of the raw outcome labels the target is derived from.
df['outcome_type'].value_counts()

Adoption           33112
Transfer           23497
Return to Owner    14353
Euthanasia          6076
Died                 680
Disposal             307
Rto-Adopt            150
Missing               46
Relocate              16
Name: outcome_type, dtype: int64

In [13]:
# Target: 1 when the outcome label contains 'Adopt', else 0.
# Both 'Adoption' and 'Rto-Adopt' match this substring.
outcome_mentions_adopt = df['outcome_type'].str.contains('Adopt')
df['adopted'] = np.where(outcome_mentions_adopt, 1, 0)

In [None]:
# TODO(unfinished): "alternate age c" - incomplete note left in this code cell;
# kept as a comment so the cell no longer raises SyntaxError on Run All.

### Copy original df, split copy, drop old columns

In [14]:
# Work on an independent copy so the prepared frame `df` is left untouched.
dfp = df.copy(deep=True)

In [15]:
# Keep only the engineered indicator columns plus the target.
model_columns = [
    'age_under_1', 'age_between_1_and_2', 'age_2_or_more',
    'is_bird', 'is_cat', 'is_dog', 'is_other',
    'is_male', 'is_female', 'sex_unknown',
    'is_neutered_or_spayed',
    'adopted',
]
dfp = dfp.loc[:, model_columns]

In [18]:
def split_data(df, test_size=.2, validate_size=.3, random_state=123):
    """
    Split a DataFrame into three DataFrames: train, validate, and test.

    The test set is carved off first; the validate set is then taken from the
    rows that remain. `validate_size` is therefore a fraction of the non-test
    rows, not of the full frame. With the defaults the result is roughly
    56% train / 24% validate / 20% test.

    Parameters
    ----------
    df : DataFrame to split.
    test_size : `test_size` passed to the first train_test_split call.
    validate_size : `test_size` passed to the second train_test_split call.
    random_state : seed passed to both calls so the split is reproducible.

    Returns
    -------
    tuple of (train, validate, test)
    """
    # first cut: hold out the test rows
    train_validate, test = train_test_split(
        df, test_size=test_size, random_state=random_state)

    # second cut: divide the remaining rows into train and validate
    train, validate = train_test_split(
        train_validate, test_size=validate_size, random_state=random_state)

    return train, validate, test


In [19]:
# Split the modelling frame into train / validate / test sets.
train, validate, test = split_data(dfp)

# Explore

# Model
I'll now create several models that will predict whether an animal was adopted based on its various features.
***

## Baseline

In [28]:
# Class balance of the target in the training split.
train['adopted'].value_counts()

0    25152
1    18660
Name: adopted, dtype: int64

In [29]:
# Baseline: predict class 0 for every row (0 is the more frequent value of `adopted` in train).
train['baseline_pred'] = 0

In [30]:
# Precision / recall / F1 of the constant baseline against the true labels.
baseline_report = classification_report(train['adopted'], train['baseline_pred'])
print(baseline_report)

              precision    recall  f1-score   support

           0       0.57      1.00      0.73     25152
           1       0.00      0.00      0.00     18660

    accuracy                           0.57     43812
   macro avg       0.29      0.50      0.36     43812
weighted avg       0.33      0.57      0.42     43812



# TRAIN

## M1

In [32]:
# Target: the 0/1 `adopted` column.
y_train = train['adopted']

# Model 1 uses every engineered indicator column as a predictor.
X_train_m1 = train.loc[:, [
    'age_under_1', 'age_between_1_and_2', 'age_2_or_more',
    'is_bird', 'is_cat', 'is_dog', 'is_other',
    'is_male', 'is_female', 'sex_unknown',
    'is_neutered_or_spayed',
]]

# Create the logistic regression and fit it in one step (fit returns the estimator).
lr_m1 = LogisticRegression(random_state=123).fit(X_train_m1, y_train)

# Store the in-sample predictions next to the true labels.
train['m1_prediction'] = lr_m1.predict(X_train_m1)

# Performance on the training split.
print(classification_report(train['adopted'], train['m1_prediction']))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     25152
           1       0.75      0.65      0.70     18660

    accuracy                           0.76     43812
   macro avg       0.76      0.74      0.75     43812
weighted avg       0.76      0.76      0.76     43812



In [31]:
# List the columns currently present on the training frame.
train.columns

Index(['age_under_1', 'age_between_1_and_2', 'age_2_or_more', 'is_bird',
       'is_cat', 'is_dog', 'is_other', 'is_male', 'is_female', 'sex_unknown',
       'is_neutered_or_spayed', 'adopted', 'baseline_pred'],
      dtype='object')

# VALIDATE

In [34]:
# Target for the validate split.
y_val = validate['adopted']

# Same predictor columns model 1 was fitted on.
X_val_m1 = validate.loc[:, [
    'age_under_1', 'age_between_1_and_2', 'age_2_or_more',
    'is_bird', 'is_cat', 'is_dog', 'is_other',
    'is_male', 'is_female', 'sex_unknown',
    'is_neutered_or_spayed',
]]

# Predict with the already-fitted model; no refitting on validate.
validate['m1_prediction'] = lr_m1.predict(X_val_m1)

# Performance on the validate split.
print(classification_report(validate['adopted'], validate['m1_prediction']))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80     10815
           1       0.75      0.65      0.69      7962

    accuracy                           0.76     18777
   macro avg       0.76      0.74      0.75     18777
weighted avg       0.76      0.76      0.75     18777



# TEST

In [35]:
# Target for the test split.
y_test = test['adopted']

# Same predictor columns model 1 was fitted on.
X_test_m1 = test.loc[:, [
    'age_under_1', 'age_between_1_and_2', 'age_2_or_more',
    'is_bird', 'is_cat', 'is_dog', 'is_other',
    'is_male', 'is_female', 'sex_unknown',
    'is_neutered_or_spayed',
]]

# Predict with the already-fitted model; no refitting on test.
test['m1_prediction'] = lr_m1.predict(X_test_m1)

# Performance on the held-out test split.
print(classification_report(test['adopted'], test['m1_prediction']))

              precision    recall  f1-score   support

           0       0.76      0.84      0.80      9008
           1       0.75      0.64      0.69      6640

    accuracy                           0.76     15648
   macro avg       0.76      0.74      0.75     15648
weighted avg       0.76      0.76      0.76     15648

