# Outcomes data

In [1]:
import pandas as pd
import numpy as np

outcomes_df = pd.read_csv("aac_outcomes.csv")
outcomes_df.drop(['date_of_birth','monthyear','outcome_subtype'], axis = 1, inplace = True)
outcomes_df = outcomes_df[outcomes_df['animal_type'].isin(['Cat', 'Dog'])]
# Reset index after filtering
outcomes_df = outcomes_df.reset_index(drop=True)

# print(outcomes_df)
print(outcomes_df.shape)

(163904, 9)


In [2]:
outcomes_df.columns # We will drop animal_id later + maybe datetime as well as it leaks data + sex_upon_outcome and age_upon_outcome as well since that datais not available at the intake where the shelter wants to predict

Index(['animal_id', 'name', 'outcome_type', 'animal_type', 'sex_upon_outcome',
       'age_upon_outcome', 'breed', 'color', 'datetime'],
      dtype='object')

In [4]:
outcomes_df.isnull().sum()

animal_id               0
name                41759
outcome_type           39
animal_type             0
sex_upon_outcome        1
age_upon_outcome        5
breed                   0
color                   0
datetime                0
dtype: int64

In [8]:
# outcomes_df['animal_id'].value_counts() # We will keep the latest outcome one

In [11]:
# outcomes_df['name'].nunique()

In [12]:
# outcomes_df['outcome_type'].value_counts()

In [14]:
# outcomes_df['animal_type'].value_counts()

In [16]:
# outcomes_df['sex_upon_outcome'].value_counts()

In [17]:
# outcomes_df['age_upon_outcome'].value_counts()

In [18]:
# outcomes_df['breed'].nunique()

In [19]:
# outcomes_df['color'].nunique()

In [12]:
# outcomes_df['datetime'] # We need to conver this to datetime object and the format should be consistent accross the data

0         2015-07-02T00:00:00-05:00
1         2016-06-20T00:00:00-05:00
2         2015-11-11T00:00:00-05:00
3         2015-06-30T00:00:00-05:00
4         2015-11-17T00:00:00-05:00
                    ...            
163899          2025-05-04T12:48:00
163900          2025-05-01T16:54:00
163901          2025-05-04T17:53:00
163902          2025-05-04T14:18:00
163903          2025-05-05T00:00:00
Name: datetime, Length: 163904, dtype: object

In [20]:
import pandas as pd
import dateutil

def parse_any_datetime(val, default_tz="UTC"):
    try:
        dt = pd.to_datetime(val, utc=True)   # works for both offset and normal
        return dt
    except Exception:
        # Fallback: parse with dateutil and assign default timezone
        dt = dateutil.parser.parse(val)
        return dt.tz_localize(default_tz).tz_convert("UTC")

outcomes_df['datetime'] = outcomes_df['datetime'].astype(str).apply(parse_any_datetime)
outcomes_df['datetime'] = outcomes_df['datetime'].dt.tz_localize(None)

In [21]:
# outcomes_df['animal_id'].value_counts()

In [22]:
# outcomes_df[outcomes_df['animal_id'] == 'A721033']

In [23]:
# Step 1: Count number of visits per animal_id
visit_counts = outcomes_df['animal_id'].value_counts()

# Step 2: Keep only the latest record per animal_id
outcomes_latest = outcomes_df.sort_values(['animal_id', 'datetime']).groupby('animal_id').tail(1)

# Step 3: Add visit_count column
outcomes_latest['visit_count'] = outcomes_latest['animal_id'].map(visit_counts)

# Step 4: Reset index for neatness
outcomes_latest = outcomes_latest.reset_index(drop=True)

In [24]:
outcomes_latest[outcomes_latest['animal_id'] == 'A721033']

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,age_upon_outcome,breed,color,datetime,visit_count
40454,A721033,Lil Bit,Rto-Adopt,Dog,Neutered Male,4 years,Rat Terrier Mix,Tricolor/Brown Brindle,2019-10-20 11:35:00,33


In [25]:
outcomes_latest['animal_id'].value_counts()

animal_id
A929710    1
A006100    1
A047759    1
A134067    1
A141142    1
          ..
A212672    1
A210457    1
A208755    1
A200922    1
A197810    1
Name: count, Length: 146409, dtype: int64

In [26]:
outcomes_latest['datetime']

0        2017-12-07 05:00:00
1        2014-04-07 15:12:00
2        2013-11-16 11:54:00
3        2013-11-17 11:40:00
4        2014-11-14 19:28:00
                 ...        
146404   2025-05-02 18:44:00
146405   2025-05-02 18:43:00
146406   2025-05-03 13:19:00
146407   2025-05-03 11:27:00
146408   2025-05-04 17:13:03
Name: datetime, Length: 146409, dtype: datetime64[ns]

In [27]:
outcomes_latest.columns 

Index(['animal_id', 'name', 'outcome_type', 'animal_type', 'sex_upon_outcome',
       'age_upon_outcome', 'breed', 'color', 'datetime', 'visit_count'],
      dtype='object')

In [28]:
outcomes_latest

Unnamed: 0,animal_id,name,outcome_type,animal_type,sex_upon_outcome,age_upon_outcome,breed,color,datetime,visit_count
0,A006100,Scamp,Return to Owner,Dog,Neutered Male,10 years,Spinone Italiano Mix,Yellow/White,2017-12-07 05:00:00,3
1,A047759,Oreo,Transfer,Dog,Neutered Male,10 years,Dachshund,Tricolor,2014-04-07 15:12:00,1
2,A134067,Bandit,Return to Owner,Dog,Neutered Male,16 years,Shetland Sheepdog,Brown/White,2013-11-16 11:54:00,1
3,A141142,Bettie,Return to Owner,Dog,Spayed Female,15 years,Labrador Retriever/Pit Bull,Black/White,2013-11-17 11:40:00,1
4,A163459,Sasha,Return to Owner,Dog,Intact Female,15 years,Miniature Schnauzer Mix,Black/Gray,2014-11-14 19:28:00,1
...,...,...,...,...,...,...,...,...,...,...
146404,A929599,,Transfer,Cat,Unknown,4 days,Domestic Shorthair,Brown Tabby,2025-05-02 18:44:00,1
146405,A929600,,Transfer,Cat,Unknown,4 days,Domestic Shorthair,Brown Tabby,2025-05-02 18:43:00,1
146406,A929602,,Transfer,Cat,Unknown,1 year,Domestic Shorthair,Black,2025-05-03 13:19:00,1
146407,A929631,Fiona,Euthanasia,Dog,Spayed Female,16 years,Chihuahua Shorthair/Dachshund,Tan,2025-05-03 11:27:00,1


# Intakes data

In [36]:
import pandas as pd
import numpy as np
intakes_df = pd.read_csv("aac_intakes.csv")

In [37]:
intakes_df.drop(['datetime2','found_location'], axis = 1, inplace = True)
intakes_df = intakes_df[intakes_df['animal_type'].isin(['Cat', 'Dog'])]
# Reset index after filtering
intakes_df = intakes_df.reset_index(drop=True)

print(intakes_df)
print(intakes_df.shape)
print(intakes_df.columns)

                       datetime animal_id     name      intake_type  \
0       2013-10-01T16:10:00.000   A664308    Bruce            Stray   
1       2013-10-11T18:13:00.000   A664968    Simba            Stray   
2       2013-10-14T11:23:00.000   A665140      NaN            Stray   
3       2013-10-01T15:58:00.000   A659642  *Trevor  Owner Surrender   
4       2013-10-04T11:05:00.000   A664489   Nikole            Stray   
...                         ...       ...      ...              ...   
163927  2025-04-18T08:44:00.000   A928711      Red            Stray   
163928  2025-04-29T14:05:00.000   A929392      NaN            Stray   
163929  2025-04-15T14:13:00.000   A928558  *Murphy            Stray   
163930  2025-05-02T22:01:00.000   A929633      NaN            Stray   
163931  2025-04-23T16:53:00.000   A926662   Pancho    Public Assist   

       intake_condition animal_type sex_upon_intake age_upon_intake  \
0                Normal         Dog     Intact Male         6 years   
1    

In [38]:
# intakes_df['animal_id'].value_counts()

In [39]:
intakes_df['datetime'] = intakes_df['datetime'].astype(str).apply(parse_any_datetime)
intakes_df['datetime'] = intakes_df['datetime'].dt.tz_localize(None)

In [40]:
# intakes_df[intakes_df['animal_id'] == 'A721033']

In [41]:
# Keep only the latest outcome for each animal_id
intakes_df = intakes_df.sort_values('datetime')  # sort by datetime
intakes_df = intakes_df.groupby('animal_id').tail(1)  # keep last row per animal_id

# Reset index for neatness
intakes_latest = intakes_df.reset_index(drop=True)

print(intakes_latest.shape)
print(intakes_latest.head())

(146450, 10)
             datetime animal_id    name intake_type intake_condition  \
0 2013-10-01 07:51:00   A521520    Nina       Stray           Normal   
1 2013-10-01 08:33:00   A664237     NaN       Stray           Normal   
2 2013-10-01 08:33:00   A664236     NaN       Stray           Normal   
3 2013-10-01 08:33:00   A664235     NaN       Stray           Normal   
4 2013-10-01 08:53:00   A664233  Stevie       Stray          Injured   

  animal_type sex_upon_intake age_upon_intake                         breed  \
0         Dog   Spayed Female         7 years  Border Terrier/Border Collie   
1         Cat         Unknown          1 week        Domestic Shorthair Mix   
2         Cat         Unknown          1 week        Domestic Shorthair Mix   
3         Cat         Unknown          1 week        Domestic Shorthair Mix   
4         Dog   Intact Female         3 years                  Pit Bull Mix   

          color  
0     White/Tan  
1  Orange/White  
2  Orange/White  
3  Oran

In [42]:
intakes_latest[intakes_latest['animal_id'] == 'A721033']

Unnamed: 0,datetime,animal_id,name,intake_type,intake_condition,animal_type,sex_upon_intake,age_upon_intake,breed,color
90312,2019-10-05 12:31:00,A721033,Lil Bit,Public Assist,Normal,Dog,Neutered Male,4 years,Rat Terrier Mix,Tricolor/Brown Brindle


In [43]:
# intakes_latest['animal_id'].value_counts()

animal_id
A929635    1
A521520    1
A929699    1
A929697    1
A929690    1
          ..
A664234    1
A664238    1
A664233    1
A664235    1
A664236    1
Name: count, Length: 146450, dtype: int64

In [30]:
# intakes_latest['datetime']

0        2013-10-01 07:51:00
1        2013-10-01 08:33:00
2        2013-10-01 08:33:00
3        2013-10-01 08:33:00
4        2013-10-01 08:53:00
                 ...        
146445   2025-05-04 12:43:00
146446   2025-05-04 13:15:00
146447   2025-05-04 15:14:00
146448   2025-05-04 22:55:00
146449   2025-05-04 23:44:00
Name: datetime, Length: 146450, dtype: datetime64[ns]

In [44]:
outcomes_latest.columns

Index(['animal_id', 'name', 'outcome_type', 'animal_type', 'sex_upon_outcome',
       'age_upon_outcome', 'breed', 'color', 'datetime', 'visit_count'],
      dtype='object')

In [45]:
intakes_latest.columns

Index(['datetime', 'animal_id', 'name', 'intake_type', 'intake_condition',
       'animal_type', 'sex_upon_intake', 'age_upon_intake', 'breed', 'color'],
      dtype='object')

# Merge outcomes and intakes:

In [46]:
merged_df = pd.merge(
    outcomes_latest,
    intakes_latest,
    on="animal_id",
    how="inner",
    suffixes=("_outcome", "_intake")
)

In [47]:
print(merged_df)

       animal_id name_outcome     outcome_type animal_type_outcome  \
0        A006100        Scamp  Return to Owner                 Dog   
1        A047759         Oreo         Transfer                 Dog   
2        A134067       Bandit  Return to Owner                 Dog   
3        A141142       Bettie  Return to Owner                 Dog   
4        A163459        Sasha  Return to Owner                 Dog   
...          ...          ...              ...                 ...   
145598   A929599          NaN         Transfer                 Cat   
145599   A929600          NaN         Transfer                 Cat   
145600   A929602          NaN         Transfer                 Cat   
145601   A929631        Fiona       Euthanasia                 Dog   
145602   A929710          NaN       Euthanasia                 Cat   

       sex_upon_outcome age_upon_outcome                  breed_outcome  \
0         Neutered Male         10 years           Spinone Italiano Mix   
1        

In [190]:
merged_df.columns

Index(['animal_id', 'name_outcome', 'outcome_type', 'animal_type_outcome',
       'sex_upon_outcome', 'age_upon_outcome', 'breed_outcome',
       'color_outcome', 'datetime_outcome', 'visit_count', 'datetime_intake',
       'name_intake', 'intake_type', 'intake_condition', 'animal_type_intake',
       'sex_upon_intake', 'age_upon_intake', 'breed_intake', 'color_intake'],
      dtype='object')

In [191]:
# merged_df[['animal_id', 'name_outcome', 'name_intake']] # I have checked all names match - We can drop the name from the outcomes table

In [192]:
# merged_df[['animal_id', 'animal_type_outcome', 'animal_type_intake']] # I have checked all animal_types match - We can drop the outcome one

In [48]:
# merged_df[['animal_id', 'sex_upon_outcome', 'sex_upon_intake']]
# Compare sex_upon_outcome vs sex_upon_intake (treat NaN == NaN as True)
sex_equal = merged_df['sex_upon_outcome'].fillna("MISSING") == merged_df['sex_upon_intake'].fillna("MISSING")
# Check overall result
print("All equal?", sex_equal.all())
# Count mismatches
print("Number of mismatches:", (~sex_equal).sum())
# See mismatched rows
mismatches = merged_df.loc[~sex_equal, ['animal_id', 'sex_upon_outcome', 'sex_upon_intake']]
print(mismatches.head(10))

# They are changing since at the time of intake some ar intact and later they get spayed.
# For now we can keep both of these variables and then later we can drop the sex_upon_outcome since we can't really use it for prediction

All equal? False
Number of mismatches: 63010
    animal_id sex_upon_outcome sex_upon_intake
607   A501076    Spayed Female   Intact Female
609   A501255    Neutered Male     Intact Male
639   A504813    Spayed Female   Intact Female
672   A510475    Neutered Male     Intact Male
774   A524634    Spayed Female   Intact Female
821   A530122    Spayed Female   Intact Female
862   A533106    Neutered Male     Intact Male
898   A536136    Spayed Female   Intact Female
912   A538342    Spayed Female   Intact Female
967   A544105    Neutered Male     Intact Male


In [194]:
# merged_df['sex_upon_intake'].value_counts()

In [195]:
# merged_df[['animal_id', 'age_upon_intake', 'age_upon_outcome']]

In [49]:
cols_to_drop = ['name_outcome', 'animal_type_outcome', 'breed_outcome', 'color_outcome']
merged_df = merged_df.drop(columns=cols_to_drop)
print(merged_df.columns)

Index(['animal_id', 'outcome_type', 'sex_upon_outcome', 'age_upon_outcome',
       'datetime_outcome', 'visit_count', 'datetime_intake', 'name_intake',
       'intake_type', 'intake_condition', 'animal_type_intake',
       'sex_upon_intake', 'age_upon_intake', 'breed_intake', 'color_intake'],
      dtype='object')


In [50]:
merged_df = merged_df.rename(columns={
    'name_intake': 'name',
    'animal_type_intake': 'animal_type',
    'breed_intake': 'breed',
    'color_intake': 'color'
})
print(merged_df.columns)

Index(['animal_id', 'outcome_type', 'sex_upon_outcome', 'age_upon_outcome',
       'datetime_outcome', 'visit_count', 'datetime_intake', 'name',
       'intake_type', 'intake_condition', 'animal_type', 'sex_upon_intake',
       'age_upon_intake', 'breed', 'color'],
      dtype='object')


In [51]:
final_order = [
    'animal_id', 'name', 'sex_upon_outcome', 'sex_upon_intake',
    'age_upon_outcome', 'age_upon_intake',
    'datetime_outcome', 'datetime_intake',
    'visit_count', 'intake_type', 'intake_condition',
    'animal_type', 'breed', 'color', 'outcome_type'
]

merged_df = merged_df[final_order]

print(merged_df.columns)

Index(['animal_id', 'name', 'sex_upon_outcome', 'sex_upon_intake',
       'age_upon_outcome', 'age_upon_intake', 'datetime_outcome',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type'],
      dtype='object')


In [52]:
# Keep only rows where age_at_intake <= age_at_outcome
merged_df = merged_df[merged_df['age_upon_intake'] <= merged_df['age_upon_outcome']]

# Now drop the outcome-only columns
merged_df = merged_df.drop(
    ['age_upon_outcome', 'datetime_outcome', 'sex_upon_outcome'],
    axis=1
).reset_index(drop=True)

In [53]:
print(merged_df.columns)
print(merged_df.shape)

Index(['animal_id', 'name', 'sex_upon_intake', 'age_upon_intake',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type'],
      dtype='object')
(135235, 12)


In [54]:
# Save merged_df to CSV
merged_df.to_csv("merged_outcomes_intakes.csv", index=False)

# Ignore the script below

# Analysis:

In [52]:
import pandas as pd
merged_df = pd.read_csv('merged_outcomes_intakes.csv')

In [53]:
merged_df

Unnamed: 0,animal_id,name,sex_upon_intake,age_upon_intake,datetime_intake,visit_count,intake_type,intake_condition,animal_type,breed,color,outcome_type
0,A006100,Scamp,Neutered Male,10 years,2017-12-07 14:07:00,3,Stray,Normal,Dog,Spinone Italiano Mix,Yellow/White,Return to Owner
1,A047759,Oreo,Neutered Male,10 years,2014-04-02 15:55:00,1,Owner Surrender,Normal,Dog,Dachshund,Tricolor,Transfer
2,A134067,Bandit,Neutered Male,16 years,2013-11-16 09:02:00,1,Public Assist,Injured,Dog,Shetland Sheepdog,Brown/White,Return to Owner
3,A141142,Bettie,Spayed Female,15 years,2013-11-16 14:46:00,1,Stray,Aged,Dog,Labrador Retriever/Pit Bull,Black/White,Return to Owner
4,A163459,Sasha,Intact Female,15 years,2014-11-14 15:11:00,1,Stray,Normal,Dog,Miniature Schnauzer Mix,Black/Gray,Return to Owner
...,...,...,...,...,...,...,...,...,...,...,...,...
135230,A929598,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer
135231,A929599,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer
135232,A929600,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer
135233,A929602,,Unknown,1 year,2025-05-02 12:22:00,1,Stray,Normal,Cat,Domestic Shorthair,Black,Transfer


In [54]:
print(merged_df.isnull().sum())

# name - 38897 missing values - ~28.7%
# sex_upon_intake - 1 missing value
# outcome_type - 25 missing values

animal_id               0
name                38897
sex_upon_intake         1
age_upon_intake         0
datetime_intake         0
visit_count             0
intake_type             0
intake_condition        0
animal_type             0
breed                   0
color                   0
outcome_type           25
dtype: int64


In [55]:
merged_df.dropna(subset=["outcome_type"], inplace=True)

In [56]:
merged_df['breed'].nunique()

2716

In [57]:
merged_df['color'].nunique()

616

In [58]:
merged_df['outcome_type'].value_counts()

outcome_type
Adoption           64328
Transfer           44280
Return to Owner    19631
Euthanasia          4592
Died                1178
Rto-Adopt            897
Disposal             239
Missing               54
Relocate               8
Stolen                 2
Lost                   1
Name: count, dtype: int64

In [7]:
merged_df['sex_upon_intake'].value_counts()

sex_upon_intake
Intact Male      46904
Intact Female    46356
Neutered Male    19620
Spayed Female    16751
Unknown           5603
Name: count, dtype: int64

In [8]:
# Outcome Type Definitions (Austin Animal Center dataset)
#
# Adoption (64,328) 
#   - Animal was adopted into a new home from the shelter.
#
# Transfer (44,280) 
#   - Animal was moved to another organization (rescue, foster group, partner shelter).
#
# Return to Owner (19,631) 
#   - Lost pet was reunited with its original owner.
#
# Euthanasia (4,592) 
#   - Animal was humanely put down (severe illness, aggression, overcrowding).
#
# Died (1,178) 
#   - Animal died naturally while in shelter care (not euthanized).
#
# Disposal (239) 
#   - Animal found deceased and brought in for proper handling/disposal.
#
# Rto-Adopt (897) 
#   - Return-to-Owner followed immediately by adoption 
#     (e.g., owner surrendered rights and adopter took the animal).
#
# Missing (54) 
#   - Animal’s outcome is uncertain; shelter records show it went missing.
#
# Relocate (8) 
#   - Animal moved to a different facility/location (administrative, not a standard transfer).
#
# Stolen (2) 
#   - Animal reported stolen from shelter property or custody.
#
# Lost (1) 
#   - Animal lost while under shelter responsibility (rare case).
#
# Summary:
#   Positive outcomes → Adoption, Transfer, Return to Owner
#   Negative outcomes → Euthanasia, Died, Disposal, Missing, Stolen, Lost
#   Special/neutral outcomes → Rto-Adopt, Relocate


In [9]:
merged_df['intake_type'].value_counts()

intake_type
Stray                 96590
Owner Surrender       28930
Public Assist          7904
Abandoned              1575
Euthanasia Request      235
Wildlife                  1
Name: count, dtype: int64

In [10]:
merged_df['intake_condition'].value_counts()

intake_condition
Normal        115103
Injured         8310
Sick            5428
Nursing         3067
Neonatal        1584
Medical          509
Aged             478
Other            290
Pregnant         133
Feral            122
Med Attn          76
Behavior          60
Unknown           23
Med Urgent        21
Neurologic        11
Parvo             11
Space              4
Agonal             3
Panleuk            1
Congenital         1
Name: count, dtype: int64

# Feature engineering name column:

In [11]:
# # For analysis purpose:

# import pandas as pd

# # Get unique names, drop NaNs, sort them
# unique_names = sorted(merged_df['name'].dropna().unique())

# # Save to CSV
# pd.DataFrame(unique_names, columns=['Name']).to_csv("unique_names.csv", index=False)

# print(f"Saved {len(unique_names)} unique names to unique_names.csv")

In [12]:
# Has Name - 1, else 0

import pandas as pd
import re

def has_valid_name(name: str) -> int:
    """
    Return 1 if name looks like a valid given name, else 0.
    Implements the user-specified rules:
    - No digits, #, $, (, )
    - Length > 1
    - Not empty, NaN, or placeholders
    - Not generic group labels
    - Names starting with '*' are allowed
    """

    # Handle missing or NaN
    if pd.isna(name):
        return 0

    name_str = str(name).strip()
    if name_str == "" or name_str.isspace():
        return 0

    name_lower = name_str.lower()
        
    # if length <= 2 - return 0
    if len(name_lower) <= 2 :
        return 0
        
    # Preserve '*' names
    if name_str.startswith("*"):
        return 1

    # Rule 1: No numbers
    if any(char.isdigit() for char in name_lower):
        return 0

    # Rule 2: No # or $
    if "#" in name_lower or "$" in name_lower:
        return 0

    # Rule 3: No ( or )
    if "(" in name_lower or ")" in name_lower:
        return 0

    # Rule 4 & 5: invalid placeholders / generic labels
    invalid_terms = {
        "unknown", "unnamed", "no name", "nameless", "not named", "none",
        "stray", "animal", "dog", "puppy", "cat", "kitten",
        "male", "female", "litter", "deceased"
    }
    if name_lower in invalid_terms:
        return 0

    return 1

# Example usage:
# df["HasName"] = df["Name"].apply(has_valid_name)


In [13]:
# Applying this function to the name column:
merged_df["HasName"] = merged_df["name"].apply(has_valid_name)

In [14]:
# name_0 = merged_df[merged_df["HasName"] == 0]
# save = (name_0[['name','HasName']])
# save.to_csv("NoName.csv", index=False)

In [15]:
merged_df

Unnamed: 0,animal_id,name,sex_upon_intake,age_upon_intake,datetime_intake,visit_count,intake_type,intake_condition,animal_type,breed,color,outcome_type,HasName
0,A006100,Scamp,Neutered Male,10 years,2017-12-07 14:07:00,3,Stray,Normal,Dog,Spinone Italiano Mix,Yellow/White,Return to Owner,1
1,A047759,Oreo,Neutered Male,10 years,2014-04-02 15:55:00,1,Owner Surrender,Normal,Dog,Dachshund,Tricolor,Transfer,1
2,A134067,Bandit,Neutered Male,16 years,2013-11-16 09:02:00,1,Public Assist,Injured,Dog,Shetland Sheepdog,Brown/White,Return to Owner,1
3,A141142,Bettie,Spayed Female,15 years,2013-11-16 14:46:00,1,Stray,Aged,Dog,Labrador Retriever/Pit Bull,Black/White,Return to Owner,1
4,A163459,Sasha,Intact Female,15 years,2014-11-14 15:11:00,1,Stray,Normal,Dog,Miniature Schnauzer Mix,Black/Gray,Return to Owner,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
135230,A929598,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0
135231,A929599,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0
135232,A929600,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0
135233,A929602,,Unknown,1 year,2025-05-02 12:22:00,1,Stray,Normal,Cat,Domestic Shorthair,Black,Transfer,0


In [16]:
# Creating a feature indicating the length of the name: (Only for HasName = 1)

# Create NameLength column
merged_df["NameLength"] = merged_df.apply(
    lambda row: len(str(row["name"]).strip()) if row["HasName"] == 1 else 0,
    axis=1
)

# Quick check
print(merged_df[["name", "HasName", "NameLength"]].head(20))

# Summary for valid names only
print(merged_df.loc[merged_df["HasName"] == 1, "NameLength"].describe())

          name  HasName  NameLength
0        Scamp        1           5
1         Oreo        1           4
2       Bandit        1           6
3       Bettie        1           6
4        Sasha        1           5
5          Pep        1           3
6         Boti        1           4
7      Ophelia        1           7
8      Bri-Bri        1           7
9   Sassafrass        1          10
10      Carlos        1           6
11     Kootrie        1           7
12       Caleb        1           5
13        Cujo        1           4
14      Prissy        1           6
15     Guiness        1           7
16      Oliver        1           6
17      Cookie        1           6
18       Lippy        1           5
19       Jamie        1           5
count    91883.000000
mean         6.092520
std          2.026968
min          3.000000
25%          5.000000
50%          6.000000
75%          7.000000
max         30.000000
Name: NameLength, dtype: float64


In [17]:
merged_df

Unnamed: 0,animal_id,name,sex_upon_intake,age_upon_intake,datetime_intake,visit_count,intake_type,intake_condition,animal_type,breed,color,outcome_type,HasName,NameLength
0,A006100,Scamp,Neutered Male,10 years,2017-12-07 14:07:00,3,Stray,Normal,Dog,Spinone Italiano Mix,Yellow/White,Return to Owner,1,5
1,A047759,Oreo,Neutered Male,10 years,2014-04-02 15:55:00,1,Owner Surrender,Normal,Dog,Dachshund,Tricolor,Transfer,1,4
2,A134067,Bandit,Neutered Male,16 years,2013-11-16 09:02:00,1,Public Assist,Injured,Dog,Shetland Sheepdog,Brown/White,Return to Owner,1,6
3,A141142,Bettie,Spayed Female,15 years,2013-11-16 14:46:00,1,Stray,Aged,Dog,Labrador Retriever/Pit Bull,Black/White,Return to Owner,1,6
4,A163459,Sasha,Intact Female,15 years,2014-11-14 15:11:00,1,Stray,Normal,Dog,Miniature Schnauzer Mix,Black/Gray,Return to Owner,1,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135230,A929598,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0,0
135231,A929599,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0,0
135232,A929600,,Unknown,4 days,2025-05-02 12:22:00,1,Stray,Neonatal,Cat,Domestic Shorthair,Brown Tabby,Transfer,0,0
135233,A929602,,Unknown,1 year,2025-05-02 12:22:00,1,Stray,Normal,Cat,Domestic Shorthair,Black,Transfer,0,0


# Creating Features with age_upon_intake

In [18]:
merged_df['age_upon_intake'].value_counts()

age_upon_intake
2 years      21297
1 year       20592
1 month      16964
2 months      9523
3 years       7716
4 years       4741
3 months      4420
5 years       4331
4 months      4215
5 months      3457
3 weeks       3137
2 weeks       2899
6 years       2887
6 months      2653
7 years       2545
8 years       2439
10 years      2099
7 months      2059
4 weeks       1766
8 months      1545
1 week        1356
9 years       1310
1 weeks       1296
0 years       1120
10 months     1052
12 years      1030
1 day          956
11 years       815
9 months       729
13 years       654
2 days         628
3 days         621
14 years       493
15 years       429
11 months      374
4 days         270
6 days         238
16 years       182
5 days         151
17 years       101
18 years        56
5 weeks         28
19 years        27
20 years        22
22 years         6
-1 years         2
23 years         1
24 years         1
-4 years         1
-3 years         1
Name: count, dtype: int64

In [19]:
import pandas as pd
import re

def clean_and_parse_age_days(df, col="age_upon_intake"):
    """
    Clean anomalies, fix plural issues, and parse age_upon_intake into age_days.
    
    Steps:
    1. Drop anomalies with negative ages.
    2. Normalize plurals like '1 years' -> '1 year', '1 weeks' -> '1 week'.
    3. Convert all ages to days (integer).
    
    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe.
    col : str
        Column name containing age strings.
    
    Returns
    -------
    df : pd.DataFrame
        Dataframe with new column:
        - age_days
    """
    
    # Drop negative anomalies
    df = df[~df[col].astype(str).str.contains("-", na=False)].copy()
    
    # Fix plural issues for "1 unit"
    fixes = {
        r"\b1 years\b": "1 year",
        r"\b1 months\b": "1 month",
        r"\b1 weeks\b": "1 week",
        r"\b1 days\b": "1 day"
    }
    for wrong, right in fixes.items():
        df[col] = df[col].str.replace(wrong, right, regex=True)
    
    # Unit to days mapping
    unit_to_days = {"day": 1, "week": 7, "month": 30, "year": 365}
    
    def parse_age(age_str):
        if pd.isna(age_str) or str(age_str).strip() == "":
            return None
        match = re.match(r"(\d+)\s+(\w+)", str(age_str).lower())
        if not match:
            return None
        value, unit = int(match.group(1)), match.group(2).rstrip("s")
        return value * unit_to_days.get(unit, None)
    
    # Apply parser
    df["age_days"] = df[col].apply(parse_age)
    
    return df


In [20]:
merged_df = clean_and_parse_age_days(merged_df, col="age_upon_intake")

# quick check of results
print(merged_df.shape)
print(merged_df[["age_upon_intake", "age_days"]].sample(30))

(135231, 15)
       age_upon_intake  age_days
57418           1 year       365
61126          4 years      1460
76888         10 years      3650
26126          1 month        30
123162          1 year       365
115078        8 months       240
77742          1 month        30
55326          1 month        30
1131          13 years      4745
52434          2 years       730
7930            1 year       365
44639          1 month        30
90571          2 years       730
671           15 years      5475
2488           8 years      2920
8507           1 month        30
58810           1 week         7
6620           5 years      1825
77289           1 year       365
105672         5 years      1825
97552           1 year       365
15057           1 year       365
100297          1 year       365
80762          3 weeks        21
18181           1 week         7
124657          1 year       365
39323          2 years       730
83750          5 years      1825
47136          1 month        

In [21]:
def add_age_group(df, col="age_days"):
    """
    Add a categorical 'age_group' column for dogs and cats based on biologically meaningful cutoffs.

    Groups:
    - Neonate: < 8 weeks (0–55 days)
    - Juvenile: 8 weeks – < 1 year (56–364 days)
    - Young Adult: 1 – < 7 years (365–2554 days)
    - Senior: 7+ years (2555+ days)

    Parameters
    ----------
    df : pd.DataFrame
        Input dataframe containing a numeric 'age_days' column.
    col : str
        Column name with numeric age in days.

    Returns
    -------
    df : pd.DataFrame
        Same dataframe with an additional column 'age_group'.
    """

    # Define cutoffs in **days** using right=False (left-inclusive bins):
    # 0–55 days    → Neonate
    # 56–364 days  → Juvenile
    # 365–2554 days → Young Adult
    # 2555+ days   → Senior
    bins = [0, 56, 365, 2555, float("inf")]

    # Labels that will be assigned to each bin
    labels = ["Neonate", "Juvenile", "Young Adult", "Senior"]

    # Use pd.cut to assign bins
    # right=False → intervals are [a, b), i.e. left-closed, right-open
    df["age_group"] = pd.cut(df[col], bins=bins, labels=labels, right=False)

    return df


In [22]:
merged_df = add_age_group(merged_df, col="age_days")
merged_df[['age_days', 'age_group']].sample(30)

Unnamed: 0,age_days,age_group
79572,30,Neonate
25523,1825,Young Adult
45602,1095,Young Adult
84114,120,Juvenile
86432,21,Neonate
92977,210,Juvenile
79123,730,Young Adult
96374,730,Young Adult
129114,1095,Young Adult
100301,30,Neonate


# Creating features for sex_upon_intake:

In [23]:
merged_df['sex_upon_intake'].value_counts()

sex_upon_intake
Intact Male      46903
Intact Female    46355
Neutered Male    19619
Spayed Female    16750
Unknown           5603
Name: count, dtype: int64

In [24]:
merged_df['sex_upon_intake'] = merged_df['sex_upon_intake'].fillna("Unknown") # NaN - Unknown

In [25]:
# Define mapping dictionaries
intactness_mapping = {
    'Intact Male': 'intact',
    'Intact Female': 'intact',
    'Neutered Male': 'not_intact',
    'Spayed Female': 'not_intact',
    'Unknown': 'unknown'
}

sex_mapping = {
    'Intact Male': 'male',
    'Intact Female': 'female',
    'Neutered Male': 'male',
    'Spayed Female': 'female',
    'Unknown': 'unknown'
}

# Apply mappings
merged_df['intactness'] = merged_df['sex_upon_intake'].map(intactness_mapping)
merged_df['sex'] = merged_df['sex_upon_intake'].map(sex_mapping)

In [26]:
merged_df[['sex_upon_intake','sex','intactness']].sample(30)

Unnamed: 0,sex_upon_intake,sex,intactness
81814,Neutered Male,male,not_intact
59852,Unknown,unknown,unknown
124883,Intact Male,male,intact
51539,Intact Female,female,intact
101399,Intact Female,female,intact
66165,Intact Female,female,intact
103445,Neutered Male,male,not_intact
14564,Intact Male,male,intact
84891,Unknown,unknown,unknown
19615,Intact Male,male,intact


In [27]:
merged_df.columns

Index(['animal_id', 'name', 'sex_upon_intake', 'age_upon_intake',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type', 'HasName',
       'NameLength', 'age_days', 'age_group', 'intactness', 'sex'],
      dtype='object')

# Creating features for datetime_intake:

In [28]:
merged_df['datetime_intake']

0         2017-12-07 14:07:00
1         2014-04-02 15:55:00
2         2013-11-16 09:02:00
3         2013-11-16 14:46:00
4         2014-11-14 15:11:00
                 ...         
135230    2025-05-02 12:22:00
135231    2025-05-02 12:22:00
135232    2025-05-02 12:22:00
135233    2025-05-02 12:22:00
135234    2025-05-02 20:27:00
Name: datetime_intake, Length: 135231, dtype: object

In [29]:
merged_df['visit_count'].value_counts()

visit_count
1     122059
2      10478
3       1935
4        505
5        141
6         52
7         37
8         14
9          5
11         1
14         1
12         1
33         1
10         1
Name: count, dtype: int64

In [None]:
# Ye niche ka thoda galat hai - Mid_Term vale script mai hai correct vaaala

In [30]:
import pandas as pd

merged_df['datetime_intake'] = pd.to_datetime(merged_df['datetime_intake'])

# Year
merged_df['intake_year'] = merged_df['datetime_intake'].dt.year

# Month
merged_df['intake_month'] = merged_df['datetime_intake'].dt.month_name()

# Day of Month
merged_df['intake_day'] = merged_df['datetime_intake'].dt.day

# Day of Week (full name)
merged_df['intake_weekday'] = merged_df['datetime_intake'].dt.day_name()

# Week Number of Year
merged_df['intake_week'] = merged_df['datetime_intake'].dt.isocalendar().week

# Hour of the Day
merged_df['intake_hour'] = merged_df['datetime_intake'].dt.hour

# Quarter of the Year
merged_df['intake_quarter'] = merged_df['datetime_intake'].dt.quarter

# Weekend flag with string weekdays
merged_df['is_weekend'] = merged_df['intake_weekday'].apply(
    lambda x: "Weekend" if x in ["Saturday", "Sunday"] else "Weekday"
)

# Season (Winter, Spring, Summer, Fall)
def get_season(month):
    if month in [12, 1, 2]:
        return "Winter"
    elif month in [3, 4, 5]:
        return "Spring"
    elif month in [6, 7, 8]:
        return "Summer"
    else:
        return "Fall"
merged_df['intake_season'] = merged_df['intake_month'].apply(get_season)

# AM/PM
merged_df['intake_am_pm'] = merged_df['intake_hour'].apply(lambda x: "AM" if x < 12 else "PM")

# Part of the Day
def part_of_day(hour):
    if 5 <= hour < 12:
        return "Morning"
    elif 12 <= hour < 17:
        return "Afternoon"
    elif 17 <= hour < 21:
        return "Evening"
    else:
        return "Night"
merged_df['intake_part_of_day'] = merged_df['intake_hour'].apply(part_of_day)

In [31]:
# Quick check of new features
merged_df[[
    "datetime_intake","intake_year","intake_month","intake_day","intake_weekday",
    "intake_week","intake_hour","intake_quarter","is_weekend","intake_season",
    "intake_am_pm","intake_part_of_day"
]].sample(30)

Unnamed: 0,datetime_intake,intake_year,intake_month,intake_day,intake_weekday,intake_week,intake_hour,intake_quarter,is_weekend,intake_season,intake_am_pm,intake_part_of_day
84903,2019-09-09 13:25:00,2019,September,9,Monday,37,13,3,Weekday,Fall,PM,Afternoon
112198,2022-08-26 18:13:00,2022,August,26,Friday,34,18,3,Weekday,Fall,PM,Evening
116754,2023-03-28 18:12:00,2023,March,28,Tuesday,13,18,1,Weekday,Fall,PM,Evening
70700,2018-09-11 13:58:00,2018,September,11,Tuesday,37,13,3,Weekday,Fall,PM,Afternoon
80297,2019-06-04 15:32:00,2019,June,4,Tuesday,23,15,2,Weekday,Fall,PM,Afternoon
31337,2015-08-27 12:37:00,2015,August,27,Thursday,35,12,3,Weekday,Fall,PM,Afternoon
78480,2019-04-30 10:55:00,2019,April,30,Tuesday,18,10,2,Weekday,Fall,AM,Morning
52360,2017-04-18 10:48:00,2017,April,18,Tuesday,16,10,2,Weekday,Fall,AM,Morning
104005,2021-11-01 12:29:00,2021,November,1,Monday,44,12,4,Weekday,Fall,PM,Afternoon
74227,2018-12-22 18:25:00,2018,December,22,Saturday,51,18,4,Weekend,Fall,PM,Evening


In [32]:
print(merged_df.shape)

(135231, 29)


In [33]:
merged_df.columns

Index(['animal_id', 'name', 'sex_upon_intake', 'age_upon_intake',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type', 'HasName',
       'NameLength', 'age_days', 'age_group', 'intactness', 'sex',
       'intake_year', 'intake_month', 'intake_day', 'intake_weekday',
       'intake_week', 'intake_hour', 'intake_quarter', 'is_weekend',
       'intake_season', 'intake_am_pm', 'intake_part_of_day'],
      dtype='object')

# Creating features for visit_count:

In [34]:
# Create binary feature: 0 for first-time, 1 for return visitors
merged_df['is_return_visitor'] = (merged_df['visit_count'] > 1).astype(int)

In [35]:
merged_df[['is_return_visitor', 'visit_count']].sample(20)

Unnamed: 0,is_return_visitor,visit_count
2480,1,2
37471,0,1
17541,1,2
42475,0,1
11259,0,1
21825,0,1
88668,0,1
39865,0,1
7944,0,1
28897,0,1


In [36]:
merged_df.columns

Index(['animal_id', 'name', 'sex_upon_intake', 'age_upon_intake',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type', 'HasName',
       'NameLength', 'age_days', 'age_group', 'intactness', 'sex',
       'intake_year', 'intake_month', 'intake_day', 'intake_weekday',
       'intake_week', 'intake_hour', 'intake_quarter', 'is_weekend',
       'intake_season', 'intake_am_pm', 'intake_part_of_day',
       'is_return_visitor'],
      dtype='object')

# Features for intake_type:

In [37]:
merged_df['intake_type'].value_counts()

intake_type
Stray                 96589
Owner Surrender       28928
Public Assist          7903
Abandoned              1575
Euthanasia Request      235
Wildlife                  1
Name: count, dtype: int64

In [38]:
merged_df.shape

(135231, 30)

In [39]:
# Drop rows where intake_type == "Wildlife"
merged_df = merged_df[merged_df['intake_type'] != "Wildlife"]
print(merged_df['intake_type'].value_counts())
merged_df.shape

intake_type
Stray                 96589
Owner Surrender       28928
Public Assist          7903
Abandoned              1575
Euthanasia Request      235
Name: count, dtype: int64


(135230, 30)

In [40]:
# VOLUNTARY vs INVOLUNTARY INTAKE:
# - VOLUNTARY: Deliberate owner choices (Owner Surrender, Euthanasia Request)
#   - Owner makes active decision to bring animal to shelter
#   - May indicate behavioral issues, financial constraints, or life changes
# - INVOLUNTARY: Circumstantial situations (Stray, Abandoned, Public Assist)  
#   - Animal ends up at shelter due to circumstances, not owner decision
#   - More about animal control, public safety, or emergency situations

def voluntary_group(val):
    if val in ["Owner Surrender", "Euthanasia Request"]:
        return "Voluntary"
    elif val in ["Stray", "Abandoned", "Public Assist"]:
        return "Involuntary"
    else:
        return "Other"
merged_df['intake_voluntary'] = merged_df['intake_type'].apply(voluntary_group)

In [41]:
# HUMAN vs NON-HUMAN INITIATED:
# - HUMAN-INITIATED: Direct human action (Owner Surrender, Public Assist, Euthanasia Request)
#   - Specific person directly responsible for animal coming to shelter
#   - Often comes with more background information and known history
# - NON-HUMAN: System/environment driven (Stray, Abandoned)
#   - Animal comes through systematic channels rather than person's direct action
#   - Often unknown history, behavior, and more "mystery" about the animal

def human_group(val):
    if val in ["Owner Surrender", "Public Assist", "Euthanasia Request"]:
        return "Human"
    elif val in ["Stray", "Abandoned"]:
        return "Non-Human"
    else:
        return "Other"
merged_df['intake_human'] = merged_df['intake_type'].apply(human_group)

In [42]:
merged_df[['intake_type','intake_voluntary','intake_human']].sample(10)

Unnamed: 0,intake_type,intake_voluntary,intake_human
118644,Stray,Involuntary,Non-Human
58930,Stray,Involuntary,Non-Human
106244,Public Assist,Involuntary,Human
96898,Stray,Involuntary,Non-Human
88320,Stray,Involuntary,Non-Human
73203,Stray,Involuntary,Non-Human
27415,Stray,Involuntary,Non-Human
133714,Stray,Involuntary,Non-Human
109851,Stray,Involuntary,Non-Human
119000,Stray,Involuntary,Non-Human


# Features for intake_condition:

In [43]:
merged_df['intake_condition'].value_counts()

intake_condition
Normal        115100
Injured         8309
Sick            5427
Nursing         3067
Neonatal        1584
Medical          509
Aged             478
Other            290
Pregnant         133
Feral            122
Med Attn          76
Behavior          60
Unknown           23
Med Urgent        21
Neurologic        11
Parvo             11
Space              4
Agonal             3
Panleuk            1
Congenital         1
Name: count, dtype: int64

In [44]:
def condition_group(val):
    if val in ["Normal"]:
        return "healthy"
    elif val in ["Injured", "Sick", "Medical", "Med Attn"]:
        return "medical_attention_needed"
    elif val in ["Med Urgent", "Neurologic", "Parvo", "Panleuk", "Congenital", "Agonal"]:
        return "critical_condition"
    elif val in ["Nursing", "Neonatal", "Pregnant", "Aged"]:
        return "special_care"
    elif val in ["Feral", "Behavior"]:
        return "behavioral"
    elif val in ["Unknown", "Other", "Space"]:
        return "Unknown"
    else:
        return "Other"

merged_df['intake_condition_grouped'] = merged_df['intake_condition'].apply(condition_group)

In [45]:
merged_df['intake_condition_grouped'].value_counts()

intake_condition_grouped
healthy                     115100
medical_attention_needed     14321
special_care                  5262
Unknown                        317
behavioral                     182
critical_condition              48
Name: count, dtype: int64

In [46]:
merged_df['is_healthy'] = (merged_df['intake_condition_grouped'] == "healthy").astype(int)
print(merged_df[['intake_condition_grouped','is_healthy']])

        intake_condition_grouped  is_healthy
0                        healthy           1
1                        healthy           1
2       medical_attention_needed           0
3                   special_care           0
4                        healthy           1
...                          ...         ...
135230              special_care           0
135231              special_care           0
135232              special_care           0
135233                   healthy           1
135234  medical_attention_needed           0

[135230 rows x 2 columns]


In [47]:
print(merged_df.columns)
print(merged_df.shape)

Index(['animal_id', 'name', 'sex_upon_intake', 'age_upon_intake',
       'datetime_intake', 'visit_count', 'intake_type', 'intake_condition',
       'animal_type', 'breed', 'color', 'outcome_type', 'HasName',
       'NameLength', 'age_days', 'age_group', 'intactness', 'sex',
       'intake_year', 'intake_month', 'intake_day', 'intake_weekday',
       'intake_week', 'intake_hour', 'intake_quarter', 'is_weekend',
       'intake_season', 'intake_am_pm', 'intake_part_of_day',
       'is_return_visitor', 'intake_voluntary', 'intake_human',
       'intake_condition_grouped', 'is_healthy'],
      dtype='object')
(135230, 34)


# Features for color:

In [48]:
print(merged_df['color'].value_counts().to_string())

color
Black/White                    14083
Black                          11711
Brown Tabby                     9022
White                           4949
Brown Tabby/White               4587
Orange Tabby                    4364
Brown/White                     4330
Tan/White                       3931
White/Black                     3746
Blue/White                      3677
Tan                             3437
Brown                           3273
Black/Tan                       2976
Black/Brown                     2843
Tricolor                        2823
Tortie                          2700
Blue                            2651
White/Brown                     2550
Calico                          2549
Blue Tabby                      2351
Brown/Black                     2301
Orange Tabby/White              2171
Brown Brindle/White             2105
White/Tan                       1966
Torbie                          1822
Red                             1345
Red/White                       

In [49]:
print(merged_df['color'].nunique())

616


# Features for breed:

In [50]:
print(merged_df['breed'].value_counts().to_string())

breed
Domestic Shorthair Mix                                    28531
Domestic Shorthair                                        20594
Pit Bull Mix                                               7423
Labrador Retriever Mix                                     7066
Chihuahua Shorthair Mix                                    5959
German Shepherd Mix                                        3254
Domestic Medium Hair Mix                                   2792
Pit Bull                                                   2746
Domestic Medium Hair                                       1799
Chihuahua Shorthair                                        1797
Labrador Retriever                                         1773
Australian Cattle Dog Mix                                  1592
German Shepherd                                            1474
Domestic Longhair Mix                                      1429
Siamese Mix                                                1246
Dachshund Mix                     

In [51]:
print(merged_df['breed'].nunique())

2716
