In [17]:
# Ethan Mason, Matthew Little
# em45486, ?

In [18]:
import pandas as pd
import numpy as np
import sklearn as sk
import re
import warnings
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

train_df.head(10)


Unnamed: 0,Id,Name,Intake Time,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Outcome Time,Date of Birth,Outcome Type
0,A706918,Belle,07/05/2015 12:59:00 PM,9409 Bluegrass Dr in Austin (TX),Stray,Normal,Dog,Spayed Female,8 years,English Springer Spaniel,White/Liver,07/05/2015 03:13:00 PM,07/05/2007,Return to Owner
1,A724273,Runster,04/14/2016 06:43:00 PM,2818 Palomino Trail in Austin (TX),Stray,Normal,Dog,Intact Male,11 months,Basenji Mix,Sable/White,04/21/2016 05:17:00 PM,04/17/2015,Return to Owner
2,A857105,Johnny Ringo,05/12/2022 12:23:00 AM,4404 Sarasota Drive in Austin (TX),Public Assist,Normal,Cat,Neutered Male,2 years,Domestic Shorthair,Orange Tabby,05/12/2022 02:35:00 PM,05/12/2020,Transfer
3,A743852,Odin,02/18/2017 12:46:00 PM,Austin (TX),Owner Surrender,Normal,Dog,Neutered Male,2 years,Labrador Retriever Mix,Chocolate,02/21/2017 05:44:00 PM,02/18/2015,Return to Owner
4,A635072,Beowulf,04/16/2019 09:53:00 AM,415 East Mary Street in Austin (TX),Public Assist,Normal,Dog,Neutered Male,6 years,Great Dane Mix,Black,04/18/2019 01:45:00 PM,06/03/2012,Return to Owner
5,A844350,*Ella,10/15/2021 11:40:00 AM,2112 East William Cannon Drive in Austin (TX),Stray,Normal,Cat,Intact Female,6 months,Domestic Shorthair,Brown Tabby,10/20/2021 06:51:00 PM,04/15/2021,Adoption
6,A708452,Mumble,07/30/2015 02:37:00 PM,Austin (TX),Public Assist,Normal,Dog,Intact Male,2 years,Labrador Retriever Mix,Black/White,08/04/2015 06:17:00 PM,07/28/2013,Return to Owner
7,A818975,,06/18/2020 02:53:00 PM,Braker Lane And Metric in Travis (TX),Stray,Normal,Cat,Intact Male,4 weeks,Domestic Shorthair,Cream Tabby,07/23/2020 03:54:00 PM,05/19/2020,Adoption
8,A774147,,06/11/2018 07:45:00 AM,6600 Elm Creek in Austin (TX),Stray,Injured,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Black/White,06/11/2018 12:00:00 AM,05/10/2018,Transfer
9,A731435,*Casey,08/08/2016 05:52:00 PM,Austin (TX),Owner Surrender,Normal,Cat,Neutered Male,5 months,Domestic Shorthair Mix,Cream Tabby,08/13/2016 12:27:00 PM,02/22/2016,Transfer


## Data Preprocessing

### 🧼 Preprocessing Summary

The following preprocessing steps were applied to prepare the dataset for modeling:


In [19]:
# Ignore name because that won't affect outcome
train_df.drop('Name', axis=1, inplace=True)

print(train_df[train_df["Id"] == "A123456"])
# Drop the record that has 'Wildlife' as the value for 'Intake Type'
train_df = train_df[train_df["Id"] != "A123456"]

# Drop ID because that won't affect outcome
train_df.drop('Id', axis=1, inplace=True)

# Drop Date of Birth column since we already have age
train_df.drop('Date of Birth', axis=1, inplace=True)

# Drop Found Location column
train_df.drop('Found Location', axis=1, inplace=True)

# Drop Color column
train_df.drop('Color', axis=1, inplace=True)

# Drop Intake Time column
train_df.drop('Intake Time', axis=1, inplace=True)

# Drop Outcome Time column
train_df.drop('Outcome Time', axis=1, inplace=True)

# Drop any rows where the "Sex upon Intake" column has the value "Unknown"
train_df = train_df[~train_df["Sex upon Intake"].astype(str).str.strip().str.lower().eq("unknown")]

# Fill the 'Sex upon Intake' NaNs with the mode
train_df['Sex upon Intake'].fillna(train_df['Sex upon Intake'].mode()[0], inplace=True)

# Function to convert age to months
def convert_to_months(age):
    if pd.isna(age):
        return np.nan
    
    # Extract the number and unit using regex
    match = re.match(r'(\d+)\s*(years?|months?|weeks?)', age, re.IGNORECASE)

    if match:
        value = float(match.group(1))
        unit = match.group(2).lower()

        if 'year' in unit:
            return value * 12
        elif 'week' in unit:
            return value / 4.345 # Approx. weeks per month
        else:
            return value
        
    return np.nan

# Drop animals with the "unknown", "other", or "space" intake condition
# because there are so few records with these values
def drop_uncertain_intake_conditions(df):
    drop_values = {"unknown", "other", "space"}
    mask = df["Intake Condition"].astype(str).str.strip().str.lower().isin(drop_values)
    before = len(df)
    df = df[~mask].copy()
    after = len(df)
    return df


# One-hot Encode the 'Sex upon Intake' column
def encode_sex_upon_intake(df):
    df['Sex'] = df['Sex upon Intake'].str.extract(r'(Intact|Spayed|Neutered)')
    df['Sex Type'] = df['Sex upon Intake'].str.extract(r'(Male|Female)')
    df.drop('Sex upon Intake', axis=1, inplace=True)
    return df

# Convert 'Breed' column values into 'Mix' or 'Purebred'.
# If a value contains '/' or 'mix' (case-insensitive), it's 'Mix'; otherwise 'Purebred'.
def simplify_breed_column(df):
    df = df.copy()
    df["Breed"] = df["Breed"].astype(str)
    df["Breed Type"] = df["Breed"].apply(
        lambda x: "Mix" if ("/" in x or "mix" in x.lower()) else "Purebred"
    )

    df.drop("Breed", axis=1, inplace=True)
    return df


def preprocess(df):
    df['Age upon Intake'] = df['Age upon Intake'].apply(convert_to_months)
        
    # Compute and apply mean age to NaNs or negative ages
    if df['Age upon Intake'].isna().any() or (df['Age upon Intake'] < 0).any():
        mean_age = df['Age upon Intake'].mean()
        df['Age upon Intake'].fillna(mean_age, inplace=True)
    
    df.rename(columns={'Age upon Intake': 'Age upon Intake (Months)'}, inplace=True)

    df = simplify_breed_column(df)
    df = encode_sex_upon_intake(df)
    df = drop_uncertain_intake_conditions(df)

    return df

Empty DataFrame
Columns: [Id, Intake Time, Found Location, Intake Type, Intake Condition, Animal Type, Sex upon Intake, Age upon Intake, Breed, Color, Outcome Time, Date of Birth, Outcome Type]
Index: []


## Training the Model

Use XGBoost

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# 1. Preprocess both datasets
train_df = preprocess(train_df)
test_df = preprocess(test_df)

# 2. Split by Animal Type
train_cats = train_df[train_df["Animal Type"].str.lower() == "cat"]
train_dogs = train_df[train_df["Animal Type"].str.lower() == "dog"]

test_cats = test_df[test_df["Animal Type"].str.lower() == "cat"]
test_dogs = test_df[test_df["Animal Type"].str.lower() == "dog"]

def train_and_predict_species(train_df, test_df):
    X = train_df.drop(columns=["Outcome Type"])
    y = train_df["Outcome Type"]

    # One-hot encode
    categorical_cols = [
        "Intake Type", "Intake Condition", "Animal Type",
        "Breed Type", "Sex", "Sex Type"
    ]
    X = pd.get_dummies(X, columns=categorical_cols)
    X_test = pd.get_dummies(test_df, columns=categorical_cols)

    # Align columns
    X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

    # Label encode outcomes
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)

    # Split train/val
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
    )

    # Train XGBoost
    model = XGBClassifier(
        objective='multi:softprob',
        eval_metric='mlogloss',
        use_label_encoder=False,
        num_class=len(label_encoder.classes_),
        n_estimators=1000,
        learning_rate=0.05,
        max_depth=6,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        verbose=True
    )

    # Predict
    y_test_pred = model.predict(X_test)
    predicted_labels = label_encoder.inverse_transform(y_test_pred)

    return predicted_labels

# 3. Train + Predict separately
cat_predictions = train_and_predict_species(train_cats, test_cats)
dog_predictions = train_and_predict_species(train_dogs, test_dogs)

# 4. Combine predictions and save
test_cats = test_cats.copy()
test_dogs = test_dogs.copy()
test_cats["Outcome"] = cat_predictions
test_dogs["Outcome"] = dog_predictions

submission_df = pd.concat([test_cats, test_dogs]).sort_index()
submission_df = submission_df[["Id", "Outcome"]]  # Ensure correct format
submission_df.to_csv("submission_cats_and_dogs.csv", index=False)

[0]	validation_0-mlogloss:1.56301
[1]	validation_0-mlogloss:1.52038
[2]	validation_0-mlogloss:1.48113
[3]	validation_0-mlogloss:1.44123
[4]	validation_0-mlogloss:1.40483
[5]	validation_0-mlogloss:1.37108
[6]	validation_0-mlogloss:1.34178
[7]	validation_0-mlogloss:1.31262
[8]	validation_0-mlogloss:1.28549
[9]	validation_0-mlogloss:1.26020
[10]	validation_0-mlogloss:1.23938
[11]	validation_0-mlogloss:1.21720
[12]	validation_0-mlogloss:1.19761
[13]	validation_0-mlogloss:1.17789
[14]	validation_0-mlogloss:1.15940


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[15]	validation_0-mlogloss:1.14306
[16]	validation_0-mlogloss:1.12764
[17]	validation_0-mlogloss:1.11276
[18]	validation_0-mlogloss:1.09790
[19]	validation_0-mlogloss:1.08396
[20]	validation_0-mlogloss:1.07085
[21]	validation_0-mlogloss:1.05895
[22]	validation_0-mlogloss:1.04845
[23]	validation_0-mlogloss:1.03800
[24]	validation_0-mlogloss:1.02752
[25]	validation_0-mlogloss:1.01788
[26]	validation_0-mlogloss:1.00831
[27]	validation_0-mlogloss:0.99929
[28]	validation_0-mlogloss:0.99124
[29]	validation_0-mlogloss:0.98352
[30]	validation_0-mlogloss:0.97580
[31]	validation_0-mlogloss:0.96903
[32]	validation_0-mlogloss:0.96199
[33]	validation_0-mlogloss:0.95552
[34]	validation_0-mlogloss:0.94906
[35]	validation_0-mlogloss:0.94331
[36]	validation_0-mlogloss:0.93811
[37]	validation_0-mlogloss:0.93257
[38]	validation_0-mlogloss:0.92736
[39]	validation_0-mlogloss:0.92239
[40]	validation_0-mlogloss:0.91790
[41]	validation_0-mlogloss:0.91349
[42]	validation_0-mlogloss:0.90920
[43]	validation_0-ml

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[16]	validation_0-mlogloss:1.22937
[17]	validation_0-mlogloss:1.21727
[18]	validation_0-mlogloss:1.20569
[19]	validation_0-mlogloss:1.19495
[20]	validation_0-mlogloss:1.18487
[21]	validation_0-mlogloss:1.17502
[22]	validation_0-mlogloss:1.16577
[23]	validation_0-mlogloss:1.15708
[24]	validation_0-mlogloss:1.14886
[25]	validation_0-mlogloss:1.14093
[26]	validation_0-mlogloss:1.13355
[27]	validation_0-mlogloss:1.12643
[28]	validation_0-mlogloss:1.11983
[29]	validation_0-mlogloss:1.11338
[30]	validation_0-mlogloss:1.10717
[31]	validation_0-mlogloss:1.10157
[32]	validation_0-mlogloss:1.09603
[33]	validation_0-mlogloss:1.09074
[34]	validation_0-mlogloss:1.08569
[35]	validation_0-mlogloss:1.08093
[36]	validation_0-mlogloss:1.07629
[37]	validation_0-mlogloss:1.07193
[38]	validation_0-mlogloss:1.06786
[39]	validation_0-mlogloss:1.06392
[40]	validation_0-mlogloss:1.06007
[41]	validation_0-mlogloss:1.05641
[42]	validation_0-mlogloss:1.05304
[43]	validation_0-mlogloss:1.04963
[44]	validation_0-ml

Make predictions

### Generate CSV File

In [21]:
# # Generate sequential IDs (1-based indexing)
# ids = np.arange(1, len(predicted_labels) + 1)

# # Create the DataFrame
# submission_df = pd.DataFrame({
#     'Id': ids,
#     'Outcome': predicted_labels
# })

# # Save to CSV
# submission_df.to_csv('submission7.csv', index=False)