In [21]:
# Ethan Mason, Matthew Little
# em45486, ?

In [22]:
import pandas as pd
import numpy as np
import sklearn as sk
import re
import warnings
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

test_df.head(10)


Unnamed: 0,Id,Intake Time,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Date of Birth
0,1,1/3/19 16:19,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,1/3/17
1,2,10/21/13 7:59,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,9/21/13
2,3,6/29/14 10:38,800 Grove Blvd in Austin (TX),Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,6/29/10
3,4,7/11/15 18:19,Galilee Court And Damita Jo Dr in Manor (TX),Stray,Normal,Dog,Intact Female,5 months,Pit Bull,Brown/White,1/11/15
4,5,2/4/17 10:10,208 Beaver St in Austin (TX),Stray,Injured,Cat,Intact Female,2 years,Domestic Shorthair Mix,Black/White,2/4/15
5,6,3/30/22 9:27,512 Bowery Trl in Austin (TX),Stray,Injured,Dog,Neutered Male,4 years,Beagle,Tricolor,9/3/17
6,7,4/2/21 11:16,Austin (TX),Owner Surrender,Normal,Cat,Intact Female,1 month,Domestic Shorthair Mix,Tortie,2/2/21
7,8,2/21/23 15:26,Ih 35 And Hwy 71 in Austin (TX),Stray,Normal,Dog,Intact Male,2 years,Siberian Husky,Gray/White,10/28/20
8,9,6/29/15 17:04,5306 Peppertree in Austin (TX),Stray,Normal,Cat,Unknown,4 days,Domestic Shorthair Mix,Black,6/25/15
9,10,5/5/15 7:29,4434 Frontier Trl in Austin (TX),Stray,Nursing,Cat,Unknown,2 weeks,Domestic Shorthair Mix,Gray Tabby,4/20/15


## Data Preprocessing

### 🧼 Preprocessing Summary

The following preprocessing steps were applied to prepare the dataset for modeling:

- **Age Conversion**: Converted `Age upon Intake` strings to months using regex, filled missing values with the mean, and renamed the column.

- **Length of Stay**: Calculated `Length of Stay (Days)` as the difference between `Outcome Time` and `Intake Time`, then dropped `Outcome Time`.

- **Datetime Features**: Extracted `Intake Hour`, `Intake DayOfWeek`, and `Intake Month` from `Intake Time`, then dropped the original column.

- **Location Simplification**: Created a boolean feature `Found In Austin` from the `Found Location` string, and dropped the original column.

- **Sex Encoding**: Extracted two features from `Sex upon Intake`: `Sex` (e.g., Intact, Spayed) and `Sex Type` (Male/Female), then dropped the original column.

- **Color Binning**: Kept the top 5 most frequent colors, grouped all others as `"Other"`, one-hot encoded the result, and dropped the original `Color` column.


In [23]:
# Ignore name because that won't affect outcome
train_df = pd.concat([train_df.iloc[:, :1], train_df.iloc[:, 2:]], axis=1)

# Drop ID because that won't affect outcome
train_df.drop('Id', axis=1, inplace=True)

# Drop Date of Birth column since we already have age
train_df.drop('Date of Birth', axis=1, inplace=True)

# Fill the 'Sex upon Intake' NaNs with the mode
train_df['Sex upon Intake'].fillna(train_df['Sex upon Intake'].mode()[0], inplace=True)

# Function to convert age to months
def convert_to_months(age):
    if pd.isna(age):
        return np.nan
    
    # Extract the number and unit using regex
    match = re.match(r'(\d+)\s*(years?|months?|weeks?)', age, re.IGNORECASE)

    if match:
        value = float(match.group(1))
        unit = match.group(2).lower()

        if 'year' in unit:
            return value * 12
        elif 'week' in unit:
            return value / 4.345 # Approx. weeks per month
        else:
            return value
        
    return np.nan

# Calculte how long the animal was with the shelter
def calculate_length_of_stay(df):
    df["Intake Time"] = pd.to_datetime(df["Intake Time"])
    df["Outcome Time"] = pd.to_datetime(df["Outcome Time"])

    df["Length of Stay (Days)"] = (df["Outcome Time"] - df["Intake Time"]).dt.total_seconds() / (60 * 60 * 24)

    df.drop('Outcome Time', axis=1, inplace=True)
    
    return df

# Function to Parse Intake Time
def process_intake_time(df):
    df['Intake Time'] = pd.to_datetime(df['Intake Time'])
    df['Intake Hour'] = df['Intake Time'].dt.hour
    df['Intake DayOfWeek'] = df['Intake Time'].dt.dayofweek
    df['Intake Month'] = df['Intake Time'].dt.month
    df.drop('Intake Time', axis=1, inplace=True)

    return df

# Turn the 'Found Location' column into a Boolean column representing if
# the animal was found in Austin or not
def add_in_austin_flag(df):
    df['Found In Austin'] = df['Found Location'].str.contains('Austin', case=False, na=False)
    df.drop('Found Location', axis=1, inplace=True)
    return df

# One-hot Encode the 'Sex upon Intake' column
def encode_sex_upon_intake(df):
    df['Sex'] = df['Sex upon Intake'].str.extract(r'(Intact|Spayed|Neutered)')
    df['Sex Type'] = df['Sex upon Intake'].str.extract(r'(Male|Female)')
    df.drop('Sex upon Intake', axis=1, inplace=True)
    return df

# Bin the top n colors in order to reduce cardinality of the 'Color' column
def bin_top_colors(df, top_n=5):
    # Get the top N most common colors
    top_colors = df["Color"].value_counts().nlargest(top_n).index

    # Create a new column with binned color values
    df["Color Binned"] = df["Color"].apply(lambda c: c if c in top_colors else "Other")

    # Drop the old 'Color' column
    df.drop('Color', axis=1, inplace=True)

    # One-hot encode the new binned color values
    color_dummies = pd.get_dummies(df["Color Binned"], prefix="Color")
    df = pd.concat([df, color_dummies], axis=1)
    
    return df


def preprocess(df):
    df['Age upon Intake'] = df['Age upon Intake'].apply(convert_to_months)
    
    # Compute and apply mean age, if any NaNs remain
    if df['Age upon Intake'].isna().any():
        mean_age = df['Age upon Intake'].mean()
        df['Age upon Intake'].fillna(mean_age, inplace=True)
    
    df.rename(columns={'Age upon Intake': 'Age upon Intake (Months)'}, inplace=True)

    # Only calculate length of stay if Outcome Time is available
    if "Outcome Time" in df.columns:
        df = calculate_length_of_stay(df)

    df = process_intake_time(df)
    df = add_in_austin_flag(df)
    df = encode_sex_upon_intake(df)
    df = bin_top_colors(df, top_n=5)

    return df




train_df = preprocess(train_df)


train_df.head(10)

  df["Outcome Time"] = pd.to_datetime(df["Outcome Time"])


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Age upon Intake (Months),Breed,Outcome Type,Length of Stay (Days),Intake Hour,Intake DayOfWeek,Intake Month,Found In Austin,Sex,Sex Type,Color Binned,Color_Black,Color_Black/White,Color_Brown Tabby,Color_Brown Tabby/White,Color_Other,Color_White
0,Stray,Normal,Dog,96.0,English Springer Spaniel,Return to Owner,0.093056,12,6,7,True,Spayed,Female,Other,False,False,False,False,True,False
1,Stray,Normal,Dog,11.0,Basenji Mix,Return to Owner,6.940278,18,3,4,True,Intact,Male,Other,False,False,False,False,True,False
2,Public Assist,Normal,Cat,24.0,Domestic Shorthair,Transfer,0.591667,0,3,5,True,Neutered,Male,Other,False,False,False,False,True,False
3,Owner Surrender,Normal,Dog,24.0,Labrador Retriever Mix,Return to Owner,3.206944,12,5,2,True,Neutered,Male,Other,False,False,False,False,True,False
4,Public Assist,Normal,Dog,72.0,Great Dane Mix,Return to Owner,2.161111,9,1,4,True,Neutered,Male,Black,True,False,False,False,False,False
5,Stray,Normal,Cat,6.0,Domestic Shorthair,Adoption,5.299306,11,4,10,True,Intact,Female,Brown Tabby,False,False,True,False,False,False
6,Public Assist,Normal,Dog,24.0,Labrador Retriever Mix,Return to Owner,5.152778,14,3,7,True,Intact,Male,Black/White,False,True,False,False,False,False
7,Stray,Normal,Cat,0.920598,Domestic Shorthair,Adoption,35.042361,14,3,6,False,Intact,Male,Other,False,False,False,False,True,False
8,Stray,Injured,Cat,0.920598,Domestic Shorthair Mix,Transfer,-0.322917,7,0,6,True,Intact,Female,Black/White,False,True,False,False,False,False
9,Owner Surrender,Normal,Cat,5.0,Domestic Shorthair Mix,Transfer,4.774306,17,0,8,True,Neutered,Male,Other,False,False,False,False,True,False


## Training the Model

We decided to use a Decision Tree because ...

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, make_scorer, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split

# Preprocess test data (same function you used earlier)
test_df = preprocess(test_df)

# Separate full train features and labels
X = train_df.drop(columns=["Outcome Type"])
y = train_df["Outcome Type"]

# Preprocess features
categorical_cols = ["Intake Type", "Intake Condition", "Animal Type", "Breed", "Sex", "Sex Type", "Color Binned"]
X = pd.get_dummies(X, columns=categorical_cols)
X_test = pd.get_dummies(test_df, columns=categorical_cols)

# Align columns
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

# Split into train and validation for evaluation
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Define parameter grid
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Define scoring metrics
scoring = {
    'accuracy': 'accuracy',
    'precision_macro': make_scorer(precision_score, average='macro', zero_division=0),
    'recall_macro': make_scorer(recall_score, average='macro', zero_division=0)
}

# Grid search
grid_search = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_grid,
    scoring=scoring,
    refit='accuracy',
    cv=5,
    return_train_score=True
)

grid_search.fit(X_train, y_train)

# Best parameters and accuracy
print("Best Parameters:", grid_search.best_params_)
print("Best Accuracy Score (CV):", grid_search.best_score_)

# Evaluate on validation set
best_model = grid_search.best_estimator_
y_val_pred = best_model.predict(X_val)

print("\nValidation Set Classification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

# Predict on test set (no labels)
y_pred = best_model.predict(X_test)


  df['Intake Time'] = pd.to_datetime(df['Intake Time'])


Best Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5}
Best Accuracy Score (CV): 0.7538262580826539

Validation Set Classification Report:
                 precision    recall  f1-score   support

       Adoption       0.74      0.94      0.83     11009
           Died       0.27      0.04      0.07       208
     Euthanasia       0.56      0.32      0.41       690
Return to Owner       0.80      0.73      0.76      3320
       Transfer       0.78      0.54      0.64      7005

       accuracy                           0.75     22232
      macro avg       0.63      0.51      0.54     22232
   weighted avg       0.75      0.75      0.74     22232


Predictions on test set:
['Return to Owner' 'Euthanasia' 'Return to Owner' 'Return to Owner'
 'Euthanasia' 'Return to Owner' 'Euthanasia' 'Return to Owner' 'Transfer'
 'Transfer']


### Generate CSV File

In [None]:
# Generate sequential IDs (1-based indexing)
ids = np.arange(1, len(y_pred) + 1)

# Create the DataFrame
submission_df = pd.DataFrame({
    'Id': ids,
    'Outcome': y_pred
})

# Save to CSV
submission_df.to_csv('submission.csv', index=False)