In [28]:
# Ethan Mason, Matthew Little
# em45486, ?

In [29]:
import pandas as pd
import numpy as np
import sklearn as sk
import re
import warnings
import torch
from torch.utils.data import TensorDataset, DataLoader
# Suppress FutureWarnings
warnings.simplefilter(action='ignore', category=FutureWarning)

train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

test_df.head(10)


Unnamed: 0,Id,Intake Time,Found Location,Intake Type,Intake Condition,Animal Type,Sex upon Intake,Age upon Intake,Breed,Color,Date of Birth
0,1,1/3/19 16:19,2501 Magin Meadow Dr in Austin (TX),Stray,Normal,Dog,Neutered Male,2 years,Beagle Mix,Tricolor,1/3/17
1,2,10/21/13 7:59,Austin (TX),Stray,Sick,Cat,Intact Female,4 weeks,Domestic Shorthair Mix,Calico,9/21/13
2,3,6/29/14 10:38,800 Grove Blvd in Austin (TX),Stray,Normal,Dog,Neutered Male,4 years,Doberman Pinsch/Australian Cattle Dog,Tan/Gray,6/29/10
3,4,7/11/15 18:19,Galilee Court And Damita Jo Dr in Manor (TX),Stray,Normal,Dog,Intact Female,5 months,Pit Bull,Brown/White,1/11/15
4,5,2/4/17 10:10,208 Beaver St in Austin (TX),Stray,Injured,Cat,Intact Female,2 years,Domestic Shorthair Mix,Black/White,2/4/15
5,6,3/30/22 9:27,512 Bowery Trl in Austin (TX),Stray,Injured,Dog,Neutered Male,4 years,Beagle,Tricolor,9/3/17
6,7,4/2/21 11:16,Austin (TX),Owner Surrender,Normal,Cat,Intact Female,1 month,Domestic Shorthair Mix,Tortie,2/2/21
7,8,2/21/23 15:26,Ih 35 And Hwy 71 in Austin (TX),Stray,Normal,Dog,Intact Male,2 years,Siberian Husky,Gray/White,10/28/20
8,9,6/29/15 17:04,5306 Peppertree in Austin (TX),Stray,Normal,Cat,Unknown,4 days,Domestic Shorthair Mix,Black,6/25/15
9,10,5/5/15 7:29,4434 Frontier Trl in Austin (TX),Stray,Nursing,Cat,Unknown,2 weeks,Domestic Shorthair Mix,Gray Tabby,4/20/15


## Data Preprocessing

### 🧼 Preprocessing Summary

The following preprocessing steps were applied to prepare the dataset for modeling:

- **Age Conversion**: Converted `Age upon Intake` strings to months using regex, filled missing values with the mean, and renamed the column.

- **Length of Stay**: Calculated `Length of Stay (Days)` as the difference between `Outcome Time` and `Intake Time`, then dropped `Outcome Time`.

- **Datetime Features**: Extracted `Intake Hour`, `Intake DayOfWeek`, and `Intake Month` from `Intake Time`, then dropped the original column.

- **Location Simplification**: Created a boolean feature `Found In Austin` from the `Found Location` string, and dropped the original column.

- **Sex Encoding**: Extracted two features from `Sex upon Intake`: `Sex` (e.g., Intact, Spayed) and `Sex Type` (Male/Female), then dropped the original column.

- **Color Binning**: Kept the top 5 most frequent colors, grouped all others as `"Other"`, one-hot encoded the result, and dropped the original `Color` column.


In [30]:
# Ignore name because that won't affect outcome
train_df = pd.concat([train_df.iloc[:, :1], train_df.iloc[:, 2:]], axis=1)

# Drop ID because that won't affect outcome
train_df.drop('Id', axis=1, inplace=True)

# Drop Date of Birth column since we already have age
train_df.drop('Date of Birth', axis=1, inplace=True)

# Fill the 'Sex upon Intake' NaNs with the mode
train_df['Sex upon Intake'].fillna(train_df['Sex upon Intake'].mode()[0], inplace=True)

# Function to convert age to months
def convert_to_months(age):
    if pd.isna(age):
        return np.nan
    
    # Extract the number and unit using regex
    match = re.match(r'(\d+)\s*(years?|months?|weeks?)', age, re.IGNORECASE)

    if match:
        value = float(match.group(1))
        unit = match.group(2).lower()

        if 'year' in unit:
            return value * 12
        elif 'week' in unit:
            return value / 4.345 # Approx. weeks per month
        else:
            return value
        
    return np.nan

# Calculte how long the animal was with the shelter
def calculate_length_of_stay(df):
    df["Intake Time"] = pd.to_datetime(df["Intake Time"])
    df["Outcome Time"] = pd.to_datetime(df["Outcome Time"])

    df["Length of Stay (Days)"] = (df["Outcome Time"] - df["Intake Time"]).dt.total_seconds() / (60 * 60 * 24)

    df.drop('Outcome Time', axis=1, inplace=True)
    
    return df

# Function to Parse Intake Time
def process_intake_time(df):
    df['Intake Time'] = pd.to_datetime(df['Intake Time'])
    df['Intake Hour'] = df['Intake Time'].dt.hour
    df['Intake DayOfWeek'] = df['Intake Time'].dt.dayofweek
    df['Intake Month'] = df['Intake Time'].dt.month
    df.drop('Intake Time', axis=1, inplace=True)

    return df

def assign_quadrant(location):
    loc = location.lower()
    
    if "austin" not in loc:
        return "Outside Austin"
    
    # Rough heuristic-based quadrant rules
    north = any(word in loc for word in ["north", "n ", "braker", "research"])
    south = any(word in loc for word in ["south", "s ", "slaughter", "william cannon"])
    east  = any(word in loc for word in ["east", "e ", "airport", "springdale"])
    west  = any(word in loc for word in ["west", "w ", "mopac", "bee caves"])

    # Decision logic (priority-based)
    if north and east:
        return "NE"
    elif north and west:
        return "NW"
    elif south and east:
        return "SE"
    elif south and west:
        return "SW"
    elif north:
        return "North"
    elif south:
        return "South"
    elif east:
        return "East"
    elif west:
        return "West"
    else:
        return "Central Austin"


# One-hot Encode the 'Sex upon Intake' column
def encode_sex_upon_intake(df):
    df['Sex'] = df['Sex upon Intake'].str.extract(r'(Intact|Spayed|Neutered)')
    df['Sex Type'] = df['Sex upon Intake'].str.extract(r'(Male|Female)')
    df.drop('Sex upon Intake', axis=1, inplace=True)
    return df

# Bin the top n colors in order to reduce cardinality of the 'Color' column
def bin_top_colors(df, top_n=5):
    # Get the top N most common colors
    top_colors = df["Color"].value_counts().nlargest(top_n).index

    # Create a new column with binned color values
    df["Color Binned"] = df["Color"].apply(lambda c: c if c in top_colors else "Other")

    # Drop the old 'Color' column
    df.drop('Color', axis=1, inplace=True)

    # One-hot encode the new binned color values
    color_dummies = pd.get_dummies(df["Color Binned"], prefix="Color")
    df = pd.concat([df, color_dummies], axis=1)
    
    return df


def preprocess(df):
    df['Age upon Intake'] = df['Age upon Intake'].apply(convert_to_months)
    df["Location Quadrant"] = df["Found Location"].apply(assign_quadrant)
    df = pd.get_dummies(df, columns=["Location Quadrant"])
    df.drop("Found Location", axis=1, inplace=True)

    
    # Compute and apply mean age, if any NaNs remain
    if df['Age upon Intake'].isna().any():
        mean_age = df['Age upon Intake'].mean()
        df['Age upon Intake'].fillna(mean_age, inplace=True)
    
    df.rename(columns={'Age upon Intake': 'Age upon Intake (Months)'}, inplace=True)

    # Only calculate length of stay if Outcome Time is available
    if "Outcome Time" in df.columns:
        df = calculate_length_of_stay(df)

    df = process_intake_time(df)
    df = encode_sex_upon_intake(df)
    df = bin_top_colors(df, top_n=5)

    return df




train_df = preprocess(train_df)


train_df.head(10)

  df["Outcome Time"] = pd.to_datetime(df["Outcome Time"])


Unnamed: 0,Intake Type,Intake Condition,Animal Type,Age upon Intake (Months),Breed,Outcome Type,Location Quadrant_NE,Location Quadrant_NW,Location Quadrant_North,Location Quadrant_Outside Austin,...,Intake Month,Sex,Sex Type,Color Binned,Color_Black,Color_Black/White,Color_Brown Tabby,Color_Brown Tabby/White,Color_Other,Color_White
0,Stray,Normal,Dog,96.0,English Springer Spaniel,Return to Owner,False,False,True,False,...,7,Spayed,Female,Other,False,False,False,False,True,False
1,Stray,Normal,Dog,11.0,Basenji Mix,Return to Owner,False,False,True,False,...,4,Intact,Male,Other,False,False,False,False,True,False
2,Public Assist,Normal,Cat,24.0,Domestic Shorthair,Transfer,True,False,False,False,...,5,Neutered,Male,Other,False,False,False,False,True,False
3,Owner Surrender,Normal,Dog,24.0,Labrador Retriever Mix,Return to Owner,False,False,True,False,...,2,Neutered,Male,Other,False,False,False,False,True,False
4,Public Assist,Normal,Dog,72.0,Great Dane Mix,Return to Owner,True,False,False,False,...,4,Neutered,Male,Black,True,False,False,False,False,False
5,Stray,Normal,Cat,6.0,Domestic Shorthair,Adoption,True,False,False,False,...,10,Intact,Female,Brown Tabby,False,False,True,False,False,False
6,Public Assist,Normal,Dog,24.0,Labrador Retriever Mix,Return to Owner,False,False,True,False,...,7,Intact,Male,Black/White,False,True,False,False,False,False
7,Stray,Normal,Cat,0.920598,Domestic Shorthair,Adoption,False,False,False,True,...,6,Intact,Male,Other,False,False,False,False,True,False
8,Stray,Injured,Cat,0.920598,Domestic Shorthair Mix,Transfer,False,False,True,False,...,6,Intact,Female,Black/White,False,True,False,False,False,False
9,Owner Surrender,Normal,Cat,5.0,Domestic Shorthair Mix,Transfer,False,False,True,False,...,8,Neutered,Male,Other,False,False,False,False,True,False


In [31]:
def refine_features(df):
    import numpy as np
    import pandas as pd

    # --- Simplify Breed ---
    df["Is Mix"] = df["Breed"].str.contains("Mix", case=False, na=False)
    df["Is Domestic"] = df["Breed"].str.contains("Domestic", case=False, na=False)
    df["Is Purebred"] = ~(df["Is Mix"] | df["Is Domestic"])

    # Drop original breed column if not binning top N
    df.drop("Breed", axis=1, inplace=True, errors='ignore')

    # --- Normalize Color (e.g., Black/White == White/Black) ---
    if "Color Binned" not in df.columns and "Color" in df.columns:
        df["Color Normalized"] = df["Color"].fillna("Unknown").apply(
            lambda c: "/".join(sorted(c.split("/"))) if "/" in c else c
        )
        top_colors = df["Color Normalized"].value_counts().nlargest(5).index
        df["Color Binned"] = df["Color Normalized"].apply(lambda x: x if x in top_colors else "Other")
        df.drop(["Color", "Color Normalized"], axis=1, inplace=True)

    # --- Age Group Binning ---
    def age_bin(months):
        if pd.isnull(months):
            return "Unknown"
        if months < 6:
            return "Puppy/Kitten"
        elif months < 24:
            return "Young"
        elif months < 84:
            return "Adult"
        else:
            return "Senior"

    df["Age Group"] = df["Age upon Intake (Months)"].apply(age_bin)

    # --- Intake Time-Based Features ---
    df["Is Weekend"] = df["Intake DayOfWeek"] >= 5
    df["Is Night Intake"] = (df["Intake Hour"] < 6) | (df["Intake Hour"] >= 20)
    df["Is Summer Intake"] = df["Intake Month"].isin([6, 7, 8])

    return df


## Training the Model

Preparing Data for Use as PyTorch Neural Net

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
import torch

# Preprocess test data
test_df = preprocess(test_df)
train_df = refine_features(train_df)
test_df = refine_features(test_df)

# Separate full train features and labels
X = train_df.drop(columns=["Outcome Type"])
y = train_df["Outcome Type"]

# One-hot encode features
categorical_cols = [
    "Intake Type", "Intake Condition", "Animal Type",
    "Color Binned", "Age Group", "Sex", "Sex Type"
]

X = pd.get_dummies(X, columns=categorical_cols)
X_test = pd.get_dummies(test_df, columns=categorical_cols)

# Align feature columns
X, X_test = X.align(X_test, join="left", axis=1, fill_value=0)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into train and validation sets
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, stratify=y_encoded, random_state=42
)

# Standardize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

X_val_tensor = torch.tensor(X_val, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val, dtype=torch.long)

# Create DataLoaders
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)


  df['Intake Time'] = pd.to_datetime(df['Intake Time'])


Defining the Neural Net

In [33]:
import torch.nn as nn
import torch.nn.functional as F

class ShelterNet(nn.Module):
    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.out = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.dropout(F.relu(self.fc2(x)))
        x = self.dropout(F.relu(self.fc3(x)))
        return self.out(x)


In [34]:
from sklearn.utils.class_weight import compute_class_weight
import torch.optim as optim

# Create model
input_dim = X_train.shape[1]
num_classes = len(label_encoder.classes_)
model = ShelterNet(input_dim, num_classes)

raw_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
log_weights = np.log1p(raw_weights)
class_weights_tensor = torch.tensor(log_weights, dtype=torch.float32)

criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = optim.Adam(model.parameters(), lr=0.001)


Training the Neural Net

In [35]:
epochs = 100
train_accuracies = []
val_accuracies = []
epoch_losses = []
best_val_acc = 0
early_stop_counter = 0
patience = 10  # Stop if val accuracy doesn't improve after 10 epochs

for epoch in range(epochs):
    model.train()
    total_loss = 0
    correct_train, total_train = 0, 0

    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        _, preds = torch.max(outputs, 1)
        correct_train += (preds == y_batch).sum().item()
        total_train += y_batch.size(0)

    # Evaluate on validation set
    model.eval()
    correct_val, total_val = 0, 0
    with torch.no_grad():
        for X_batch, y_batch in val_loader:
            outputs = model(X_batch)
            _, preds = torch.max(outputs, 1)
            correct_val += (preds == y_batch).sum().item()
            total_val += y_batch.size(0)

    train_acc = correct_train / total_train
    val_acc = correct_val / total_val
    avg_loss = total_loss / len(train_loader)

    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    epoch_losses.append(avg_loss)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Train Acc: {train_acc:.4f}, Val Acc: {val_acc:.4f}")

    # Early stopping
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        early_stop_counter = 0
        best_model_state = model.state_dict()  # save best model
    else:
        early_stop_counter += 1
        if early_stop_counter >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break
        
# Load best model if early stopping triggered
model.load_state_dict(best_model_state)

Epoch 1/100, Loss: 1.1851, Train Acc: 0.5489, Val Acc: 0.5842
Epoch 2/100, Loss: 1.1184, Train Acc: 0.5966, Val Acc: 0.6130
Epoch 3/100, Loss: 1.0609, Train Acc: 0.6352, Val Acc: 0.6692
Epoch 4/100, Loss: 1.0229, Train Acc: 0.6607, Val Acc: 0.6711
Epoch 5/100, Loss: 1.0010, Train Acc: 0.6727, Val Acc: 0.6936
Epoch 6/100, Loss: 0.9859, Train Acc: 0.6839, Val Acc: 0.6917
Epoch 7/100, Loss: 0.9731, Train Acc: 0.6922, Val Acc: 0.6984
Epoch 8/100, Loss: 0.9593, Train Acc: 0.6968, Val Acc: 0.7054
Epoch 9/100, Loss: 0.9583, Train Acc: 0.6991, Val Acc: 0.7060
Epoch 10/100, Loss: 0.9496, Train Acc: 0.7010, Val Acc: 0.6930
Epoch 11/100, Loss: 0.9454, Train Acc: 0.7017, Val Acc: 0.7074
Epoch 12/100, Loss: 0.9378, Train Acc: 0.7057, Val Acc: 0.7071
Epoch 13/100, Loss: 0.9363, Train Acc: 0.7030, Val Acc: 0.7124
Epoch 14/100, Loss: 0.9330, Train Acc: 0.7042, Val Acc: 0.7151
Epoch 15/100, Loss: 0.9325, Train Acc: 0.7060, Val Acc: 0.7102
Epoch 16/100, Loss: 0.9284, Train Acc: 0.7071, Val Acc: 0.7136
E

<All keys matched successfully>

Evaluate Model

In [None]:
# Scale the test data
X_test = test_df  # cleaned test data
# Align test columns to match training data (X_train_raw is your pre-scaling training features)
X_test, _ = X_test.align(X_train_raw, join='right', axis=1, fill_value=0)

# Scale using the same scaler fitted on training data
X_test_scaled = scaler.transform(X_test)

# Convert to PyTorch tensor
X_test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32)

model.eval()
with torch.no_grad():
    outputs = model(X_test_tensor)
    _, predictions = torch.max(outputs, 1)
    predicted_labels = label_encoder.inverse_transform(predictions.numpy())



def summarize_outcomes(y, label_encoder=None, normalize=False):
    if label_encoder is not None:
        labels = label_encoder.inverse_transform(np.array(y))
    else:
        labels = y

    return pd.Series(labels).value_counts(normalize=normalize)

outcomes = summarize_outcomes(y_train, label_encoder, normalize=True)
print(outcomes)

def summarize_predictions(predictions, label_encoder):
    labels = label_encoder.inverse_transform(predictions)
    value_counts = pd.Series(labels).value_counts(normalize=True)
    return value_counts

predicted_labels = label_encoder.inverse_transform(predictions.numpy())
summary = summarize_predictions(predictions.numpy(), label_encoder)
print(summary)


ValueError: y contains previously unseen labels: ['Died' 'Euthanasia']

### Generate CSV File

In [37]:
# Generate sequential IDs (1-based indexing)
ids = np.arange(1, len(predicted_labels) + 1)

# Create the DataFrame
submission_df = pd.DataFrame({
    'Id': ids,
    'Outcome': predicted_labels
})

# Save to CSV
submission_df.to_csv('submission4.csv', index=False)