Change: Use balanced_accuracy_score and added balanced class weight and scoring 


In [10]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report


1. Drop Name & Outcome time
2. Convert Age upon intake to single number (in weeks)
3. Split sex upon intake to intact/spayed/neutered and the actual gender (male/female)

- Could build a different model for cats and for dogs

In [None]:
def convert_to_weeks(value):
    value = value.lower()  # Make it case-insensitive
    if 'week' in value:
        # Extract number of weeks
        return int(value.split()[0])
    elif 'month' in value:
        # Convert months to weeks (1 month ≈ 4.345 weeks)
        return int(value.split()[0]) * 4.345
    elif 'year' in value:
        # Convert years to weeks (1 year ≈ 52.1775 weeks)
        return int(value.split()[0]) * 52.1775
    elif 'day' in value:
        # Convert days to 0 weeks
        return 0
    return 0  # In case of unexpected values

def extract_month_year(df, column='Intake Time'):
    """
    Convert the specified datetime column in the DataFrame to two new columns: Month and Year.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing the datetime column.
        column (str): Name of the column to convert (default is 'Intake Time').

    Returns:
        pd.DataFrame: The original DataFrame with additional 'Month' and 'Year' columns.
    """
    # Convert the column to datetime objects. Adjust the format if needed.
    df[column] = pd.to_datetime(df[column], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

    # Extract the month and year from the datetime column
    df['Intake Month'] = df[column].dt.month
    df['Intake Year'] = df[column].dt.year

    return df
    


In [None]:
def process_data(data):    
    data = data.drop(columns=['Id', 'Name', 'Outcome Time'])
    data = data.drop(columns=['Date of Birth'])
    data = data.drop(columns=['Found Location'])

    data = data.dropna()

    data = pd.get_dummies(data, columns=['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake', 'Breed', 'Color', "City"])
    data['Age upon Intake'] = data['Age upon Intake'].apply(convert_to_weeks)
    data = extract_month_year(data, column='Intake Time')
    data = data.drop(columns=['Intake Time'])
    print(data.columns)
    # data.head()
    return data

In [None]:
def process_test_data(data):    
    data = data.drop(columns=['Id'])
    data = data.drop(columns=['Date of Birth'])
    data = data.drop(columns=['Found Location'])

    data = data.dropna()

    data = pd.get_dummies(data, columns=['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake', 'Breed', 'Color', 'City'])
    data['Age upon Intake'] = data['Age upon Intake'].apply(convert_to_weeks)
    data = extract_month_year(data, column='Intake Time')
    data = data.drop(columns=['Intake Time'])
    print(data.columns)
    data.head()
    return data

In [14]:
train_data = pd.read_csv('train.csv', header=0)
train_data = process_data(train_data)
train_data.head()

Index(['Age upon Intake', 'Outcome Type', 'Intake Condition_Aged',
       'Intake Condition_Agonal', 'Intake Condition_Behavior',
       'Intake Condition_Congenital', 'Intake Condition_Feral',
       'Intake Condition_Injured', 'Intake Condition_Med Attn',
       'Intake Condition_Med Urgent',
       ...
       'City_Round Rock', 'City_San Leanna', 'City_Sunset Valley',
       'City_Travis', 'City_Village Ot Hill', 'City_Webberville',
       'City_West Lake Hills', 'City_Williamson', 'Intake Month',
       'Intake Year'],
      dtype='object', length=3073)


Unnamed: 0,Age upon Intake,Outcome Type,Intake Condition_Aged,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Congenital,Intake Condition_Feral,Intake Condition_Injured,Intake Condition_Med Attn,Intake Condition_Med Urgent,...,City_Round Rock,City_San Leanna,City_Sunset Valley,City_Travis,City_Village Ot Hill,City_Webberville,City_West Lake Hills,City_Williamson,Intake Month,Intake Year
0,417.42,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,7,2015
1,47.795,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2016
2,104.355,Transfer,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,5,2022
3,104.355,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2,2017
4,313.065,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2019


In [15]:
test_data = pd.read_csv('test.csv', header=0)
test_data = process_test_data(test_data)
test_data = test_data.reindex(columns=train_data.columns, fill_value=0)
test_data = test_data.drop(columns=['Outcome Type'])
test_data.head()

Index(['Age upon Intake', 'Intake Condition_Aged', 'Intake Condition_Agonal',
       'Intake Condition_Behavior', 'Intake Condition_Feral',
       'Intake Condition_Injured', 'Intake Condition_Med Attn',
       'Intake Condition_Med Urgent', 'Intake Condition_Medical',
       'Intake Condition_Neonatal',
       ...
       'City_Point Venture', 'City_Rollingwood', 'City_Round Rock',
       'City_Sunset Valley', 'City_Travis', 'City_Webberville',
       'City_West Lake Hills', 'City_Williamson', 'Intake Month',
       'Intake Year'],
      dtype='object', length=1708)


Unnamed: 0,Age upon Intake,Intake Condition_Aged,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Congenital,Intake Condition_Feral,Intake Condition_Injured,Intake Condition_Med Attn,Intake Condition_Med Urgent,Intake Condition_Medical,...,City_Round Rock,City_San Leanna,City_Sunset Valley,City_Travis,City_Village Ot Hill,City_Webberville,City_West Lake Hills,City_Williamson,Intake Month,Intake Year
0,104.355,False,False,False,0,False,False,False,False,False,...,False,0,False,False,0,False,False,False,,
1,4.0,False,False,False,0,False,False,False,False,False,...,False,0,False,False,0,False,False,False,,
2,208.71,False,False,False,0,False,False,False,False,False,...,False,0,False,False,0,False,False,False,,
3,21.725,False,False,False,0,False,False,False,False,False,...,False,0,False,False,0,False,False,False,,
4,104.355,False,False,False,0,False,True,False,False,False,...,False,0,False,False,0,False,False,False,,


# Random Forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score, make_scorer

train_x = train_data.drop('Outcome Type', axis=1)
train_y = train_data['Outcome Type']
weight_class = {}
forest = RandomForestClassifier(n_estimators=125, min_samples_leaf=150, max_features=0.55, class_weight="balanced")


In [17]:
train_x.head()

Unnamed: 0,Age upon Intake,Intake Condition_Aged,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Congenital,Intake Condition_Feral,Intake Condition_Injured,Intake Condition_Med Attn,Intake Condition_Med Urgent,Intake Condition_Medical,...,City_Round Rock,City_San Leanna,City_Sunset Valley,City_Travis,City_Village Ot Hill,City_Webberville,City_West Lake Hills,City_Williamson,Intake Month,Intake Year
0,417.42,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,7,2015
1,47.795,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2016
2,104.355,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,5,2022
3,104.355,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2,2017
4,313.065,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2019


In [None]:
accuracy = cross_val_score(forest, train_x, train_y, cv=5, scoring=make_scorer(balanced_accuracy_score))
print(accuracy)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

In [None]:
x_train, x_holdout, y_train, y_holdout = train_test_split(train_x, train_y, test_size=0.15)
test_model = forest.fit(x_train, y_train)

In [None]:
test_predictions = forest.predict(x_holdout)
test_accuracy = balanced_accuracy_score(y_holdout, test_predictions)
print(test_accuracy)

0.5358756302576924


In [None]:
model = forest.fit(train_x, train_y)

In [None]:
predictions = forest.predict(test_data)

In [None]:
df = pd.DataFrame({
    "Id": np.arange(1, len(predictions) + 1),
    "Outcome Type": predictions
})
df.head(10)

Unnamed: 0,Id,Outcome Type
0,1,Return to Owner
1,2,Died
2,3,Return to Owner
3,4,Adoption
4,5,Euthanasia
5,6,Euthanasia
6,7,Adoption
7,8,Return to Owner
8,9,Died
9,10,Transfer


In [None]:
print(df['Outcome Type'].value_counts())

Outcome Type
Adoption           10237
Return to Owner     7690
Transfer            3997
Died                3653
Euthanasia          2214
Name: count, dtype: int64


In [None]:

df.to_csv("predictions.csv", index=False)
