In [27]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_val_predict
from sklearn.metrics import classification_report


1. Drop Name & Outcome time
2. Convert Age upon intake to single number (in weeks)
3. Split sex upon intake to intact/spayed/neutered and the actual gender (male/female)

- Could build a different model for cats and for dogs

In [28]:
def convert_to_weeks(value):
    value = value.lower()  # Make it case-insensitive
    if 'week' in value:
        # Extract number of weeks
        return int(value.split()[0])
    elif 'month' in value:
        # Convert months to weeks (1 month ≈ 4.345 weeks)
        return int(value.split()[0]) * 4.345
    elif 'year' in value:
        # Convert years to weeks (1 year ≈ 52.1775 weeks)
        return int(value.split()[0]) * 52.1775
    elif 'day' in value:
        # Convert days to 0 weeks
        return 0
    return 0  # In case of unexpected values

def extract_month_year(df, column='Intake Time'):
    """
    Convert the specified datetime column in the DataFrame to two new columns: Month and Year.

    Parameters:
        df (pd.DataFrame): Input DataFrame containing the datetime column.
        column (str): Name of the column to convert (default is 'Intake Time').

    Returns:
        pd.DataFrame: The original DataFrame with additional 'Month' and 'Year' columns.
    """
    # Convert the column to datetime objects. Adjust the format if needed.
    df[column] = pd.to_datetime(df[column], format='%m/%d/%Y %I:%M:%S %p', errors='coerce')

    # Extract the month and year from the datetime column
    df['Intake Month'] = df[column].dt.month
    df['Intake Year'] = df[column].dt.year

    return df

In [29]:
def assign_breed_frequency(df, column='Breed'):
    # Calculate the frequency of each breed
    breed_counts = df[column].value_counts()

    # Assign the actual frequency to the 'Breed_Popularity' column
    df[column] = df[column].map(breed_counts)

    return df

In [30]:
def process_data(data):    
    data = data.drop(columns=['Id', 'Name', 'Outcome Time'])
    data = data.drop(columns=['Found Location', 'Date of Birth'])
    # data = data.drop(columns=['Breed', 'Color'])
    data = data.dropna()
    print(data.columns)
    data = pd.get_dummies(data, columns=['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake', 'Breed', 'Color'])
    data['Age upon Intake'] = data['Age upon Intake'].apply(convert_to_weeks)
    data = extract_month_year(data, column='Intake Time')
    data = data.drop(columns=['Intake Time'])
    print(data.columns)
    data.head()
    return data

In [31]:
def process_test_data(data):    
    data = data.drop(columns=['Id'])
    data = data.drop(columns=['Found Location', 'Date of Birth'])
    data = data.drop(columns=['Breed', 'Color'])
    data = data.dropna()
    print(data.columns)
    data = pd.get_dummies(data, columns=['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake'])
    data['Age upon Intake'] = data['Age upon Intake'].apply(convert_to_weeks)
    data = extract_month_year(data, column='Intake Time')
    data = data.drop(columns=['Intake Time'])
    print(data.columns)
    data.head()
    return data

In [32]:
train_data = pd.read_csv('train.csv', header=0)
train_data = process_data(train_data)
train_data.head()

Index(['Intake Time', 'Intake Type', 'Intake Condition', 'Animal Type',
       'Sex upon Intake', 'Age upon Intake', 'Breed', 'Color', 'Outcome Type'],
      dtype='object')
Index(['Age upon Intake', 'Outcome Type', 'Intake Condition_Aged',
       'Intake Condition_Agonal', 'Intake Condition_Behavior',
       'Intake Condition_Congenital', 'Intake Condition_Feral',
       'Intake Condition_Injured', 'Intake Condition_Med Attn',
       'Intake Condition_Med Urgent',
       ...
       'Color_Yellow/Brown', 'Color_Yellow/Cream', 'Color_Yellow/Gray',
       'Color_Yellow/Orange', 'Color_Yellow/Orange Tabby', 'Color_Yellow/Tan',
       'Color_Yellow/White', 'Color_Yellow/Yellow', 'Intake Month',
       'Intake Year'],
      dtype='object', length=3044)


Unnamed: 0,Age upon Intake,Outcome Type,Intake Condition_Aged,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Congenital,Intake Condition_Feral,Intake Condition_Injured,Intake Condition_Med Attn,Intake Condition_Med Urgent,...,Color_Yellow/Brown,Color_Yellow/Cream,Color_Yellow/Gray,Color_Yellow/Orange,Color_Yellow/Orange Tabby,Color_Yellow/Tan,Color_Yellow/White,Color_Yellow/Yellow,Intake Month,Intake Year
0,417.42,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,7,2015
1,47.795,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2016
2,104.355,Transfer,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,5,2022
3,104.355,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,2,2017
4,313.065,Return to Owner,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,4,2019


In [33]:
test_data = pd.read_csv('test.csv', header=0)
test_data.head()
test_data = process_test_data(test_data)
test_data.head()

Index(['Intake Time', 'Intake Type', 'Intake Condition', 'Animal Type',
       'Sex upon Intake', 'Age upon Intake'],
      dtype='object')
Index(['Age upon Intake', 'Intake Condition_Aged', 'Intake Condition_Agonal',
       'Intake Condition_Behavior', 'Intake Condition_Feral',
       'Intake Condition_Injured', 'Intake Condition_Med Attn',
       'Intake Condition_Med Urgent', 'Intake Condition_Medical',
       'Intake Condition_Neonatal', 'Intake Condition_Normal',
       'Intake Condition_Nursing', 'Intake Condition_Other',
       'Intake Condition_Panleuk', 'Intake Condition_Parvo',
       'Intake Condition_Pregnant', 'Intake Condition_Sick',
       'Intake Condition_Space', 'Intake Condition_Unknown',
       'Intake Type_Abandoned', 'Intake Type_Euthanasia Request',
       'Intake Type_Owner Surrender', 'Intake Type_Public Assist',
       'Intake Type_Stray', 'Animal Type_Cat', 'Animal Type_Dog',
       'Sex upon Intake_Intact Female', 'Sex upon Intake_Intact Male',
       'Sex u

Unnamed: 0,Age upon Intake,Intake Condition_Aged,Intake Condition_Agonal,Intake Condition_Behavior,Intake Condition_Feral,Intake Condition_Injured,Intake Condition_Med Attn,Intake Condition_Med Urgent,Intake Condition_Medical,Intake Condition_Neonatal,...,Intake Type_Stray,Animal Type_Cat,Animal Type_Dog,Sex upon Intake_Intact Female,Sex upon Intake_Intact Male,Sex upon Intake_Neutered Male,Sex upon Intake_Spayed Female,Sex upon Intake_Unknown,Intake Month,Intake Year
0,104.355,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,True,False,False,,
1,4.0,False,False,False,False,False,False,False,False,False,...,True,True,False,True,False,False,False,False,,
2,208.71,False,False,False,False,False,False,False,False,False,...,True,False,True,False,False,True,False,False,,
3,21.725,False,False,False,False,False,False,False,False,False,...,True,False,True,True,False,False,False,False,,
4,104.355,False,False,False,False,True,False,False,False,False,...,True,True,False,True,False,False,False,False,,


# KNN

In [34]:
train_x = train_data.drop('Outcome Type', axis=1)
train_y = train_data['Outcome Type']

In [35]:
scaler = StandardScaler()
pca = PCA()
knn = KNeighborsClassifier(7)
steps = [('scaler', scaler), ('pca', pca), ('knn', knn)]
pipe = Pipeline(steps)
accuracy = cross_val_score(pipe, train_x, train_y, cv=5, scoring='accuracy')
print(f"Average Accuracy: {np.mean(accuracy)}")

Average Accuracy: 0.5604246322702532


In [None]:
param_grid = {
    'pca__n_components': list(range(5, 10)),
    'knn__n_neighbors': list(range(1, 26))
}
grid_search = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')
grid_search.fit(train_x, train_y)

print(f"best params: {grid_search.best_params_}")
print(f"best score: {grid_search.best_score_}")

In [None]:
scaler = StandardScaler()
scaled = scaler.fit_transform(train_x)
pca = PCA(n_components=5)  # Reducing to 14 components
pca = pca.fit_transform(scaled)  # Apply PCA transformation

knn = KNeighborsClassifier(n_neighbors=14)
knn.fit(pca, train_y)

In [None]:
test_data.isna().count()

Age upon Intake                   27791
Intake Condition_Aged             27791
Intake Condition_Agonal           27791
Intake Condition_Behavior         27791
Intake Condition_Feral            27791
Intake Condition_Injured          27791
Intake Condition_Med Attn         27791
Intake Condition_Med Urgent       27791
Intake Condition_Medical          27791
Intake Condition_Neonatal         27791
Intake Condition_Normal           27791
Intake Condition_Nursing          27791
Intake Condition_Other            27791
Intake Condition_Panleuk          27791
Intake Condition_Parvo            27791
Intake Condition_Pregnant         27791
Intake Condition_Sick             27791
Intake Condition_Space            27791
Intake Condition_Unknown          27791
Intake Type_Abandoned             27791
Intake Type_Euthanasia Request    27791
Intake Type_Owner Surrender       27791
Intake Type_Public Assist         27791
Intake Type_Stray                 27791
Animal Type_Cat                   27791


In [None]:
scaled_test = scaler.fit_transform(test_data)
pca_test = PCA(n_components=5)  # Reducing to 14 components
pca_test = pca_test.fit_transform(scaled_test)  # Apply PCA transformation
predictions = knn.predict(pca_test)
predictions.head()

In [None]:
df = pd.DataFrame({
    "Id": np.arange(1, len(predictions) + 1),
    "Outcome Type": predictions
})
df.head(10)

In [None]:
df.to_csv("predictions.csv", index=False)