In [39]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

In [40]:
# Load the full dataset
df = pd.read_csv('train.csv')

# 1. Handle missing values
# Drop rows with missing Outcome Type (since it's our target)
df.dropna(subset=['Outcome Type'], inplace=True)

# Fill missing Name and Breed with "Unknown"
df['Name'] = df['Name'].fillna('Unknown')
df['Breed'] = df['Breed'].fillna('Unknown')

# 2. Convert 'Age upon Intake' to numerical value (in years)
def process_age(age_str):
    # Check if the value is a string, otherwise return NaN
    if not isinstance(age_str, str):
        return np.nan
    if 'year' in age_str:
        return int(age_str.split()[0])
    elif 'month' in age_str:
        return int(age_str.split()[0]) / 12
    elif 'week' in age_str:
        return int(age_str.split()[0]) / 52
    elif 'day' in age_str:
        return int(age_str.split()[0]) / 365
    return np.nan

df['Age upon Intake'] = df['Age upon Intake'].apply(process_age)

# Fill missing 'Age upon Intake' values with the median age
df['Age upon Intake'] = df['Age upon Intake'].fillna(df['Age upon Intake'].median())

# Process Outcome Type as string and then map
df['Outcome Type'] = df['Outcome Type'].fillna('').astype(str).str.lower().str.strip()
outcome_mapping = {
    'adoption': 1, 
    'transfer': 2, 
    'return to owner': 3, 
    'euthanasia': 4, 
    'died': 5
}
df['Outcome Type'] = df['Outcome Type'].map(outcome_mapping)
df.dropna(subset=['Outcome Type'], inplace=True)

# 3. Convert date columns to datetime format
df['Intake Time'] = pd.to_datetime(df['Intake Time'], errors='coerce')
df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')

# 4. Handle categorical columns (standardizing text)
df['Intake Type'] = df['Intake Type'].str.lower().str.replace(' ', '_')
df['Sex upon Intake'] = df['Sex upon Intake'].str.lower().str.replace(' ', '_')

# 5. Encode 'Outcome Type' as numbers (for modeling)
outcome_mapping = {
    'adoption': 1, 
    'transfer': 2, 
    'return_to_owner': 3, 
    'euthanasia': 4, 
    'died': 5
}

# 6. Drop irrelevant columns
df.drop(['Id', 'Found Location', 'Date of Birth'], axis=1, inplace=True)

# 7. Drop duplicates
df.drop_duplicates(inplace=True)

# 8. Check for remaining missing values
missing_values = df.isnull().sum()

print(df.head())
print("\nMissing Values After Cleaning:")
print(missing_values)


  df['Outcome Time'] = pd.to_datetime(df['Outcome Time'], errors='coerce')


           Name         Intake Time      Intake Type Intake Condition  \
0         Belle 2015-07-05 12:59:00            stray           Normal   
1       Runster 2016-04-14 18:43:00            stray           Normal   
2  Johnny Ringo 2022-05-12 00:23:00    public_assist           Normal   
3          Odin 2017-02-18 12:46:00  owner_surrender           Normal   
4       Beowulf 2019-04-16 09:53:00    public_assist           Normal   

  Animal Type Sex upon Intake  Age upon Intake                     Breed  \
0         Dog   spayed_female         8.000000  English Springer Spaniel   
1         Dog     intact_male         0.916667               Basenji Mix   
2         Cat   neutered_male         2.000000        Domestic Shorthair   
3         Dog   neutered_male         2.000000    Labrador Retriever Mix   
4         Dog   neutered_male         6.000000            Great Dane Mix   

          Color        Outcome Time  Outcome Type  
0   White/Liver 2015-07-05 15:13:00             3  


In [41]:
# Feature Engineering cell
# Assuming df is your cleaned DataFrame from the previous steps

# 1. Duration of Stay (in days)
# This feature captures how long an animal stayed in the shelter by calculating the difference 
# between Outcome Time and Intake Time.
df['stay_duration'] = (df['Outcome Time'] - df['Intake Time']).dt.total_seconds() / (3600 * 24)

# 2. Extract Date-Time Components from Intake Time
# These features let your model pick up time-based patterns, for example, if intake month or weekday affects outcomes.
df['intake_month'] = df['Intake Time'].dt.month
df['intake_weekday'] = df['Intake Time'].dt.weekday  # Monday=0, Sunday=6
df['intake_hour'] = df['Intake Time'].dt.hour

# 3. Binary Indicator for Name Availability
# Some animals may have no name (filled with "Unknown" during cleaning), and this might carry predictive information.
df['has_name'] = (df['Name'] != 'Unknown').astype(int)

# 4. Indicator for Mixed Breed
# We derive a feature indicating whether an animal is a mixed breed by looking for "Mix" in the Breed column.
df['is_mixed_breed'] = df['Breed'].str.contains('Mix', case=False, na=False).astype(int)

# 5. Number of Colors
# Many entries in the Color column have multiple colors separated by "/".
# Counting the parts can give an indication of the color diversity, which might be useful.
df['num_colors'] = df['Color'].apply(lambda x: len(x.split('/')) if isinstance(x, str) else 0)

# 6. Age Category
# Converting the numerical age into categories (e.g., infant, young, adult, senior) can help the model capture non-linear effects.
def age_category(age):
    if age < 1:
        return 'infant'
    elif age < 3:
        return 'young'
    elif age < 7:
        return 'adult'
    else:
        return 'senior'

df['age_category'] = df['Age upon Intake'].apply(age_category)

# 7. Splitting 'Sex upon Intake' into two separate features:
# One for gender and one for reproductive status.
def extract_gender(sex_str):
    # Assumes values formatted like "spayed_female" or "intact_male"
    if isinstance(sex_str, str) and '_' in sex_str:
        return sex_str.split('_')[-1]  # 'female' or 'male'
    return sex_str

def extract_repro_status(sex_str):
    if isinstance(sex_str, str) and '_' in sex_str:
        return sex_str.split('_')[0]  # e.g., 'spayed', 'intact', 'neutered'
    return sex_str

df['gender'] = df['Sex upon Intake'].apply(extract_gender)
df['reproductive_status'] = df['Sex upon Intake'].apply(extract_repro_status)

# (Optional) Check the newly engineered features:
features_to_check = ['stay_duration', 'intake_month', 'intake_weekday', 'intake_hour',
                     'has_name', 'is_mixed_breed', 'num_colors', 'age_category', 'gender', 'reproductive_status']
print(df[features_to_check].head())


   stay_duration  intake_month  intake_weekday  intake_hour  has_name  \
0       0.093056             7               6           12         1   
1       6.940278             4               3           18         1   
2       0.591667             5               3            0         1   
3       3.206944             2               5           12         1   
4       2.161111             4               1            9         1   

   is_mixed_breed  num_colors age_category  gender reproductive_status  
0               0           2       senior  female              spayed  
1               1           2       infant    male              intact  
2               0           1        young    male            neutered  
3               1           1        young    male            neutered  
4               1           1        adult    male            neutered  


In [None]:
# ... (Assuming the previous data cleaning and feature engineering code is in place)

# Define feature columns and target variable:
features = [
    'Age upon Intake', 'stay_duration', 'intake_month', 'intake_weekday',
    'intake_hour', 'has_name', 'is_mixed_breed', 'num_colors',
    'age_category', 'gender', 'reproductive_status', 'Intake Type',
    'Intake Condition', 'Animal Type'
]
target = 'Outcome Type'

X = df[features]
y = df[target]

# Define numeric and categorical feature lists:
numeric_features = [
    'Age upon Intake', 'stay_duration', 'intake_month', 'intake_weekday',
    'intake_hour', 'has_name', 'is_mixed_breed', 'num_colors'
]
categorical_features = [
    'age_category', 'gender', 'reproductive_status', 'Intake Type',
    'Intake Condition', 'Animal Type'
]

# Import necessary scikit-learn modules
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build a preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Define classifiers
knn = KNeighborsClassifier(n_neighbors=5)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, random_state=42)

# Build a stacking classifier
stack = StackingClassifier(
    estimators=[
        ('knn', knn),
        ('rf', rf),
        ('gb', gb)
    ],
    final_estimator=LogisticRegression(max_iter=1000),
    cv=5
)

# Create pipelines for each model
pipeline_knn = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', knn)])
pipeline_rf = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', rf)])
pipeline_gb = Pipeline(steps=[('preprocessor', preprocessor),
                                ('classifier', gb)])
pipeline_stack = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('classifier', stack)])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit and evaluate KNN classifier
pipeline_knn.fit(X_train, y_train)
y_pred_knn = pipeline_knn.predict(X_test)
print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))

# Fit and evaluate Random Forest classifier
pipeline_rf.fit(X_train, y_train)
y_pred_rf = pipeline_rf.predict(X_test)
print("Random Forest Classification Report:")
print(classification_report(y_test, y_pred_rf))

# Fit and evaluate Gradient Boosting classifier
pipeline_gb.fit(X_train, y_train)
y_pred_gb = pipeline_gb.predict(X_test)
print("Gradient Boosting Classification Report:")
print(classification_report(y_test, y_pred_gb))

# Fit and evaluate Stacking classifier
pipeline_stack.fit(X_train, y_train)
y_pred_stack = pipeline_stack.predict(X_test)
print("Stacking Classifier Classification Report:")
print(classification_report(y_test, y_pred_stack))


KNN Classification Report:
              precision    recall  f1-score   support

           1       0.65      0.79      0.72     10909
           2       0.59      0.51      0.55      6778
           3       0.60      0.49      0.54      3353
           4       0.43      0.14      0.21       701
           5       0.45      0.02      0.04       216

    accuracy                           0.63     21957
   macro avg       0.55      0.39      0.41     21957
weighted avg       0.62      0.63      0.61     21957

Random Forest Classification Report:
              precision    recall  f1-score   support

           1       0.77      0.91      0.84     10909
           2       0.76      0.62      0.69      6778
           3       0.83      0.75      0.79      3353
           4       0.65      0.34      0.45       701
           5       0.43      0.06      0.11       216

    accuracy                           0.77     21957
   macro avg       0.69      0.54      0.57     21957
weighted avg 