# Imports

In [137]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
from sklearn.preprocessing import StandardScaler
from datetime import datetime
pd.options.mode.chained_assignment = None  # default='warn'
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)


# Helpers

In [231]:
def convert_to_weeks(value):
    try:
        value = str(value).lower()
        num = float(value.split()[0])
        if 'week' in value:
            return num
        elif 'month' in value:
            return num * 4.345
        elif 'year' in value:
            return num * 52.1775
        elif 'day' in value:
            return num / 7.0
        else:
            return 0.0
    except Exception:
        return 0.0


def extract_month_year(df, column='Intake Time'):
    # Function to normalize the year and handle date formatting
    
    def normalize_date(date_str):
        # Split the date and time by space (if present)
        split_str = date_str.split(' ')
        date_str = split_str[0]
        # print(split_str[1])
        # date_str = date_str.split(' ')[0]  # Take only the date part before the space
        
        # Split the date string by '/'
        date_parts = date_str.split('/')
        
        # Get the month and year (first and third parts)
        month = int(date_parts[0])  # Remove leading zeros in months
        year = int(date_parts[2])   # Get the year (e.g., 1 -> 2001, 2 -> 2002)
        
        # Normalize the year
        if year < 100:  # Assume year is in 'YY' format (1 -> 2001, 2 -> 2002, etc.)
            year += 2000
            
        return month, year

    # Apply the normalize_date function to the 'Intake Time' column
    df[['Intake Month', 'Intake Year']] = df[column].apply(lambda x: pd.Series(normalize_date(str(x))))
    return df

def add_minutes_since_midnight(df):
    # List to store minutes since midnight
    minutes_since_midnight = []

    for time_str in df['Intake Time']:
        if isinstance(time_str, str) and time_str.strip():  # Check if time_str is a valid non-empty string
            # Split the time into date and time
            parts = time_str.split(' ')
            if len(parts) < 2:
                minutes_since_midnight.append(0)  # For invalid entries that don't have time
                continue
            date_str, time_str = parts[0], parts[1]
            if(len(parts) == 3):
                time_str += ' '
                time_str += parts[2]

            # Handle 12-hour or 24-hour time format and convert to minutes since midnight
            try:
                # Try to handle 12-hour time format with AM/PM (possibly with seconds)
                if 'AM' in time_str or 'PM' in time_str:
                    time_obj = datetime.strptime(time_str, '%I:%M:%S %p')  # 12-hour format with seconds
                else:
                    # Handle 24-hour format with or without seconds
                    time_obj = datetime.strptime(time_str, '%H:%M:%S')  # 24-hour format with seconds
            except ValueError:
                try:
                    # Handle 12-hour format without seconds
                    time_obj = datetime.strptime(time_str, '%I:%M %p')  # 12-hour format without seconds
                except ValueError:
                    # Handle 24-hour format without seconds
                    time_obj = datetime.strptime(time_str, '%H:%M')  # 24-hour format without seconds

            # Convert to minutes since midnight
            minutes = time_obj.hour * 60 + time_obj.minute

            # If it's PM (except for 12 PM), add 720 minutes (12 hours)
            # if 'PM' in time_str and time_obj.hour != 12:
            #     minutes += 720

            minutes_since_midnight.append(minutes)
        else:
            minutes_since_midnight.append(0)  # For empty or invalid entries

    # Add the new column to the DataFrame
    df['Minutes Since Midnight'] = minutes_since_midnight

    return df

def assign_frequency(df, column):
    # Calculate the frequency of each 
    cnt = df[column].value_counts()
    df.loc[:, column] = df[column].map(cnt)
    return df

def extract_city_from_address(df):    
    def extract_city(address):
        # Split the address by whitespace
        tokens = address.split()
        # Find the token before '(TX)'
        if '(TX)' in tokens:
            city_index = tokens.index('(TX)') - 1
            if city_index >= 0:
                return tokens[city_index]
        return 'Unknown'  # If no city found, return 'Unknown'
    df.loc[:, 'City'] = df['Found Location'].apply(extract_city)
    return df
        

def add_age_ratio_feature(df):
    # Calculate the average age upon intake for each animal type
    avg_age_by_type = df.groupby('Animal Type')['Age upon Intake'].mean()

    # Calculate breed frequency (as a fraction)
    breed_freq = df['Breed'].value_counts(normalize=True)

    # Compute breed-specific averages
    avg_age_by_breed = df.groupby('Breed')['Age upon Intake'].mean()

    # Function to get appropriate average age
    def get_age_ratio(row):
        if breed_freq.get(row['Breed'], 0) > 0.01:
            avg_age = avg_age_by_breed.get(row['Breed'], avg_age_by_type[row['Animal Type']])
        else:
            avg_age = avg_age_by_type[row['Animal Type']]
        return row['Age upon Intake'] / avg_age

    # Apply the function to calculate the age ratio for each row
    df.loc[:, 'Age upon Intake Ratio'] = df.apply(get_age_ratio, axis=1)
    return df

def add_is_mixed_breed(df):
    df['Is Mixed'] = df['Breed'].str.contains('Mix', case=False, na=False).astype(int)
    return df

# Count number of colors in the 'Color' field
def add_color_count(df):
    df['Num Colors'] = df['Color'].apply(lambda x: len(str(x).split('/')))
    return df

def add_intake_weekday(df):
    # Split 'Intake Time' into date and time
    df['Intake Weekday'] = df['Intake Time'].apply(lambda x: pd.to_datetime(x.split(' ')[0], errors='coerce').dayofweek if isinstance(x, str) else None)
    return df
    
def drop_columns(df, columns=None):
    if columns is None:
        columns = ['Id', 'Name', 'Outcome Time', 'Date of Birth', 'Intake Time', 'Found Location']
    df = df.drop(columns=columns, errors='ignore')
    df = df.dropna()
    return df


def standardize_features(df):
    df = extract_city_from_address(df)
    df['Age upon Intake'] = df['Age upon Intake'].apply(convert_to_weeks)
    df['Age upon Intake'] = pd.to_numeric(df['Age upon Intake'], errors='coerce').fillna(0.0)
    df = extract_month_year(df, column='Intake Time')
    return df
    
def hot_encode_features(df, columns=None):
    if columns is None:
        columns = ['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake', 'City']
    
    existing_columns = [col for col in columns if col in df.columns]
    df = pd.get_dummies(df, columns=existing_columns)
    
    return df

def rare_hot_encode_features(df, columns=None, rare_threshold=0.01):
    if columns is None:
        columns = ['Intake Condition', 'Intake Type', 'Animal Type', 'Sex upon Intake', 'City', 'Breed', 'Color']
    
    # For each specified column, replace rare categories with 'Rare'
    for col in columns:
        if col in df.columns:
            # Calculate the frequency of each unique value in the column
            freq = df[col].value_counts() / len(df)
            
            # Find the values that are considered rare based on the threshold
            rare_values = freq[freq < rare_threshold].index
            
            # Replace rare values with 'Rare'
            df[col] = df[col].replace(rare_values, 'Rare')
    
    # Now perform one-hot encoding on the specified columns
    existing_columns = [col for col in columns if col in df.columns]
    df = pd.get_dummies(df, columns=existing_columns)
    
    return df

def frequency_encode_features(df, columns=None):
    if columns is None:
        columns = ['Breed', 'Color', 'City']
    
    for col in columns:
        if col in df.columns:
            # Create a new column with frequency encoding
            df[f'{col}_frequency'] = df[col].map(df[col].value_counts(normalize=True))
    
    return df

def engineer_features(df):
    df = add_intake_weekday(df)
    df = add_is_mixed_breed(df)
    df = add_color_count(df)
    df = add_minutes_since_midnight(df)
    df = add_age_ratio_feature(df)
    return df

def scale_features(df):
    # Select numeric columns only
    numeric_cols = df.select_dtypes(include=['float64', 'int64']).columns
    
    # Initialize the scaler
    scaler = StandardScaler()
    
    # Apply scaling to the numeric columns
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])
    
    return df
    

# Exploration

# Data Cleaning

In [57]:
# Exploring what needs to be cleaned
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print("Train null counts:")
print(train.isnull().sum()[train.isnull().sum() > 0])

print("\nTest null counts:")
print(test.isnull().sum()[test.isnull().sum() > 0])

print("\nOutcome counts:")
print(train['Outcome Type'].value_counts())




Train null counts:
Name               31383
Sex upon Intake        2
Age upon Intake        1
dtype: int64

Test null counts:
Series([], dtype: int64)

Outcome counts:
Outcome Type
Adoption           55044
Transfer           35024
Return to Owner    16599
Euthanasia          3449
Died                1041
Name: count, dtype: int64


In [236]:
def clean(df):
    df = standardize_features(df)
    df = engineer_features(df)
    # df = 
    df = frequency_encode_features(df)
    df = rare_hot_encode_features(df)
    df = drop_columns(df)
    df = scale_features(df)
    return df
    

In [239]:
train = pd.read_csv('train.csv').head(250)
train = clean(train)
print(train.shape)
# print(train.dtypes)
train.head(25)

(250, 76)


Unnamed: 0,Age upon Intake,Outcome Type,Intake Month,Intake Year,Intake Weekday,Is Mixed,Num Colors,Minutes Since Midnight,Age upon Intake Ratio,Breed_frequency,Color_frequency,City_frequency,Intake Condition_Injured,Intake Condition_Neonatal,Intake Condition_Normal,Intake Condition_Nursing,Intake Condition_Rare,Intake Condition_Sick,Intake Type_Abandoned,Intake Type_Owner Surrender,Intake Type_Public Assist,Intake Type_Stray,Animal Type_Cat,Animal Type_Dog,Sex upon Intake_Intact Female,Sex upon Intake_Intact Male,Sex upon Intake_Neutered Male,Sex upon Intake_Spayed Female,Sex upon Intake_Unknown,City_Austin,City_Pflugerville,City_Rare,City_Travis,City_Unknown,City_Valle,Breed_Chihuahua Shorthair,Breed_Chihuahua Shorthair Mix,Breed_Dachshund Mix,Breed_Dachshund/Chihuahua Shorthair,Breed_Domestic Medium Hair,Breed_Domestic Medium Hair Mix,Breed_Domestic Shorthair,Breed_Domestic Shorthair Mix,Breed_German Shepherd Mix,Breed_Labrador Retriever,Breed_Labrador Retriever Mix,Breed_Pit Bull,Breed_Pit Bull Mix,Breed_Rare,Color_Black,Color_Black/Brown,Color_Black/Tan,Color_Black/White,Color_Blue,Color_Blue Tabby,Color_Blue/White,Color_Brown,Color_Brown Brindle/White,Color_Brown Tabby,Color_Brown Tabby/White,Color_Brown/Black,Color_Brown/White,Color_Calico,Color_Orange Tabby,Color_Orange Tabby/White,Color_Rare,Color_Red/White,Color_Tan,Color_Tan/White,Color_Torbie,Color_Tortie,Color_Tricolor,Color_White,Color_White/Black,Color_White/Brown,Color_White/Tan
0,2.013223,Return to Owner,0.13382,-1.016587,1.639896,-1.40577,0.937923,-0.291208,1.293568,-0.892859,-1.058451,0.473247,False,False,True,False,False,False,False,False,False,True,False,True,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
1,-0.332326,Return to Owner,-0.795486,-0.683934,0.127077,0.711354,0.937923,1.679046,-0.423683,-0.892859,-1.058451,0.473247,False,False,True,False,False,False,False,False,False,True,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
2,0.02659,Transfer,-0.485717,1.311982,0.127077,-1.40577,-1.066186,-4.621185,1.263242,0.06809,-0.769573,0.473247,False,False,True,False,False,False,False,False,True,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False
3,0.02659,Return to Owner,-1.415023,-0.351281,1.135623,0.711354,-1.066186,-0.365665,-0.16136,-0.206467,-1.058451,0.473247,False,False,True,False,False,False,False,True,False,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
4,1.351012,Return to Owner,-0.795486,0.314024,-0.881469,0.711354,-1.066186,-1.35652,0.808742,-0.892859,1.156285,0.473247,False,False,True,False,False,False,False,False,True,False,False,True,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,-0.470187,Adoption,1.063126,0.97933,0.63135,-1.40577,-1.066186,-0.743679,-0.168834,0.06809,1.252578,0.473247,False,False,True,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,0.02659,Return to Owner,0.13382,-1.016587,0.127077,0.711354,0.937923,0.270085,-0.16136,-0.206467,1.734042,0.473247,False,False,True,False,False,False,False,False,True,False,False,True,False,True,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,-0.610238,Adoption,-0.175949,0.646677,0.127077,-1.40577,-1.066186,0.361725,-0.572563,0.06809,-0.962158,-1.97844,False,False,True,False,False,False,False,False,False,True,True,False,False,True,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False
8,-0.610238,Transfer,-0.175949,-0.018629,-1.385743,0.711354,0.937923,-2.089638,-0.608717,1.509513,1.734042,0.473247,True,False,False,False,False,False,False,False,False,True,True,False,True,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,-0.497759,Transfer,0.443589,-0.683934,-1.385743,0.711354,-1.066186,1.386944,-0.444677,1.509513,-0.962158,0.473247,False,False,True,False,False,False,False,True,False,False,True,False,False,False,True,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,False,False,False,False,False,False,False


# Feature Engineering

# Models

### Random Forest

In [240]:
train_data = pd.read_csv('train.csv')
train_data = train_data.dropna()
X = train_data.drop(columns=['Outcome Type'])
y = train_data['Outcome Type']


# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = clean(X_train)
X_val = clean(X_val)
X_val = X_val.reindex(columns=X_train.columns, fill_value=False)

# Initialize the Random Forest Classifier
rf = RandomForestClassifier(
    n_estimators=400,                # Set the number of estimators
    max_depth=35,                    # Set the max depth
    min_samples_split=5,             # Set the min samples required to split
    random_state=42,                  # Set random state for reproducibility
)

# Fit the model to the training data
rf.fit(X_train, y_train)

# Predict on the validation set
y_pred = rf.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.4f}')

raise KeyboardInterrupt

# Define a parameter grid for tuning
param_grid = {
    'n_estimators': [400],              # Test both lower and higher values
    'max_depth': [25, 35],                    # Slight range in depth
    'min_samples_split': [5],              # Explore both smaller and higher splits
}

# Set up the grid search with cross-validation
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42), 
                           param_grid=param_grid, 
                           cv=3, 
                           n_jobs=-1, 
                           verbose=2)

# Fit the grid search to the training data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_
print(f'Best Parameters: {best_params}')

# Use the best model found
best_rf = grid_search.best_estimator_

# Evaluate on validation set
y_pred = best_rf.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy with optimized model: {accuracy:.4f}')


Accuracy: 0.6582


KeyboardInterrupt: 

### LightGBM

In [241]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

# Load data
train_data = pd.read_csv('train.csv')
train_data = train_data.dropna()

X = train_data.drop(columns=['Outcome Type'])
y = train_data['Outcome Type']

# Split into train/val sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Clean them (your custom function)
X_train = clean(X_train)
X_val = clean(X_val)
X_val = X_val.reindex(columns=X_train.columns, fill_value=False)

# Label encoding for target since LightGBM needs numeric labels
le = LabelEncoder()
y_train_enc = le.fit_transform(y_train)  # Fit and transform on the training labels
y_val_enc = le.transform(y_val)  # Transform on validation labels

# Initialize the LGBMClassifier
lgb_model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
    random_state=42
)

# Train the model
lgb_model.fit(X_train, y_train_enc)

# Predict
y_pred = lgb_model.predict(X_val)

# Decode the predictions
y_pred_decoded = le.inverse_transform(y_pred)

# Accuracy
accuracy = accuracy_score(y_val, y_pred_decoded)
print(f'LightGBM Accuracy: {accuracy:.4f}')


LightGBM Accuracy: 0.6491


In [230]:
# Define parameter grid for tuning
param_grid = {
    'learning_rate': [0.05],
    'num_leaves': [31],
    # 'max_depth': [15, 20, 30],  # -1 means no limit
}

# Initialize the LGBMClassifier (without defining hyperparameters here)
lgb_model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    metric='multi_logloss',
    verbose=-1,
    random_state=42
)

# Perform Grid Search with cross-validation
grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, 
                           cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the grid search
grid_search.fit(X_train, y_train_enc)

# Best parameters from the grid search
print(f'Best Parameters: {grid_search.best_params_}')

# Get the best model
best_lgb_model = grid_search.best_estimator_

# Make predictions with the best model
y_pred = best_lgb_model.predict(X_val)

# Decode the predictions
y_pred_decoded = le.inverse_transform(y_pred)

# Accuracy of the tuned model
accuracy = accuracy_score(y_val, y_pred_decoded)
print(f'Tuned LightGBM Accuracy: {accuracy:.4f}')


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best Parameters: {'learning_rate': 0.05, 'num_leaves': 31}
Tuned LightGBM Accuracy: 0.6502


In [None]:
!pip install xgboost

### Ensemble

In [242]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Initialize the models
rf = RandomForestClassifier(
    n_estimators=400,                # Set the number of estimators
    max_depth=35,                    # Set the max depth
    min_samples_split=5,             # Set the min samples required to split
    random_state=42,                  # Set random state for reproducibility
)
lgb_model = LGBMClassifier(
    objective='multiclass',
    num_class=len(le.classes_),
    metric='multi_logloss',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
    random_state=42
)
logreg = LogisticRegression(random_state=42)
xgb = XGBClassifier(n_estimators=400, max_depth=25, random_state=42)
svm = SVC(random_state=42)

# Initialize the VotingClassifier (Majority Voting)
voting_clf = VotingClassifier(estimators=[
    ('rf', rf),
    ('lgb', lgb),
    ('logreg', logreg),
    # ('xgb', xgb),
    # ('svm', svm)
], voting='hard')

# Train the VotingClassifier
voting_clf.fit(X_train, y_train)

# Make predictions
y_pred = voting_clf.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Voting Classifier Accuracy: {accuracy:.4f}')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Voting Classifier Accuracy: 0.6537
