In [4]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import VotingClassifier, StackingClassifier

In [5]:
# Read the pet adoption dataset; the relative path is resolved against the working directory.
df = pd.read_csv(filepath_or_buffer='pet_adoption_data.csv')

In [6]:
# Preview the first five rows to sanity-check the load.
df.head(n=5)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,103,Black,Large,16.357608,1,0,43,335,0,0
1,501,Rabbit,Labrador,8,Black,Medium,12.625582,0,0,50,93,0,1
2,502,Dog,Parakeet,153,Gray,Small,9.624407,0,0,3,79,0,0
3,503,Bird,Golden Retriever,91,Gray,Medium,20.770021,0,0,59,37,1,0
4,504,Bird,Golden Retriever,174,Gray,Medium,7.025923,1,0,28,407,0,1


In [7]:
# Print each column's dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2007 entries, 0 to 2006
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   PetID               2007 non-null   int64  
 1   PetType             2007 non-null   object 
 2   Breed               2007 non-null   object 
 3   AgeMonths           2007 non-null   int64  
 4   Color               2007 non-null   object 
 5   Size                2007 non-null   object 
 6   WeightKg            2007 non-null   float64
 7   Vaccinated          2007 non-null   int64  
 8   HealthCondition     2007 non-null   int64  
 9   TimeInShelterDays   2007 non-null   int64  
 10  AdoptionFee         2007 non-null   int64  
 11  PreviousOwner       2007 non-null   int64  
 12  AdoptionLikelihood  2007 non-null   int64  
dtypes: float64(1), int64(8), object(4)
memory usage: 204.0+ KB


In [8]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,PetID,AgeMonths,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
count,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0,2007.0
mean,1503.0,92.627803,15.699563,0.706527,0.199801,44.818635,251.690583,0.306926,0.338316
std,579.515315,51.527384,8.444901,0.455467,0.39995,25.676587,144.552893,0.461333,0.473255
min,500.0,1.0,1.03931,0.0,0.0,1.0,0.0,0.0,0.0
25%,1001.5,49.0,8.660533,0.0,0.0,22.0,129.5,0.0,0.0
50%,1503.0,93.0,15.752066,1.0,0.0,45.0,250.0,0.0,0.0
75%,2004.5,138.0,23.140205,1.0,0.0,67.0,374.0,1.0,1.0
max,2506.0,179.0,29.992795,1.0,1.0,89.0,499.0,1.0,1.0


In [9]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(2007, 13)

In [10]:
# Count missing values in every column.
df.isna().sum()

PetID                 0
PetType               0
Breed                 0
AgeMonths             0
Color                 0
Size                  0
WeightKg              0
Vaccinated            0
HealthCondition       0
TimeInShelterDays     0
AdoptionFee           0
PreviousOwner         0
AdoptionLikelihood    0
dtype: int64

In [11]:
# Column names as a plain Python list.
df.columns.tolist()

['PetID',
 'PetType',
 'Breed',
 'AgeMonths',
 'Color',
 'Size',
 'WeightKg',
 'Vaccinated',
 'HealthCondition',
 'TimeInShelterDays',
 'AdoptionFee',
 'PreviousOwner',
 'AdoptionLikelihood']

In [12]:
# Print the number of distinct values per column, with the column name in bold.
for name, n_unique in df.nunique().items():
    print(f"\033[1m{name}\033[0m: {n_unique}")

[1mPetID[0m: 2007
[1mPetType[0m: 4
[1mBreed[0m: 6
[1mAgeMonths[0m: 179
[1mColor[0m: 5
[1mSize[0m: 3
[1mWeightKg[0m: 2007
[1mVaccinated[0m: 2
[1mHealthCondition[0m: 2
[1mTimeInShelterDays[0m: 89
[1mAdoptionFee[0m: 493
[1mPreviousOwner[0m: 2
[1mAdoptionLikelihood[0m: 2


In [13]:
 # Count rows that exactly repeat an earlier row (the first occurrence is not counted).
 df.duplicated(keep='first').sum()

0

In [14]:
# Same summary statistics, transposed so that each column of the data becomes one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PetID,2007.0,1503.0,579.515315,500.0,1001.5,1503.0,2004.5,2506.0
AgeMonths,2007.0,92.627803,51.527384,1.0,49.0,93.0,138.0,179.0
WeightKg,2007.0,15.699563,8.444901,1.03931,8.660533,15.752066,23.140205,29.992795
Vaccinated,2007.0,0.706527,0.455467,0.0,0.0,1.0,1.0,1.0
HealthCondition,2007.0,0.199801,0.39995,0.0,0.0,0.0,0.0,1.0
TimeInShelterDays,2007.0,44.818635,25.676587,1.0,22.0,45.0,67.0,89.0
AdoptionFee,2007.0,251.690583,144.552893,0.0,129.5,250.0,374.0,499.0
PreviousOwner,2007.0,0.306926,0.461333,0.0,0.0,0.0,1.0,1.0
AdoptionLikelihood,2007.0,0.338316,0.473255,0.0,0.0,0.0,1.0,1.0


In [15]:
# PetID is unique for every row (2007 distinct values in 2007 rows), so it is
# dropped before modelling.
# errors='ignore' keeps this cell idempotent: `df` is reassigned in place, so
# re-running the cell would otherwise raise KeyError because PetID is already gone.
df = df.drop(columns='PetID', errors='ignore')


In [16]:
# Separate the target column from the predictor columns.
y = df['AdoptionLikelihood']
X = df.drop(columns=['AdoptionLikelihood'])

In [17]:
# One-hot encode the non-numeric columns; numeric columns pass through unchanged.
X = pd.get_dummies(data=X)


In [19]:
def _print_evaluation(y_true, y_pred):
    """Print accuracy, confusion matrix and classification report; return the accuracy."""
    accuracy = accuracy_score(y_true, y_pred)
    print("\033[1m**Accuracy**:\033[0m\n", accuracy)
    print("\n\033[1m**Confusion Matrix**:\033[0m\n", confusion_matrix(y_true, y_pred))
    print("\n\033[1m**Classification Report**:\033[0m\n", classification_report(y_true, y_pred))
    return accuracy


def apply_models(X, y, test_size=0.2, random_state=42, imbalance_threshold=1.0):
    """Train and compare a suite of classifiers, then ensemble the three best.

    Pipeline:
      1. Hold-out split of ``X`` / ``y``.
      2. SMOTE oversampling of the training split when it is imbalanced.
      3. Standardisation, fitted on the training split only.
      4. Fit and evaluate each model on the test split.
      5. Build a hard-voting and a stacking ensemble from the three models
         with the highest weighted F1, and evaluate them the same way.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix. It is passed to ``StandardScaler``, so it is expected
        to be fully numeric (e.g. already one-hot encoded).
    y : array-like or Series
        Target labels.
    test_size : float, default 0.2
        Fraction of the rows held out for evaluation.
    random_state : int, default 42
        Seed shared by the split, SMOTE and every estimator that takes one,
        so that the ranking of the models is reproducible between runs.
    imbalance_threshold : float, default 1.0
        SMOTE is applied when (smallest class count / largest class count) in
        the training split is below this value. The default resamples on any
        imbalance, which matches the previous always-on behaviour (on perfectly
        balanced data SMOTE has nothing to synthesise). Lower it, e.g. to 0.1,
        to leave mildly imbalanced data untouched.

    Returns
    -------
    None
        All results are printed.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # np.unique (unlike np.bincount) also copes with string, negative or
    # non-contiguous labels and never reports phantom zero-count classes.
    _, class_counts = np.unique(y_train, return_counts=True)
    if class_counts.min() / class_counts.max() < imbalance_threshold:
        # The message and the resampling share one condition, so the training
        # set is never rewritten without saying so.
        print("Class imbalance detected. Applying SMOTE...")
        smote = SMOTE(random_state=random_state)
        X_train, y_train = smote.fit_resample(X_train, y_train)

    # Fit the scaler on the training data only, then apply it to both splits.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Candidate models; every stochastic estimator is seeded for reproducibility.
    models = {
        'LogisticRegression': LogisticRegression(),
        'SVC': SVC(),
        'DecisionTree': DecisionTreeClassifier(random_state=random_state),
        'RandomForest': RandomForestClassifier(random_state=random_state),
        'ExtraTrees': ExtraTreesClassifier(random_state=random_state),
        'AdaBoost': AdaBoostClassifier(random_state=random_state),
        'GradientBoost': GradientBoostingClassifier(random_state=random_state),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                                 random_state=random_state),
        'LightGBM': LGBMClassifier(random_state=random_state),
        'CatBoost': CatBoostClassifier(verbose=0, random_seed=random_state)
    }

    # model name -> (accuracy, weighted F1) on the test split
    model_performance = {}

    for model_name, model in models.items():
        print(f"\n\033[1mClassification with {model_name}:\033[0m\n{'-' * 30}")

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        accuracy = _print_evaluation(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')
        model_performance[model_name] = (accuracy, f1)

    # NOTE(review): the top 3 are chosen by their score on the same test split
    # that the ensembles are evaluated on below, so the ensemble scores are
    # optimistic. Consider selecting with cross-validation on the training data.
    top_3_models = sorted(model_performance.items(), key=lambda x: x[1][1], reverse=True)[:3]
    print("\n\033[1mTop 3 Models based on F1 Score:\033[0m\n", top_3_models)

    # (name, estimator) pairs in the form the ensemble classes expect.
    top_3_estimators = [(name, models[name]) for name, _ in top_3_models]

    ensembles = (
        ("Voting", VotingClassifier(estimators=top_3_estimators, voting='hard')),
        ("Stacking", StackingClassifier(estimators=top_3_estimators)),
    )
    for label, ensemble in ensembles:
        print(f"\n\033[1mInitializing {label} Classifier with top 3 models...\033[0m\n")
        ensemble.fit(X_train, y_train)
        y_pred = ensemble.predict(X_test)
        print(f"\n\033[1m**{label} Classifier Evaluation**:\033[0m\n")
        _print_evaluation(y_test, y_pred)

In [20]:
# Run the full comparison on the encoded features and the adoption target.
apply_models(X=X, y=y)

[Errno 2] No such file or directory: 'lscpu'
  File "/home/user/workspace/.venv/lib/python3.11/site-packages/joblib/externals/loky/backend/context.py", line 250, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "/usr/lib/python3.11/subprocess.py", line 1953, in _execute_child
    raise child_exception_type(errno_num, err_msg, err_filename)



[1mClassification with LogisticRegression:[0m
------------------------------
[1m**Accuracy**:[0m
 0.9079601990049752

[1m**Confusion Matrix**:[0m
 [[252   7]
 [ 30 113]]

[1m**Classification Report**:[0m
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       259
           1       0.94      0.79      0.86       143

    accuracy                           0.91       402
   macro avg       0.92      0.88      0.90       402
weighted avg       0.91      0.91      0.91       402


[1mClassification with SVC:[0m
------------------------------
[1m**Accuracy**:[0m
 0.9054726368159204

[1m**Confusion Matrix**:[0m
 [[251   8]
 [ 30 113]]

[1m**Classification Report**:[0m
               precision    recall  f1-score   support

           0       0.89      0.97      0.93       259
           1       0.93      0.79      0.86       143

    accuracy                           0.91       402
   macro avg       0.91      0.88      0.8




[1m**Classification Report**:[0m
               precision    recall  f1-score   support

           0       0.94      0.96      0.95       259
           1       0.93      0.90      0.91       143

    accuracy                           0.94       402
   macro avg       0.94      0.93      0.93       402
weighted avg       0.94      0.94      0.94       402


[1mClassification with GradientBoost:[0m
------------------------------
[1m**Accuracy**:[0m
 0.9477611940298507

[1m**Confusion Matrix**:[0m
 [[248  11]
 [ 10 133]]

[1m**Classification Report**:[0m
               precision    recall  f1-score   support

           0       0.96      0.96      0.96       259
           1       0.92      0.93      0.93       143

    accuracy                           0.95       402
   macro avg       0.94      0.94      0.94       402
weighted avg       0.95      0.95      0.95       402


[1mClassification with XGBoost:[0m
------------------------------
[1m**Accuracy**:[0m
 0.955223