In [22]:
# Import the required modules
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder

In [23]:
# Read the preprocessed_doggy file into a DataFrame
doggy_df = pd.read_csv(
    Path("preprocessed_doggy.csv")
)

# Review the DataFrame
display(doggy_df.head())
display(doggy_df.tail())

Unnamed: 0.1,Unnamed: 0,Borough,dog_friendly,income_cat,grooming_frequency,shedding,energy_level,trainability,demeanor,size,life_expectancy,lifetime_cost,Breed
0,0,Bronx,Yes,middle,0.4,0.4,0.8,0.8,1.0,Small 9-35lb,0.6,Low,Boxer
1,1,Manhattan,Yes,high,0.8,0.2,0.6,0.6,1.0,Toy >9lb,0.8,Medium,Maltese
2,2,Manhattan,Yes,high,1.0,0.2,0.6,0.2,0.8,Toy >9lb,0.8,Medium,Yorkshire Terrier
3,3,Brooklyn,Yes,low,0.4,0.8,0.6,1.0,0.6,Small 9-35lb,0.4,Low,German Shepherd Dog
4,4,Brooklyn,Yes,high,0.4,0.4,0.4,1.0,0.8,Toy >9lb,0.8,Medium,Cavalier King Charles Spaniel


Unnamed: 0.1,Unnamed: 0,Borough,dog_friendly,income_cat,grooming_frequency,shedding,energy_level,trainability,demeanor,size,life_expectancy,lifetime_cost,Breed
275750,275750,Queens,Yes,middle,1.0,0.2,0.6,0.2,0.8,Toy >9lb,0.8,Medium,Yorkshire Terrier
275751,275751,Manhattan,Yes,high,0.4,0.4,0.4,1.0,0.8,Toy >9lb,0.8,Medium,Cavalier King Charles Spaniel
275752,275752,Brooklyn,No,middle,0.2,0.4,0.6,0.4,0.6,Toy >9lb,1.0,High,Chihuahua
275753,275753,Staten Island,No,middle,0.4,0.8,1.0,1.0,1.0,Small 9-35lb,0.6,Medium,Labrador Retriever
275754,275754,Queens,No,middle,0.4,0.2,0.6,1.0,1.0,Toy >9lb,0.8,Low,Boston Terrier


In [24]:
# Remove the Unnamed col
doggy_df = doggy_df[['Borough', 'dog_friendly', 'income_cat',
       'grooming_frequency', 'shedding', 'energy_level', 'trainability',
       'demeanor', 'size', 'life_expectancy', 'lifetime_cost', 'Breed']]

doggy_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 275755 entries, 0 to 275754
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   Borough             275755 non-null  object 
 1   dog_friendly        275755 non-null  object 
 2   income_cat          275755 non-null  object 
 3   grooming_frequency  275755 non-null  float64
 4   shedding            275755 non-null  float64
 5   energy_level        275755 non-null  float64
 6   trainability        275755 non-null  float64
 7   demeanor            275755 non-null  float64
 8   size                275755 non-null  object 
 9   life_expectancy     275755 non-null  float64
 10  lifetime_cost       275755 non-null  object 
 11  Breed               275755 non-null  object 
dtypes: float64(6), object(6)
memory usage: 25.2+ MB


In [25]:
# Split the data into features(X) and targets(y)
# First y
y = doggy_df['Breed']

# Next X
X = doggy_df.drop(columns=['Breed'],axis=1)

In [26]:
# Use LabelEncoder to convert targets(50) into a single column
# Use LabelEncoder on the y DataFrame to encode the Dog Breeds
label_encoder_df = LabelEncoder()

# Encode Labels
y_encoded = label_encoder_df.fit_transform(y)

# Test the results of the encoder
# Display first 50 of encoded column
display(y_encoded[0:50])

# Get the original labels back
display(label_encoder_df.inverse_transform(y_encoded[0:50]))

array([10, 30, 49, 24, 14, 14, 10, 10,  3, 28,  3, 25,  3, 49, 29,  9,  6,
       25, 26,  6, 14, 10, 15, 30, 15, 29, 15, 49,  3,  6, 44, 29, 44,  3,
       26, 16,  3, 23, 45, 37, 10, 15, 29, 12, 43, 39, 49, 28, 15, 30])

array(['Boxer', 'Maltese', 'Yorkshire Terrier', 'German Shepherd Dog',
       'Cavalier King Charles Spaniel', 'Cavalier King Charles Spaniel',
       'Boxer', 'Boxer', 'Beagle', 'Havanese', 'Beagle',
       'German Shorthaired Pointer', 'Beagle', 'Yorkshire Terrier',
       'Labrador Retriever', 'Boston Terrier', 'Bichon Frise',
       'German Shorthaired Pointer', 'Golden Retriever', 'Bichon Frise',
       'Cavalier King Charles Spaniel', 'Boxer', 'Chihuahua', 'Maltese',
       'Chihuahua', 'Labrador Retriever', 'Chihuahua',
       'Yorkshire Terrier', 'Beagle', 'Bichon Frise', 'Shih Tzu',
       'Labrador Retriever', 'Shih Tzu', 'Beagle', 'Golden Retriever',
       'Cocker Spaniel', 'Beagle', 'French Bulldog', 'Siberian Husky',
       'Poodle', 'Boxer', 'Chihuahua', 'Labrador Retriever', 'Bulldog',
       'Shiba Inu', 'Pug', 'Yorkshire Terrier', 'Havanese', 'Chihuahua',
       'Maltese'], dtype=object)

In [27]:
# Use get_dummies to convert category types to numbers
y = pd.get_dummies(y, dtype=int)

X = pd.get_dummies(X, dtype=int)

In [28]:
display(y.sample(10))
display(X.head(3))

Unnamed: 0,Australian Cattle Dog,Australian Shepherd,Basset Hound,Beagle,Belgian Malinois,Bernese Mountain Dog,Bichon Frise,Bloodhound,Border Collie,Boston Terrier,...,Rhodesian Ridgeback,Rottweiler,Shetland Sheepdog,Shiba Inu,Shih Tzu,Siberian Husky,Vizsla,Weimaraner,West Highland White Terrier,Yorkshire Terrier
2704,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125069,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
239977,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
168819,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
199324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
37831,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
117473,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
191492,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209711,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
250605,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


Unnamed: 0,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,Borough_Bronx,Borough_Brooklyn,Borough_Manhattan,Borough_Queens,...,income_cat_low,income_cat_middle,size_Giant <75lb,size_Large 55-75lb,size_Medium 35-55lb,size_Small 9-35lb,size_Toy >9lb,lifetime_cost_High,lifetime_cost_Low,lifetime_cost_Medium
0,0.4,0.4,0.8,0.8,1.0,0.6,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.2,0.6,0.6,1.0,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
2,1.0,0.2,0.6,0.2,0.8,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1


In [29]:
# Convert Features/Targets into a form suitable for Modeling
y = y_encoded

X = pd.get_dummies(X, dtype=int)

In [34]:
# Look at X, y data
display(y)
display(X.head(3))

array([10, 30, 49, ..., 15, 29,  9])

Unnamed: 0,grooming_frequency,shedding,energy_level,trainability,demeanor,life_expectancy,Borough_Bronx,Borough_Brooklyn,Borough_Manhattan,Borough_Queens,...,income_cat_low,income_cat_middle,size_Giant <75lb,size_Large 55-75lb,size_Medium 35-55lb,size_Small 9-35lb,size_Toy >9lb,lifetime_cost_High,lifetime_cost_Low,lifetime_cost_Medium
0,0.4,0.4,0.8,0.8,1.0,0.6,1,0,0,0,...,0,1,0,0,0,1,0,0,1,0
1,0.8,0.2,0.6,0.6,1.0,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1
2,1.0,0.2,0.6,0.2,0.8,0.8,0,0,1,0,...,0,0,0,0,0,0,1,0,0,1


In [31]:
# Features
X.shape

(275755, 24)

In [39]:
# Target
y.shape

(275755,)

In [33]:
# Split into testing and training sets using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)

# LogisticRegression: MultiOutputClassifier>> Dimension Error

In [57]:
# Create multi_output classifier using logistic regression model.
# Use a random_state of 42 to the model
lr_model = MultiOutputClassifier(LogisticRegression(random_state=42))

# Fit and save the logistic regression model using the training data
lr_model = lr_model.fit(X_train, y_train)

# Predict
training_predictions = lr_model.predict(X_train)
testing_predictions = lr_model.predict(X_test)

ValueError: y must have at least two dimensions for multi-output regression but has only one.

# LogisticRegression: OneVsRestClassifier >> Tuple Input Error

In [59]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression

# Create multi_output classifier using logistic regression model.
# Use a random_state of 42 to the model
lr_model = OneVsRestClassifier(LogisticRegression(random_state=42))

# Fit and save the logistic regression model using the training data
lr_model = lr_model.fit(X_train, y_train)

# Predict
training_predictions = lr_model.predict(X_train)

testing_predictions = lr_model.predict(X_test)

In [49]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Assuming lr_model is the MultiOutputClassifier from the "Binary Relevance" example

# Predictions
y_pred = lr_model.predict(X_test)

# Convert y_test and y_pred to NumPy arrays
# y_test_np = y_test.to_numpy()
y_test_np = np.array(y_test) if not isinstance(y_test, np.ndarray) else y_test
y_pred_np = y_pred

# Get the confusion matrix for each target
confusion_matrices = []
for i in range(y_test_np.shape[1]):
    cm = confusion_matrix(y_test_np[:, i], y_pred_np[:, i])
    confusion_matrices.append(cm)

# Plot confusion matrices
# Adjusting subplot dimensions based on the number of labels
n_labels = y_test_np.shape[1]
n_rows = n_labels // 10 + (n_labels % 10 > 0)  # Ensure enough rows for all labels
fig, axes = plt.subplots(nrows=n_rows, ncols=min(n_labels, 10), figsize=(20, 10), squeeze=False)

for i in range(n_labels):
    row = i // 10
    col = i % 10
    sns.heatmap(confusion_matrices[i], annot=True, fmt="d", cmap="Blues", ax=axes[row, col])
    axes[row, col].set_title(f"Label {i+1}")

plt.tight_layout()
plt.show()

IndexError: tuple index out of range