In [15]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Embedding, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

## Data Collection

In [131]:
dataframe = pd.read_csv('./data/akc-data-latest-v4.csv')
dataframe.head()

Unnamed: 0,breed,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,trainability_category,demeanor_value,demeanor_category,temperament_category,height,weight,expectancy,height_category,weight_category,expectancy_category
0,Affenpinscher,The Affen’s apish look has been described many...,"Confident, Famously Funny, Fearless",148,22.86,29.21,3.175147,4.535924,12.0,15.0,...,Easy Training,1.0,Outgoing,Confident,26.035,3.855535,13.5,Short,Light,Long
1,Afghan Hound,"The Afghan Hound is an ancient breed, his whol...","Dignified, Profoundly Loyal, Aristocratic",113,63.5,68.58,22.679619,27.215542,12.0,15.0,...,May be Stubborn,0.2,Aloof/Wary,Loyal,66.04,24.94758,13.5,Tall,Medium,Long
2,Airedale Terrier,The Airedale Terrier is the largest of all ter...,"Friendly, Clever, Courageous",60,58.42,58.42,22.679619,31.751466,11.0,14.0,...,Eager to Please,0.8,Friendly,Amiable,58.42,27.215542,12.5,Tall,Medium,Long
3,Akita,"Akitas are burly, heavy-boned spitz-type dogs ...","Courageous, Dignified, Profoundly Loyal",47,60.96,71.12,31.751466,58.967008,10.0,13.0,...,Eager to Please,0.6,Alert/Responsive,Loyal,66.04,45.359237,11.5,Tall,Heavy,Long
4,Alaskan Malamute,The Alaskan Malamute stands 23 to 25 inches at...,"Affectionate, Loyal, Playful",58,58.42,63.5,34.019428,38.555351,10.0,14.0,...,Independent,0.8,Friendly,Playful,60.96,36.28739,12.0,Tall,Heavy,Long


## 1. Data Collection

There was 1 column for the breed name, 1 column with the description, 1 column for a popularity score (1 is the most popular breed, and higher numbers correspond to less popular breeds), and 18 columns for traits.

In [132]:
dataframe.columns.tolist()

['breed',
 'description',
 'temperament',
 'popularity',
 'min_height',
 'max_height',
 'min_weight',
 'max_weight',
 'min_expectancy',
 'max_expectancy',
 'group',
 'grooming_frequency_value',
 'grooming_frequency_category',
 'shedding_value',
 'shedding_category',
 'energy_level_value',
 'energy_level_category',
 'trainability_value',
 'trainability_category',
 'demeanor_value',
 'demeanor_category',
 'temperament_category',
 'height',
 'weight',
 'expectancy',
 'height_category',
 'weight_category',
 'expectancy_category']

In [133]:
dataframe.dtypes

breed                           object
description                     object
temperament                     object
popularity                       int64
min_height                     float64
max_height                     float64
min_weight                     float64
max_weight                     float64
min_expectancy                 float64
max_expectancy                 float64
group                           object
grooming_frequency_value       float64
grooming_frequency_category     object
shedding_value                 float64
shedding_category               object
energy_level_value             float64
energy_level_category           object
trainability_value             float64
trainability_category           object
demeanor_value                 float64
demeanor_category               object
temperament_category            object
height                         float64
weight                         float64
expectancy                     float64
height_category          

## Data Preprocessing

In [134]:
# # Kolom numerik
# numerical_cols = [ 'popularity', 'min_height', 'max_height', 'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy']
# dataframe_numeric = dataframe[numerical_cols]

# Mengubah data kategorikal menjadi numerik
categorical_cols = ['height_category', 'weight_category', 'demeanor_category', 'trainability_category', 'energy_level_category', 'shedding_category', 'grooming_frequency_category', 'temperament_category']
# categorical_cols = ['demeanor_category', 'trainability_category', 'energy_level_category', 'shedding_category', 'grooming_frequency_category']
dataframe_categorical = dataframe[categorical_cols].astype('category')

In [135]:
dataframe_encoded = pd.get_dummies(dataframe_categorical)

data = dataframe_encoded
pd.DataFrame(data.columns)

Unnamed: 0,0
0,height_category_Medium
1,height_category_Short
2,height_category_Tall
3,height_category_Very Tall
4,weight_category_Heavy
5,weight_category_Light
6,weight_category_Medium
7,weight_category_Very Heavy
8,demeanor_category_Alert/Responsive
9,demeanor_category_Aloof/Wary


## Decision Tree

In [136]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Menghapus baris dengan nilai non-numerik
data = data.dropna()

# Menentukan fitur (features) dan target variabel
features = data
# Menerapkan LabelEncoder pada variabel target
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(dataframe['breed'])

# # Split dataset into training set and test set
# X_train, X_test, y_train, y_test = train_test_split(features, target_encoded, test_size=0.3, random_state=1) # 70% training and 30% test
# print(X_train.shape, X_test.shape)

# Tahap Kedua - Pembuatan Model
model = DecisionTreeClassifier()
model.fit(features, target_encoded)

DecisionTreeClassifier()

In [137]:
# #Predict the response for test dataset
# y_pred = model.predict(X_test)

In [138]:
# print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [140]:

# Membuat data prediksi baru
prediction_data = np.array([[1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0,1,0,0 ,0,0,  1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

# Melakukan prediksi dengan model Decision Tree yang sudah dilatih sebelumnya
predicted_breed = model.predict(prediction_data)

# Mengubah indeks hasil prediksi menjadi nama kelas anjing breed
predicted_breed = label_encoder.inverse_transform(predicted_breed)

# Mencari baris yang memiliki breed yang sesuai dengan hasil prediksi
characteristics = dataframe[dataframe['breed'] == predicted_breed[0]]

print("Predicted Breed:", predicted_breed)
pd.DataFrame(characteristics)

Predicted Breed: ['Vizsla']


  "X does not have valid feature names, but"


Unnamed: 0,breed,description,temperament,popularity,min_height,max_height,min_weight,max_weight,min_expectancy,max_expectancy,...,trainability_category,demeanor_value,demeanor_category,temperament_category,height,weight,expectancy,height_category,weight_category,expectancy_category
178,Vizsla,The Vizsla is easily recognized by his sleek g...,"Affectionate, Gentle, Energetic",31,53.34,60.96,19.958064,27.215542,12.0,14.0,...,Eager to Please,0.8,Friendly,Playful,57.15,23.586803,13.0,Tall,Medium,Long


## Random Forest

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Baca dataset
data = pd.read_csv('./data/akc-data-latest-v4.csv')

# Menerapkan LabelEncoder pada variabel target
label_encoder = LabelEncoder()
data['group_encoded'] = label_encoder.fit_transform(data['breed'])

# Memilih fitur-fitur kategorikal
categorical_cols = [
    'trainability_category', 
    'energy_level_category', 
    'shedding_category',
    'grooming_frequency_category', 
    'temperament_category', 
    'weight_category',
    'height_category',
    'demeanor_category'
]

# Menggunakan fitur-fitur kategorikal sebagai variabel fitur
features = data[categorical_cols]

# Menggunakan variabel target yang telah diencode
target = data['group_encoded']

# Melakukan one-hot encoding pada fitur-fitur kategorikal
features_encoded = pd.get_dummies(features)

# Split dataset menjadi training set dan test set
X_train, X_test, y_train, y_test = train_test_split(features_encoded, target, test_size=0.2, random_state=42)

# Membuat model Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Melatih model dengan data training
model.fit(X_train, y_train)

# Melakukan prediksi pada data test
y_pred = model.predict(X_test)

# Menghitung akurasi
accuracy = model.score(X_test, y_test)
print('Accuracy:', accuracy)


Accuracy: 0.0
