In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [2]:
# 1. Memuat dataset
df = pd.read_csv('mushroom_cleaned.csv')

In [3]:
# 2. Preprocessing: Mengatasi missing values (jika ada)
# Memeriksa apakah ada nilai kosong
print(df.isnull().sum())

cap-diameter       0
cap-shape          0
gill-attachment    0
gill-color         0
stem-height        0
stem-width         0
stem-color         0
season             0
class              0
dtype: int64


In [4]:
# Mengisi nilai kosong dengan metode pengisian yang sesuai (median, modus, dll.)
# Sebagai contoh, jika ada nilai kosong, kita dapat menggunakan:
df.fillna(df.median(), inplace=True)

In [5]:
# 3. Label Encoding untuk kolom kategorikal yang tidak banyak variasi
label_encoder = LabelEncoder()

In [6]:
# Melakukan label encoding pada kolom 'cap-shape' dan 'gill-color'
df['cap-shape'] = label_encoder.fit_transform(df['cap-shape'])
df['gill-color'] = label_encoder.fit_transform(df['gill-color'])

In [7]:
# 4. One-Hot Encoding untuk kolom kategorikal lainnya
# Mengidentifikasi kolom kategorikal untuk di-encode
categorical_columns = ['gill-attachment', 'stem-color', 'season']
one_hot_encoder = OneHotEncoder(sparse_output=False)

In [8]:
# Menerapkan OneHotEncoder
encoded_categorical = one_hot_encoder.fit_transform(df[categorical_columns])


In [9]:
# Menambah hasil OneHotEncoded ke dataset asli
encoded_df = pd.DataFrame(encoded_categorical, columns=one_hot_encoder.get_feature_names_out(categorical_columns))
df = df.join(encoded_df)


In [10]:
# Menghapus kolom kategorikal asli yang telah di-OneHotEncode
df.drop(columns=categorical_columns, inplace=True)

In [11]:
# 5. Normalisasi kolom numerik
scaler = StandardScaler()
numerical_columns = ['cap-diameter', 'stem-height', 'stem-width']
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

In [12]:
# 6. Memisahkan fitur dan target
X = df.drop(columns=['class'])  # Semua kolom kecuali 'class' adalah fitur
y = df['class']  # Kolom 'class' adalah target

In [13]:
# 7. Membagi dataset menjadi training dan testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [14]:
# 8. Training Model menggunakan RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

In [15]:
# 9. Evaluasi model
y_pred = rf_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Accuracy: 0.9911168686962154
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      4909
           1       0.99      0.99      0.99      5898

    accuracy                           0.99     10807
   macro avg       0.99      0.99      0.99     10807
weighted avg       0.99      0.99      0.99     10807



In [16]:
print(df.describe())
print(df['class'].value_counts())


       cap-diameter     cap-shape    gill-color   stem-height    stem-width  \
count  5.403500e+04  54035.000000  54035.000000  5.403500e+04  5.403500e+04   
mean   8.836582e-17      4.000315      7.329509  2.188106e-16 -8.205398e-17   
std    1.000009e+00      2.160505      3.200266  1.000009e+00  1.000009e+00   
min   -1.576238e+00      0.000000      0.000000 -1.165480e+00 -1.344010e+00   
25%   -7.731935e-01      2.000000      5.000000 -7.498324e-01 -8.056802e-01   
50%   -1.174201e-01      5.000000      8.000000 -2.547224e-01 -1.637766e-01   
75%    5.939273e-01      6.000000     10.000000  4.543241e-01  6.034389e-01   
max    3.678285e+00      6.000000     11.000000  4.725632e+00  3.219644e+00   

              class  gill-attachment_0  gill-attachment_1  gill-attachment_2  \
count  54035.000000       54035.000000       54035.000000       54035.000000   
mean       0.549181           0.359119           0.185546           0.082798   
std        0.497580           0.479747          

In [17]:
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")


Accuracy: 0.9911168686962154
