In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics

In [2]:
#baca file
df = pd.read_csv('MICE2024.csv')

In [3]:
# Tampilkan jumlah nilai kosong per kolom
print("\nJumlah nilai kosong per kolom:")
print(df.isnull().sum())


Jumlah nilai kosong per kolom:
Unnamed: 0    0
pm10          0
pm25          0
so2           0
co            0
o3            0
no2           0
categori      0
dtype: int64


In [4]:
#convert nilai 'categori' ke dalam angka
le = LabelEncoder()
df['categori_code'] = le.fit_transform(df['categori'])

In [5]:
for i, label in enumerate(le.classes_):
    print(f"{label} => {i}")

BAIK => 0
SEDANG => 1
TIDAK SEHAT => 2


In [6]:
X = df.drop(columns=['categori','categori_code'])
y = df['categori_code']

In [7]:
#bagi dataset menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)

In [8]:
#buat model Random Forest
RF= RandomForestClassifier(random_state=42)

In [9]:
# 3. Hitung class weight manual
class_counts = y.value_counts().to_dict()
total = len(df)
n_classes = len(class_counts)
class_weight = {
    cls: round(total / (n_classes * count) * 2) / 2
    for cls, count in class_counts.items()
}
print("Manual class weights:", class_weight)

Manual class weights: {1: 0.5, 0: 2.0, 2: 4.0}


In [10]:
# Random Forest dengan cost-sensitive learning
RFCSL = RandomForestClassifier(class_weight='balanced', random_state=42)

In [11]:
#latih model dengan data latih
RF.fit(X_train, y_train)

In [12]:
#latih model dengan data latih
RFCSL.fit(X_train, y_train)

In [13]:
#prediksi pada data uji
pred = RF.predict(X_test)

In [14]:
#prediksi pada data uji
predRFCSL = RFCSL.predict(X_test)

In [15]:
# Evaluasi
accuracy = metrics.accuracy_score(y_test, pred)
confusion = metrics.confusion_matrix(y_test, pred)
report = metrics.classification_report(y_test, pred)

print("TESTING RESULTS: \n===============================")
print(f"CONFUSION MATRIX :\n{confusion}")
print(f"ACCURACY SCORE :\n{accuracy}")
print(f"CLASSIFICATION REPORT :\n{report}")

TESTING RESULTS: 
CONFUSION MATRIX :
[[ 98   6   0]
 [  3 520   0]
 [  0   3  51]]
ACCURACY SCORE :
0.9823788546255506
CLASSIFICATION REPORT :
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       104
           1       0.98      0.99      0.99       523
           2       1.00      0.94      0.97        54

    accuracy                           0.98       681
   macro avg       0.98      0.96      0.97       681
weighted avg       0.98      0.98      0.98       681



In [16]:
# Evaluasi
accuracyRFCSL = metrics.accuracy_score(y_test, predRFCSL)
confusionRFCSL = metrics.confusion_matrix(y_test, predRFCSL)
reportRFCSL = metrics.classification_report(y_test, predRFCSL)

print("TESTING RESULTS: \n===============================")
print(f"CONFUSION MATRIX :\n{confusionRFCSL}")
print(f"ACCURACY SCORE :\n{accuracyRFCSL}")
print(f"CLASSIFICATION REPORT :\n{reportRFCSL}")

TESTING RESULTS: 
CONFUSION MATRIX :
[[ 98   6   0]
 [  3 520   0]
 [  0   3  51]]
ACCURACY SCORE :
0.9823788546255506
CLASSIFICATION REPORT :
              precision    recall  f1-score   support

           0       0.97      0.94      0.96       104
           1       0.98      0.99      0.99       523
           2       1.00      0.94      0.97        54

    accuracy                           0.98       681
   macro avg       0.98      0.96      0.97       681
weighted avg       0.98      0.98      0.98       681

