In [1]:
import pandas as pd
import numpy as np

# Load the forest-fires dataset from a path relative to the notebook's working directory.
df = pd.read_csv("dataset/forestfires.csv")
# Bare last expression: the frame is rendered as this cell's output.
df

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


In [2]:
def switch_month_integer(months):
    """Convert three-letter month abbreviations to month numbers (1-12).

    Parameters
    ----------
    months : iterable of str
        Month abbreviations such as ``'jan'`` or ``'aug'``. Matching ignores
        letter case and surrounding whitespace, so ``'Mar'`` and ``' mar '``
        are accepted as well.

    Returns
    -------
    list of int
        Month numbers, in the same order as ``months``.

    Raises
    ------
    KeyError
        If an entry is not one of the twelve known abbreviations.
    """
    month_dict_mapping = {
        'jan': 1,
        'feb': 2,
        'mar': 3,
        'apr': 4,
        'may': 5,
        'jun': 6,
        'jul': 7,
        'aug': 8,
        'sep': 9,
        'oct': 10,
        'nov': 11,
        'dec': 12
    }

    def _normalize(month):
        # Only strings can be normalised; any other value (e.g. a missing
        # value) is looked up unchanged and therefore still raises KeyError.
        return month.strip().lower() if isinstance(month, str) else month

    return [month_dict_mapping[_normalize(month)] for month in months]

# Encode the 'month' column as integers and hold the result as a NumPy array.
months_data = np.array(switch_month_integer(df['month']))

In [3]:
# Feature matrix: every column from position 4 up to (but not including) the last one.
x = df.loc[:, df.columns[4:-1]].to_numpy()

# Candidate label vectors taken from the 'X' and 'Y' columns.
y_area_1, y_area_2 = df['X'].to_numpy(), df['Y'].to_numpy()

# Smallest value in each label vector (used afterwards to shift the labels).
min_data_1, min_data_2 = y_area_1.min(), y_area_2.min()

In [4]:
# Shift the labels so that the smallest class id becomes 0.
# The shifted labels are computed from `df` into fresh arrays instead of being
# written element by element into the arrays returned by `to_numpy()`: those
# arrays may share memory with `df` (silently changing the frame) or be
# read-only under pandas Copy-on-Write, and an in-place shift would subtract
# the minimum once more every time this cell is re-run.
y_area_1 = df['X'].to_numpy() - min_data_1
y_area_2 = df['Y'].to_numpy() - min_data_2

In [5]:
from collections import Counter

# Report how the samples are spread over the classes of a label vector
def calculate_distribution(y):
    """Print the sample count and percentage share of every distinct value in ``y``.

    One line is printed per class, in order of first appearance in ``y``.
    Nothing is returned.
    """
    for label, count in Counter(y).items():
        print('Class=%d, n=%d (%.3f%%)' % (label, count, count / len(y) * 100))

# Show the class balance of both candidate label vectors.
for heading, labels in (
    ("Area X distribution : ", y_area_1),
    ("Area Y distribution : ", y_area_2),
):
    print(heading)
    calculate_distribution(labels)

# Decision: only the X labels (y_area_1) are used as the target from here on.

Area X distribution : 
Class=6, n=60 (11.605%)
Class=7, n=61 (11.799%)
Class=5, n=86 (16.634%)
Class=4, n=30 (5.803%)
Class=3, n=91 (17.602%)
Class=1, n=73 (14.120%)
Class=8, n=13 (2.515%)
Class=0, n=48 (9.284%)
Class=2, n=55 (10.638%)
Area Y distribution : 
Class=3, n=125 (24.178%)
Class=2, n=203 (39.265%)
Class=4, n=74 (14.313%)
Class=1, n=64 (12.379%)
Class=0, n=44 (8.511%)
Class=7, n=6 (1.161%)
Class=6, n=1 (0.193%)


In [6]:
# Append the encoded month as one extra feature column to the right of x.
complete_x = np.concatenate((x, months_data[:, np.newaxis]), axis=1)

In [7]:
from sklearn.preprocessing import MinMaxScaler

# Min-max normalisation of every feature column, using the per-column minimum
# and maximum observed in complete_x.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# made later, so test rows influence the scaling - consider fitting on the
# training rows only.
scaler = MinMaxScaler().fit(complete_x)
complete_x = scaler.transform(complete_x)

In [8]:
# Sanity check: feature-matrix shape before (x) and after (complete_x) appending the month column.
print("Before : {} After : {}".format(x.shape, complete_x.shape))

Before : (517, 8) After : (517, 9)


In [9]:
# Cross Validation

from sklearn.model_selection import StratifiedKFold as skf

# Number of stratified folds.
K = 2
skf_conf = skf(n_splits=K, random_state=42, shuffle=True)

# Keep every fold. Rebinding x_train / x_test inside the loop alone leaves only
# the final split reachable once the loop ends, so earlier folds would be lost.
# Each entry is (x_train, y_train, x_test, y_test) for one fold.
cv_folds = []
for i, (train_idx, test_idx) in enumerate(skf_conf.split(complete_x, y_area_1)):
    x_train, y_train = complete_x[train_idx], y_area_1[train_idx]
    x_test, y_test = complete_x[test_idx], y_area_1[test_idx]
    cv_folds.append((x_train, y_train, x_test, y_test))

# x_train / y_train / x_test / y_test stay bound to the last fold, exactly as
# before, so cells that use those names keep working unchanged.

In [10]:
# Shape of the training split left bound by the fold loop above.
x_train.shape

(259, 9)

In [11]:
# Class balance of the train and test label vectors.
for heading, labels in (
    ("Area X Train Distribution : ", y_train),
    ("Area X Test Distribution : ", y_test),
):
    print(heading)
    calculate_distribution(labels)

Area X Train Distribution : 
Class=6, n=30 (11.583%)
Class=7, n=31 (11.969%)
Class=5, n=43 (16.602%)
Class=4, n=15 (5.792%)
Class=3, n=45 (17.375%)
Class=1, n=37 (14.286%)
Class=0, n=24 (9.266%)
Class=2, n=28 (10.811%)
Class=8, n=6 (2.317%)
Area X Test Distribution : 
Class=6, n=30 (11.628%)
Class=7, n=30 (11.628%)
Class=5, n=43 (16.667%)
Class=3, n=46 (17.829%)
Class=4, n=15 (5.814%)
Class=1, n=36 (13.953%)
Class=8, n=7 (2.713%)
Class=0, n=24 (9.302%)
Class=2, n=27 (10.465%)


In [14]:
# Supervised machine-learning algorithms used:
# 1. KNN
# 2. Naive Bayes
# 3. SVM
# 4. Decision Tree
# 5. Random Forest
# 6. Neural Network

def knn_fit(x_train, y_train, k=3):
    """Fit a k-nearest-neighbours classifier and return it.

    Parameters
    ----------
    x_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.
    k : number of neighbours (``n_neighbors``), 3 by default.

    Returns
    -------
    The fitted ``KNeighborsClassifier``.
    """
    from sklearn.neighbors import KNeighborsClassifier as knearest

    # fit() returns the estimator itself, so the fitted model can be returned directly.
    return knearest(n_neighbors=k).fit(x_train, y_train)

def naive_bayes_fit(x_train, y_train):
    """Fit a Gaussian naive Bayes classifier with default settings and return it.

    Parameters
    ----------
    x_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.

    Returns
    -------
    The fitted ``GaussianNB`` instance.
    """
    from sklearn.naive_bayes import GaussianNB

    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    return classifier

def svm_fit(x_train, y_train):
    """Fit a support-vector classifier and return it.

    The classifier is created with ``decision_function_shape='ovo'``; every
    other setting is left at the library default.

    Parameters
    ----------
    x_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.

    Returns
    -------
    The fitted ``SVC`` instance.
    """
    from sklearn.svm import SVC

    classifier = SVC(decision_function_shape='ovo')
    classifier.fit(x_train, y_train)
    return classifier

def dec_tree_fit(x_train, y_train, random_state=42):
    """Fit a decision-tree classifier and return it.

    Parameters
    ----------
    x_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.
    random_state : seed forwarded to ``DecisionTreeClassifier``. A fixed
        default makes repeated runs produce the same tree; pass ``None`` to
        get unseeded behaviour.

    Returns
    -------
    The fitted ``DecisionTreeClassifier``.
    """
    from sklearn import tree

    # Seeded so the notebook's reported metrics can be reproduced on re-run.
    tree_clf = tree.DecisionTreeClassifier(random_state=random_state)
    tree_clf.fit(x_train, y_train)

    return tree_clf

def random_forest_fit(x_train, y_train, estimators=7, random_state=42):
    """Fit a random-forest classifier and return it.

    Parameters
    ----------
    x_train : training feature matrix passed to ``fit``.
    y_train : training labels passed to ``fit``.
    estimators : number of trees (``n_estimators``), 7 by default.
    random_state : seed forwarded to ``RandomForestClassifier``. A fixed
        default makes repeated runs produce the same forest; pass ``None`` to
        get unseeded behaviour.

    Returns
    -------
    The fitted ``RandomForestClassifier``.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Seeded so the notebook's reported metrics can be reproduced on re-run.
    random_forest_clf = RandomForestClassifier(
        n_estimators=estimators,
        random_state=random_state,
    )
    random_forest_clf.fit(x_train, y_train)

    return random_forest_clf

def deep_learning_fit(x_train, y_train, x_test, y_test):
    """Build, train and return a small LSTM-based Keras classifier.

    Parameters
    ----------
    x_train, x_test : 2-D feature matrices (samples, features). A length-1
        sequence axis is inserted at position 1 before they reach the model.
    y_train, y_test : integer class labels starting at 0, as required by the
        sparse categorical cross-entropy loss used here.

    Returns
    -------
    The trained ``tf.keras`` model. Because the LSTM layer keeps
    ``return_sequences=True``, predictions carry the length-1 sequence axis.

    Notes
    -----
    ``x_test`` / ``y_test`` are passed as ``validation_data`` only, i.e. they
    are used for per-epoch validation metrics during the 30 training epochs.
    """
    import tensorflow as tf

    # (samples, features) -> (samples, 1, features)
    x_train = np.expand_dims(x_train, axis=1)
    x_test = np.expand_dims(x_test, axis=1)

    # Derive the layer sizes from the data instead of hard-coding 9 features
    # and 9 classes, so the function keeps working if either changes.
    n_features = x_train.shape[-1]
    n_classes = int(max(np.max(y_train), np.max(y_test))) + 1

    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(1, n_features)),
        tf.keras.layers.LSTM(16, return_sequences=True),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(n_classes, activation='softmax'),
    ])

    model.compile(loss='sparse_categorical_crossentropy', metrics=['accuracy'], optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))

    model.fit(
        x=x_train,
        y=y_train,
        validation_data=(x_test, y_test),
        epochs=30,
        verbose=1
    )

    return model


# Train every model on the training split.
knn_model = knn_fit(x_train=x_train, y_train=y_train, k=4)
naive_bayes_model = naive_bayes_fit(x_train=x_train, y_train=y_train)
svm_model = svm_fit(x_train=x_train, y_train=y_train)
dec_tree_model = dec_tree_fit(x_train=x_train, y_train=y_train)
rand_forest_model = random_forest_fit(x_train=x_train, y_train=y_train, estimators=7)
deep_learning_model = deep_learning_fit(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [28]:
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

# Get model predictions and calculate metrics
def _classification_metrics(y_true, y_pred, average):
    """Return (accuracy, f1, precision, recall) for the given predictions."""
    accuracy = accuracy_score(y_true=y_true, y_pred=y_pred)
    f1 = f1_score(y_true=y_true, y_pred=y_pred, average=average)
    recall = recall_score(y_true=y_true, y_pred=y_pred, average=average)
    precision = precision_score(y_true=y_true, y_pred=y_pred, average=average)

    return (accuracy, f1, precision, recall)


def predict(model, x_test, y_test, average='micro'):
    """Score a fitted classifier whose ``predict`` returns class labels.

    Parameters
    ----------
    model : fitted estimator exposing ``predict(x_test)``.
    x_test : feature matrix to predict on.
    y_test : true labels for ``x_test``.
    average : averaging mode forwarded to the F1 / recall / precision scorers.
        The default ``'micro'`` keeps the previous behaviour. With one label
        per sample, micro-averaged precision, recall and F1 all equal the
        accuracy; pass e.g. ``'macro'`` to obtain distinct per-class averages.

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall)``.
    """
    result = model.predict(x_test)

    return _classification_metrics(y_test, result, average)


def predict_dl(model, x_test, y_test, average='micro'):
    """Score the Keras model, whose ``predict`` returns per-class confidences.

    ``x_test`` gets a sequence axis inserted at position 1 before prediction,
    mirroring the expansion applied to the inputs at training time. The
    predicted class of each sample is the index of its highest confidence.

    Parameters
    ----------
    model : trained model exposing ``predict``.
    x_test : 2-D feature matrix to predict on.
    y_test : true labels for ``x_test``.
    average : averaging mode forwarded to the F1 / recall / precision scorers
        (default ``'micro'``, the previous behaviour).

    Returns
    -------
    tuple
        ``(accuracy, f1, precision, recall)``.
    """
    x_test = np.expand_dims(x_test, axis=1)

    result = model.predict(x_test)
    # np.argmax without an axis works on the flattened per-sample output, so
    # the extra sequence axis in each row does not matter.
    prediction_result = [np.argmax(pred_result) for pred_result in result]

    return _classification_metrics(y_test, prediction_result, average)


# Evaluate every trained model on the test split.
knn_result = predict(model=knn_model, x_test=x_test, y_test=y_test)
naive_bayes_result = predict(model=naive_bayes_model, x_test=x_test, y_test=y_test)
svm_result = predict(model=svm_model, x_test=x_test, y_test=y_test)
decision_tree_result = predict(model=dec_tree_model, x_test=x_test, y_test=y_test)
random_forest_result = predict(model=rand_forest_model, x_test=x_test, y_test=y_test)
# The Keras model needs the variant that converts confidences into class ids.
deep_learning_result = predict_dl(model=deep_learning_model, x_test=x_test, y_test=y_test)



In [33]:
def print_result(model_name='KNN', metrics_result:tuple=()):
    """Print a labelled metrics report for one model.

    Parameters
    ----------
    model_name : label shown in the report's first line.
    metrics_result : 4-tuple ordered as ``(accuracy, f1, precision, recall)``.
        Unpacking fails with ``ValueError`` if it does not hold exactly four
        values.
    """
    accuracy, f1, precision, recall = metrics_result

    report_lines = [
        "Result of {} : ".format(model_name),
        "Accuracy : {}".format(accuracy),
        "F1 : {}".format(f1),
        "Precision : {}".format(precision),
        "Recall : {}".format(recall),
        "--------------------------------------------------",
    ]
    # A single print of the joined lines emits the same text as one print per line.
    print("\n".join(report_lines))

# Print one report per model, in the same order the models were evaluated.
for label, scores in (
    ('KNN', knn_result),
    ('Naive Bayes', naive_bayes_result),
    ('SVM', svm_result),
    ('Decision Tree', decision_tree_result),
    ('Random Forest', random_forest_result),
    ('Deep Learning', deep_learning_result),
):
    print_result(model_name=label, metrics_result=scores)

Result of KNN : 
Accuracy : 0.1821705426356589
F1 : 0.18217054263565893
Precision : 0.1821705426356589
Recall : 0.1821705426356589
--------------------------------------------------
Result of Naive Bayes : 
Accuracy : 0.16279069767441862
F1 : 0.16279069767441862
Precision : 0.16279069767441862
Recall : 0.16279069767441862
--------------------------------------------------
Result of SVM : 
Accuracy : 0.18604651162790697
F1 : 0.18604651162790695
Precision : 0.18604651162790697
Recall : 0.18604651162790697
--------------------------------------------------
Result of Decision Tree : 
Accuracy : 0.1821705426356589
F1 : 0.18217054263565893
Precision : 0.1821705426356589
Recall : 0.1821705426356589
--------------------------------------------------
Result of Random Forest : 
Accuracy : 0.20155038759689922
F1 : 0.20155038759689922
Precision : 0.20155038759689922
Recall : 0.20155038759689922
--------------------------------------------------
Result of Deep Learning : 
Accuracy : 0.1782945736434

In [35]:
import pickle
# Save model
def save_model(model, path, is_dl=False):
    """Persist a trained model to ``path``.

    Parameters
    ----------
    model : object to save. With ``is_dl=True`` it must expose ``save(path)``;
        otherwise it must be picklable.
    path : destination file path. Missing parent directories are created
        first, so saving does not fail when the target folder is absent.
    is_dl : when True, delegate to ``model.save(path)``; when False (default),
        serialise the object with ``pickle``.

    Notes
    -----
    Pickle files can execute arbitrary code when loaded; only load files
    written by this function from a trusted location.
    """
    import os

    parent_dir = os.path.dirname(path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    if is_dl:
        model.save(path)
    else:
        with open(path, 'wb') as f:
            pickle.dump(model, f)

# The models were trained on features transformed by `scaler`, so the fitted
# scaler is saved alongside them; without it the saved models cannot be
# applied to new, unscaled data.
save_model(scaler, path='model/scaler.pkl')

save_model(knn_model, path='model/model_knn.pkl')
save_model(naive_bayes_model, path='model/model_naive_bayes.pkl')
save_model(svm_model, path='model/model_svm.pkl')
save_model(dec_tree_model, path='model/model_decision_tree.pkl')
save_model(rand_forest_model, path='model/model_random_forest.pkl')
# The Keras model is written through its own save() method.
save_model(deep_learning_model, path='model/lstm.h5', is_dl=True)