## Import Statements

In [7]:
import gc

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import clone
from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

from sklearn.model_selection import cross_val_score

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [8]:
class OrdinalClassifier:
    """Ordinal classifier assembled from binary "is the label greater than c?" models.

    For K ordered classes c_0 < c_1 < ... < c_{K-1}, ``fit`` trains K-1 clones
    of the wrapped estimator; clone ``i`` learns the binary target
    ``label > c_i``. ``predict_proba`` turns those K-1 estimates back into one
    probability column per class by differencing neighbouring thresholds.

    Parameters
    ----------
    classifier
        Unfitted estimator used as the template for every binary model. It is
        cloned with ``sklearn.base.clone`` and must provide ``fit`` and
        ``predict_proba`` (the notebook passes a ``RandomForestClassifier``).

    Attributes
    ----------
    classifiers : dict
        Maps threshold position ``i`` to the fitted binary model for
        ``label > unique_class[i]``.
    unique_class : numpy.ndarray
        Sorted distinct labels seen by ``fit``.
    classifiers_predict : dict
        Raw ``predict_proba`` output of every binary model from the most
        recent ``predict_proba`` call.
    """

    def __init__(self, classifier):
        self.classifier = classifier
        self.classifiers = {}

    def fit(self, input, output):
        """Fit one binary model per threshold between consecutive classes.

        Parameters
        ----------
        input
            Feature matrix, passed unchanged to each binary model's ``fit``.
        output
            Ordinal target labels; any sortable values are accepted.

        Returns
        -------
        OrdinalClassifier
            ``self``, so calls can be chained.
        """
        output = np.asarray(output)
        # np.unique already returns the distinct values in sorted order.
        self.unique_class = np.unique(output)
        # Start from scratch so a refit never keeps models from an earlier fit.
        self.classifiers = {}

        # The largest class needs no model: nothing can be greater than it.
        for idx, threshold in enumerate(self.unique_class[:-1]):
            binary_output = (output > threshold).astype(np.uint8)
            classifier = clone(self.classifier)
            classifier.fit(input, binary_output)
            self.classifiers[idx] = classifier

        return self

    def predict_proba(self, input):
        """Return per-class probabilities, one column per entry of ``unique_class``.

        With ``g_i = P(label > c_i)`` taken from binary model ``i``:

        * first class:  ``1 - g_0``
        * middle class: ``g_{i-1} - g_i``
        * last class:   ``g_{K-2}``

        The binary models are fitted independently, so a middle column can be
        slightly negative when neighbouring estimates are not monotone.

        Returns
        -------
        numpy.ndarray
            Array of shape ``(n_samples, n_classes)``.
        """
        self.classifiers_predict = {
            idx: model.predict_proba(input) for idx, model in self.classifiers.items()
        }

        n_classes = len(self.unique_class)
        if n_classes == 1:
            # Only one label was ever seen, so it is predicted with certainty.
            return np.ones((len(input), 1))

        # Column 1 of each binary model is P(label > threshold). Models are
        # looked up by threshold position, not by label value, so labels do
        # not have to be the integers 1..K.
        greater = [self.classifiers_predict[idx][:, 1] for idx in range(n_classes - 1)]

        predicted = [1 - greater[0]]
        predicted.extend(lower - upper for lower, upper in zip(greater, greater[1:]))
        predicted.append(greater[-1])

        return np.vstack(predicted).T

    def predict(self, input):
        """Return the 0-based position of the most probable class for each sample.

        The result indexes into ``unique_class``; it is NOT the label itself.
        """
        return np.argmax(self.predict_proba(input=input), axis=1)

In [9]:
class Learner:
    """Train, validate and apply an ordinal classifier on the damage-grade data.

    Parameters
    ----------
    train_df : pd.DataFrame
        Training frame; must contain ``building_id`` and ``damage_grade``
        columns. Every other column is used as a feature.
    test_df : pd.DataFrame
        Frame to predict on; must contain ``building_id``.
    submission_df : pd.DataFrame
        Submission template with a ``damage_grade`` column that is overwritten
        with the predictions before saving.
    ordinal_classifier : OrdinalClassifier
        Model to fit. Its ``predict`` returns 0-based class positions, which
        are mapped to labels through its ``unique_class`` attribute.
    """

    def __init__(self, train_df : pd.DataFrame, test_df : pd.DataFrame, submission_df : pd.DataFrame, ordinal_classifier : "OrdinalClassifier"):
        self.train_df = train_df
        self.test_df = test_df
        self.submission_df = submission_df

        self.ordinal_classifier = ordinal_classifier

    def split_dataset(self):
        """Split ``train_df`` 70/30 into training and validation parts.

        No ``random_state`` is set, so every call produces a different split.

        Returns
        -------
        tuple
            ``(train_input, validation_input, train_output, validation_output)``;
            the same four objects are also stored on the instance.
        """
        features = self.train_df.drop(columns=["building_id", "damage_grade"])
        target = self.train_df["damage_grade"]
        self.train_input, self.validation_input, self.train_output, self.validation_output = tts(features, target, test_size=0.3)
        return self.train_input, self.validation_input, self.train_output, self.validation_output

    def _predict_labels(self, features):
        """Predict class labels (not 0-based positions) for ``features``."""
        class_positions = self.ordinal_classifier.predict(features)
        # Look the labels up instead of adding 1, so the mapping stays correct
        # whatever label values the classifier was fitted on.
        return np.asarray(self.ordinal_classifier.unique_class)[class_positions]

    def create_file(self, predicted_output, file_num : str, obj =True):
        """Write ``predicted_output`` into a copy of the submission template and save it.

        Parameters
        ----------
        predicted_output
            Array of predicted labels, one per row of ``submission_df``.
        file_num : str
            Suffix appended to the output file name.
        obj : bool
            ``True`` writes ``submission_<file_num>.csv``, ``False`` writes
            ``submission_norm_<file_num>.csv``; both go to
            ``../datasets/submissions_orc/``.

        If the number of predictions does not match the template, nothing is
        written and a warning is emitted.
        """
        if predicted_output.shape[0] != self.submission_df.shape[0]:
            import warnings

            warnings.warn(
                "Submission file not written: got "
                + str(predicted_output.shape[0])
                + " predictions for "
                + str(self.submission_df.shape[0])
                + " submission rows."
            )
            return

        submission = self.submission_df.copy()
        # One positional column assignment replaces the per-row loop, which
        # used Series.iteritems() (removed in pandas 2.0) and chained indexing.
        submission["damage_grade"] = np.asarray(predicted_output)

        prefix = "submission_" if obj else "submission_norm_"
        submission.to_csv("../datasets/submissions_orc/" + prefix + file_num + ".csv", index=False, header=True)

    def train_model(self):
        """Fit the classifier on a fresh split and store validation metrics.

        Sets ``acc_score``, ``classification_reports`` and ``f1_score``.
        """
        train_input, validation_input, train_output, validation_output = self.split_dataset()

        self.ordinal_classifier.fit(train_input, train_output)

        predicted_output = self._predict_labels(validation_input)

        self.acc_score = accuracy_score(validation_output, predicted_output)
        self.classification_reports = self.create_classification_report(validation_output, predicted_output)
        self.f1_score = self.get_f1_score(validation_output, predicted_output)

    def test_model(self, file_num : int,  create_file=True, obj=True):
        """Predict labels for ``test_df`` and optionally write a submission file.

        Missing values in ``test_df`` are replaced by the column median first;
        the filled frame is stored back on ``self.test_df``.

        Returns
        -------
        numpy.ndarray
            Predicted labels, one per row of ``test_df``.
        """
        # numeric_only=True keeps non-numeric columns (e.g. a string id) from
        # raising in pandas >= 2.0; such columns are simply left unfilled.
        self.test_df = self.test_df.fillna(self.test_df.median(numeric_only=True))
        predicted_output = self._predict_labels(self.test_df.drop(columns=["building_id"]))
        if create_file:
            self.create_file(predicted_output, str(file_num), obj=obj)
        return predicted_output

    def plot_confusion_matrix(self):
        """Plot the row-normalised confusion matrix of the validation split.

        Requires ``train_model`` (or ``split_dataset`` plus a fitted
        classifier) to have run first.

        Returns
        -------
        sklearn.metrics.ConfusionMatrixDisplay
            The display object that was drawn.
        """
        # Imported here because the notebook's import cell does not provide it.
        from sklearn.metrics import ConfusionMatrixDisplay

        # The matrix is built from predictions because the wrapped model is not
        # a scikit-learn estimator and cannot be handed to sklearn's plotter.
        labels = self.ordinal_classifier.unique_class
        predicted_output = self._predict_labels(self.validation_input)
        matrix = confusion_matrix(self.validation_output, predicted_output, labels=labels, normalize='true')
        display = ConfusionMatrixDisplay(confusion_matrix=matrix, display_labels=labels)
        return display.plot(cmap=plt.cm.Blues)

    def create_classification_report(self, validation_output, predicted_output):
        """Return scikit-learn's classification report as a DataFrame (one row per class/average)."""
        report = classification_report(validation_output, predicted_output, output_dict=True)
        return pd.DataFrame(report).transpose()

    def get_cross_validation_score(self):
        """Run 3-fold cross-validation of the classifier on the validation split.

        NOTE(review): ``cross_val_score`` clones the estimator and scores it
        through the scikit-learn estimator API. The wrapped classifier appears
        to lack ``get_params``/``score``, so this call likely raises - confirm
        before relying on it.
        """
        return cross_val_score(self.ordinal_classifier, self.validation_input, self.validation_output, cv=3)

    def get_f1_score(self, validation_output, predicted_output):
        """Return the micro-averaged F1 score of the predictions."""
        return f1_score(y_true=validation_output, y_pred=predicted_output, average='micro')

## Public Variables

In [10]:
# Class labels; these are the row labels that appear in the classification
# reports printed by the training cells below.
classes = (1, 2, 3)
# Grid swept by the training cells: each value is passed as `n_estimators`
# to RandomForestClassifier.
n_estimators = (200, 300)
# Grid swept by the training cells: each value is passed as `max_depth`
# to RandomForestClassifier.
max_depths = (30, 35, 40)

## Datasets

In [11]:
# "normalize" variant of the preprocessed train/test data.
# NOTE(review): presumably the features were scaled upstream - confirm
# against the preprocessing step that wrote these files.
train_norm_df = pd.read_csv("../datasets/preprocessed/train_normalize.csv")
test_norm_df = pd.read_csv("../datasets/preprocessed/test_normalize.csv")

# "no_object" variant of the preprocessed train/test data.
# NOTE(review): presumably object-dtype columns were encoded or removed
# upstream - confirm against the preprocessing step.
train_obj_df = pd.read_csv("../datasets/preprocessed/train_no_object.csv")
test_obj_df = pd.read_csv("../datasets/preprocessed/test_no_object.csv")

# Submission template: Learner.create_file copies it and overwrites its
# `damage_grade` column with the predictions before saving.
submission_df = pd.read_csv("../datasets/submission_format.csv")

## Train on the no-object dataset, then predict test values

In [6]:
# Sweep every (n_estimators, max_depth) pair on the no-object data: fit an
# ordinal random-forest model, print its validation report and micro-F1, then
# write test-set predictions to a submission file numbered estimator + depth.
for estimator, depth in ((e, d) for e in n_estimators for d in max_depths):
    ordinal_classifier = OrdinalClassifier(
        classifier=RandomForestClassifier(n_estimators=estimator, max_depth=depth)
    )
    learner = Learner(
        train_df=train_obj_df,
        test_df=test_obj_df,
        submission_df=submission_df,
        ordinal_classifier=ordinal_classifier,
    )
    learner.train_model()

    print(learner.classification_reports)
    print(90 * "*")
    print(estimator, depth, "------", learner.f1_score)
    print(90 * "*")

    learner.test_model(file_num=estimator + depth)

              precision    recall  f1-score       support
1              0.651328  0.478296  0.551559   7487.000000
2              0.728899  0.828123  0.775349  44590.000000
3              0.722109  0.609217  0.660876  26104.000000
accuracy       0.721531  0.721531  0.721531      0.721531
macro avg      0.700778  0.638545  0.662595  78181.000000
weighted avg   0.719203  0.721531  0.715696  78181.000000
******************************************************************************************
40 ------ 0.7215308067177447
******************************************************************************************
              precision    recall  f1-score      support
1              0.646027  0.483587  0.553128   7616.00000
2              0.725992  0.825559  0.772581  44376.00000
3              0.722863  0.607736  0.660319  26189.00000
accuracy       0.719280  0.719280  0.719280      0.71928
macro avg      0.698294  0.638961  0.662009  78181.00000
weighted avg   0.717154  0.719280  0.7135

In [6]:
# Same sweep as for the no-object data, but on the normalized data; the
# submissions are written with the "submission_norm_" prefix (obj=False).
for estimator, depth in ((e, d) for e in n_estimators for d in max_depths):
    ordinal_classifier = OrdinalClassifier(
        classifier=RandomForestClassifier(n_estimators=estimator, max_depth=depth)
    )
    learner = Learner(
        train_df=train_norm_df,
        test_df=test_norm_df,
        submission_df=submission_df,
        ordinal_classifier=ordinal_classifier,
    )
    learner.train_model()

    print(learner.classification_reports)
    print(90 * "*")
    print(estimator, depth, "------", learner.f1_score)
    print(90 * "*")

    learner.test_model(file_num=estimator + depth, obj=False)

              precision    recall  f1-score       support
1              0.646097  0.481651  0.551885   7630.000000
2              0.725226  0.829246  0.773756  44321.000000
3              0.729452  0.606672  0.662421  26230.000000
accuracy       0.720648  0.720648  0.720648      0.720648
macro avg      0.700258  0.639190  0.662687  78181.000000
weighted avg   0.718921  0.720648  0.714749  78181.000000
******************************************************************************************
200 40 ------ 0.7206482393420396
******************************************************************************************
              precision    recall  f1-score       support
1              0.642175  0.480787  0.549884   7417.000000
2              0.726005  0.824955  0.772323  44360.000000
3              0.725137  0.610286  0.662773  26404.000000
accuracy       0.719804  0.719804  0.719804      0.719804
macro avg      0.697772  0.638676  0.661660  78181.000000
weighted avg   0.717759  0.719804  0.714222  