In [1]:
import gc

from lightgbm import LGBMRanker, Dataset

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [2]:
class  Learner:
    def __init__(self, train_df : pd.DataFrame, test_df : pd.DataFrame, submission_df : pd.DataFrame):
        self.train_df = train_df
        self.test_df = test_df
        self.submission_df = submission_df

        self.train_input, self.validation_input, self.train_output, self.validation_output = tts(self.train_df.drop(columns=["building_id", "damage_grade"]), self.train_df["damage_grade"], test_size=0.3)

        self.f1_score = None
        self.classification_report = None
        self.evaluation_results = {}
        self.ranker_model = None

    def train_model(self, boosting_type: str, num_leaves: int , max_depth: int , learning_rate: float, n_estimators: int, early_stopping_round: int):
        self.ranker_model = LGBMRanker(boosting_type=boosting_type, num_leaves=num_leaves, max_depth=max_depth, learning_rate=learning_rate, n_estimators=n_estimators, bagging_freq=1, feature_fraction=0.8, bagging_fraction=0.5)

        #print(self.train_output.value_counts(ascending=True))
        ### TRAIN ####
        self.ranker_model.fit(X=self.train_input, y=self.train_output, group=self.train_output.value_counts(sort=False), eval_set=[(self.validation_input, self.validation_output)], eval_group=[self.validation_output.value_counts(sort=False)])
        print(self.train_output.value_counts(ascending=True))

        #### EVALUATION ######
        predicted_output = self.ranker_model.predict(X=self.validation_input, num_iteration=self.ranker_model.best_iteration)

        self.evaluation_results = self.ranker_model.evals_result_
        self.f1_score = f1_score(y_pred=predicted_output, y_true=self.validation_output, average='micro')
        self.classification_report = pd.DataFrame(classification_report(y_true=self.validation_output, y_pred=predicted_output))

    def test_model(self,file_num : int,  create_file=True, obj=True):
       predicted_output = self.ranker_model.predict(self.test_df.drop(columns=["building_id"]), num_iteration=self.ranker_model.best_iteration)
       predicted_output = predicted_output + 1
       if create_file:
           self.create_file(predicted_output, str(file_num), obj=obj)
       return predicted_output

    def create_file(self, predicted_output, file_num : str, obj =True):
        if predicted_output.shape[0] == self.submission_df.shape[0]:
            #print(True)
            submission = self.submission_df.copy()
            for (idx,data) in submission["damage_grade"].iteritems():
                submission["damage_grade"][idx] = predicted_output[idx]
            if obj:
                submission.to_csv("../datasets/submissions_lgbm/submission_" + file_num + ".csv", index=False, header=True)
            else:
                submission.to_csv("../datasets/submissions_lgbm/submission_norm_" + file_num + ".csv", index=False, header=True)
            del submission
            gc.collect()
    
    def plot_confusion_matrix(self):
        plot_conf_matrix = plot_confusion_matrix(self.ranker_model, self.validation_input, self.validation_output, cmap=plt.cm.Blues, normalize='true') 
        return plot_conf_matrix

In [3]:
learning_rate = (0.1, 0.01, 0.03, 0.05)
boosting_type = ('gbdt', 'rf')
objective = ('multiclass', 'multiclassova', 'lambdarank')
num_leaves =(31, 36, 41, 46)
feature_fraction = (0.9, 0.8, 0.7)
bagging_fraction = (0.8, 0.7, 0.6)
bagging_freq = (5, 10, 15)
verbose = 0
early_stopping_rounds = (5, 10, 15)
max_depth = (30, 35, 40, 45)
n_estimators = (200, 300, 400)

In [4]:
train_norm_df = pd.read_csv("../datasets/preprocessed/train_normalize.csv")
test_norm_df = pd.read_csv("../datasets/preprocessed/test_normalize.csv")

train_obj_df = pd.read_csv("../datasets/preprocessed/train_no_object.csv")
test_obj_df = pd.read_csv("../datasets/preprocessed/test_no_object.csv")

submission_df = pd.read_csv("../datasets/submission_format.csv")



In [5]:
learner = Learner(train_df=train_obj_df, test_df=test_obj_df, submission_df=submission_df)

In [6]:

learner.train_model("gbdt", num_leaves=num_leaves[0], max_depth=4, n_estimators=30, learning_rate=learning_rate[0], early_stopping_round=early_stopping_rounds[1])
print(learner.f1_score)
print("*" * 90)
print(learner.classification_report)
print("*" * 90)
print(learner.evaluation_results)
print("*" * 90)
learner.plot_confusion_matrix()


[1]	valid_0's ndcg@1: 0.428571
[2]	valid_0's ndcg@1: 0.428571
[3]	valid_0's ndcg@1: 0.428571
[4]	valid_0's ndcg@1: 0.428571
[5]	valid_0's ndcg@1: 0.428571
[6]	valid_0's ndcg@1: 0.428571
[7]	valid_0's ndcg@1: 0.428571
[8]	valid_0's ndcg@1: 0.428571
[9]	valid_0's ndcg@1: 0.428571
[10]	valid_0's ndcg@1: 0.428571


KeyboardInterrupt: 