## Import Statements

In [9]:
import gc

import lightgbm as lgbm
from lightgbm import LGBMRanker, LGBMClassifier, Dataset

from sklearn.model_selection import train_test_split as tts

from sklearn.metrics import accuracy_score, precision_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, plot_confusion_matrix

from sklearn.model_selection import cross_val_score

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

In [10]:
class Learner:
    def __init__(self, train_df : pd.DataFrame, test_df : pd.DataFrame, submission_df : pd.DataFrame):
        self.train_df = train_df
        self.test_df = test_df
        self.submission_df = submission_df


    def split_dataset(self):
        self.train_input, self.validation_input, self.train_output, self.validation_output = tts(self.train_df.drop(columns=["building_id", "damage_grade"]), self.train_df["damage_grade"], test_size=0.3)
        return self.train_input, self.validation_input, self.train_output, self.validation_output
    
    def create_file(self, predicted_output, file_num : str, obj =True):
        if predicted_output.shape[0] == self.submission_df.shape[0]:
            #print(True)
            submission = self.submission_df.copy()
            for (idx,data) in submission["damage_grade"].iteritems():
                submission["damage_grade"][idx] = predicted_output[idx]    
            if obj:
                submission.to_csv("../datasets/submissions_lgbm/submission_" + file_num + ".csv", index=False, header=True)
            else:
                submission.to_csv("../datasets/submissions_lgbm/submission_norm_" + file_num + ".csv", index=False, header=True)
            del submission
            gc.collect()   
    

    def train_model(self, parameters: dict, num_boost_rounds : int, early_stopping_rounds: int):
        dataset = self.split_dataset()
        train_dataset = Dataset(data=dataset[0], label=dataset[2]-1)
        valid_dataset = Dataset(data=dataset[1], label=dataset[3]-1, reference=train_dataset)
        self.evaluation_results = {}

        ##### TRAIN ######
        self.classifer_model = lgbm.train(params=parameters, train_set=train_dataset, num_boost_round=num_boost_rounds, early_stopping_rounds=early_stopping_rounds, valid_sets=valid_dataset, evals_result=self.evaluation_results)

        #### EVAL ######
        predicted_output = self.classifer_model.predict(dataset[1], num_iteration=self.classifer_model.best_iteration)
        predicted_output = np.argmax(predicted_output, axis=1)+1
        #### SCORING #####
        #print(predicted_output.shape, dataset[3].shape)
        #print(predicted_output)
        self.f1_score = f1_score(y_true=dataset[3], y_pred=pd.Series(predicted_output), average='micro')
        self.classification_report = pd.DataFrame(classification_report(y_true=dataset[3], y_pred=pd.Series(predicted_output), output_dict=True)).transpose()
    

    def test_model(self,file_num : int,  create_file=True, obj=True):
       predicted_output = self.classifer_model.predict(self.test_df.drop(columns=["building_id"]), num_iteration=self.classifer_model.best_iteration)
       predicted_output = predicted_output + 1
       if create_file:
           self.create_file(predicted_output, str(file_num), obj=obj)
       return predicted_output






 ## Parameters 

In [11]:
learning_rate = (0.1, 0.01, 0.03, 0.05)
boosting_type = ('gbdt', 'rf')
objective = ('multiclass', 'multiclassova', 'lambdarank')
num_leaves =(31, 36, 41, 46)
feature_fraction = (0.9, 0.8, 0.7)
bagging_fraction = (0.8, 0.7, 0.6)
bagging_freq = (5, 10, 15)
verbose = 0
early_stopping_rounds = (5, 10, 15)

In [12]:
model_parmas = {
    'boosting_type' : boosting_type[1],
    'objective': objective[2],
    
    'num_leaves': num_leaves[2],
    'learning_rate': learning_rate[3],
    'feature_fraction': feature_fraction[0],
    'bagging_fraction': bagging_fraction[0],
    'bagging_freq': bagging_freq[0],
    'verbose': verbose
}

## Datasets

In [13]:
train_norm_df = pd.read_csv("../datasets/preprocessed/train_normalize.csv")
test_norm_df = pd.read_csv("../datasets/preprocessed/test_normalize.csv")

train_obj_df = pd.read_csv("../datasets/preprocessed/train_no_object.csv")
test_obj_df = pd.read_csv("../datasets/preprocessed/test_no_object.csv")

submission_df = pd.read_csv("../datasets/submission_format.csv")

## Train for No object data then predict test values

In [14]:
learner = Learner(train_df=train_obj_df, test_df=test_obj_df, submission_df=submission_df)


In [15]:
learner.train_model(parameters=model_parmas, num_boost_rounds=5, early_stopping_rounds=early_stopping_rounds[0])

LightGBMError: Lambdarank tasks require query information

In [16]:
learner.f1_score

AttributeError: 'Learner' object has no attribute 'f1_score'

0.673769841777414