# Trabajo práctico de Laboratorio de Datos II, Universidad Austral
## Profesores: Rafael Crescenzi y Pablo Albani
## Alumno: Sebastián Nicolás González

### Librerías ------------------------------

In [None]:
## Librerías básicas
import pandas as pd 
import numpy as np
import json
import time
import random

## Librerías sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import cohen_kappa_score, make_scorer
from sklearn.model_selection import StratifiedKFold, train_test_split, RandomizedSearchCV,train_test_split

## Librerías de algoritmos de aprendizaje
import lightgbm as lgb

### Construcción del dataset -------------------------


In [None]:
## Load the competition's tabular train and test files, indexed by PetID.

train, test = (
    pd.read_csv(csv_path).set_index("PetID")
    for csv_path in (
        '../input/petfinder-adoption-prediction/train/train.csv',
        '../input/petfinder-adoption-prediction/test/test.csv',
    )
)

In [None]:
## Load the image embeddings (produced by a separate notebook).
## Link to that notebook: https://www.kaggle.com/code/gonzalezsn/ex-menlaboratorio2-imageprocessing-sng

picTrain = pd.read_parquet("/kaggle/input/img-sg/train_img_features.parquet")
picTest = pd.read_parquet("/kaggle/input/img-sg/test_img_features.parquet")

## Rename the embedding columns positionally: pic_0, pic_1, ...
picTrain.columns = [f"pic_{position}" for position in range(picTrain.shape[1])]
picTest.columns = [f"pic_{position}" for position in range(picTest.shape[1])]

## Attach the embeddings to train and test (index-aligned join)
train = train.join(picTrain)
test = test.join(picTest)

In [None]:
# Sanity check: shapes of train and test after joining the image embeddings.
print(train.shape, test.shape)

In [None]:
## Load the embeddings of the text descriptions.
## Note: an SBERT-based embedding was also tried, but it made the prediction score collapse,
## so the embedding generated with LSA (using SVD) is used instead.
## Link to the notebook: https://www.kaggle.com/code/gonzalezsn/ex-menlaboratorio2-textprocessing-sng

textTrain = pd.read_parquet("/kaggle/input/text-sg/train_text_features.parquet")
textTest = pd.read_parquet("/kaggle/input/text-sg/test_text_features.parquet")

## Append the text features column-wise to train and test.
## NOTE(review): concat aligns on the index; presumably both parquet files are indexed by PetID — confirm.
train = pd.concat([train, textTrain], axis="columns")
test = pd.concat([test, textTest], axis="columns")

In [None]:
# Sanity check: shapes of train and test after appending the text embeddings.
print(train.shape, test.shape)

In [None]:
#### Feature engineering (the same transformation is applied to train and test) ----------------------------


def _adoption_keyword_code(text):
    """Return 1 if ``text`` mentions "adopted", 2 if it mentions "adoption", otherwise 0.

    The match is case-insensitive and "adopted" takes precedence over "adoption".
    """
    lowered = text.lower()
    if "adopted" in lowered:
        return 1
    if "adoption" in lowered:
        return 2
    return 0


def _text_length(value):
    """Return the length of ``value`` when it is a string, otherwise 0 (e.g. for NaN)."""
    return len(value) if isinstance(value, str) else 0


def add_features(df):
    """Return a copy of ``df`` with the hand-crafted features appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the columns Age, Fee, Description, RescuerID,
        Name, Breed2, PhotoAmt and VideoAmt.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. Missing descriptions are
        replaced by the literal string "missing". New columns are appended in this
        order: Age_cat, Adopt_Free, DescContain_Adop, CountRescMasDe1, name_length,
        desc_length, purebred, photo_and_video, photo_or_video.
    """
    out = df.copy()

    # Age is kept as-is up to 12; anything above collapses into the sentinel 99.
    out["Age_cat"] = out["Age"].where(out["Age"] <= 12, 99)

    # Flag listings whose adoption fee is zero.
    out["Adopt_Free"] = np.where(out["Fee"] == 0, 1, 0)

    # Assign the filled column back instead of `df[col].fillna(..., inplace=True)`:
    # the chained in-place form is deprecated and is a no-op under copy-on-write.
    out["Description"] = out["Description"].fillna("missing")
    out["DescContain_Adop"] = out["Description"].apply(_adoption_keyword_code)

    # 1 when the pet's rescuer has more than one pet *within this frame*, else 0.
    # The count is therefore computed separately for train and for test.
    rescuer_sizes = out.groupby("RescuerID")["RescuerID"].transform("size")
    out["CountRescMasDe1"] = (rescuer_sizes > 1).astype(np.int8)

    # Other features
    out["name_length"] = out["Name"].apply(_text_length)  # length of the name (0 when missing)
    # Length of the description; computed after the fill, so missing ones count as len("missing").
    out["desc_length"] = out["Description"].apply(_text_length)
    # NOTE(review): despite the name, this is 0 when Breed2 == 0 and 1 otherwise;
    # presumably Breed2 == 0 means "no second breed" — confirm the intended polarity.
    out["purebred"] = (out["Breed2"] != 0).astype(int)
    out["photo_and_video"] = ((out["PhotoAmt"] > 0) & (out["VideoAmt"] > 0)).astype(int)  # has photos AND videos
    out["photo_or_video"] = ((out["PhotoAmt"] > 0) | (out["VideoAmt"] > 0)).astype(int)  # has photos OR videos

    return out


train = add_features(train)
test = add_features(test)

In [None]:
# Sanity check: shapes of train and test after the feature engineering step.
print(train.shape, test.shape)

In [None]:
## Load the sentiment-analysis results (provided by the competition)

train_id = train.index
test_id = test.index


def load_sentiment(pet_ids, folder):
    """Read the document-level sentiment of every pet from ``folder``.

    Parameters
    ----------
    pet_ids : iterable of str
        Pet identifiers; the file read for each one is ``folder + pet_id + '.json'``.
    folder : str
        Directory prefix (including the trailing slash) holding the JSON files.

    Returns
    -------
    (list, list, int)
        Magnitudes, scores (both in the order of ``pet_ids``) and the number of
        pets whose file does not exist. Missing files contribute -1 to both lists.
    """
    magnitudes = []
    scores = []
    missing = 0
    for pet in pet_ids:
        try:
            with open(folder + pet + '.json', 'r', encoding='utf-8') as f:
                document = json.load(f)['documentSentiment']
        except FileNotFoundError:
            # NOTE(review): -1 is the "missing" sentinel; check that it cannot be
            # confused with a genuine sentiment score of -1.
            missing += 1
            magnitudes.append(-1)
            scores.append(-1)
            continue
        magnitudes.append(document['magnitude'])
        scores.append(document['score'])
    return magnitudes, scores, missing


doc_sent_mag, doc_sent_score, nf_count = load_sentiment(
    train_id, '/kaggle/input/petfinder-adoption-prediction/train_sentiment/')
print(f"train: {nf_count} sentiment files not found")
train['doc_sent_mag'] = doc_sent_mag
train['doc_sent_score'] = doc_sent_score

doc_sent_mag, doc_sent_score, nf_count = load_sentiment(
    test_id, '/kaggle/input/petfinder-adoption-prediction/test_sentiment/')
print(f"test: {nf_count} sentiment files not found")
test['doc_sent_mag'] = doc_sent_mag
test['doc_sent_score'] = doc_sent_score

In [None]:
# Sanity check: shapes of train and test after adding the sentiment features.
print(train.shape, test.shape)

In [None]:
# Load the image metadata (provided by the competition)

# Output columns, in the order in which they are appended to train/test.
METADATA_COLUMNS = [
    'vertex_x', 'vertex_y', 'bounding_confidence', 'bounding_importance',
    'dominant_blue', 'dominant_green', 'dominant_red',
    'dominant_pixel_frac', 'dominant_score',
    'label_description', 'label_score',
]


def parse_image_metadata(data):
    """Extract the metadata features from one parsed metadata JSON document.

    Only the first crop hint, the first dominant colour and the first label
    annotation are used.

    Parameters
    ----------
    data : dict
        Parsed content of a ``<PetID>-1.json`` metadata file.

    Returns
    -------
    (dict, bool)
        A mapping with one value per entry of ``METADATA_COLUMNS`` and a flag
        telling whether the document had a non-empty 'labelAnnotations' entry.
        Without labels, 'label_description' is 'nothing' and 'label_score' is -1.

    Raises
    ------
    KeyError, IndexError
        If an expected key or list element is absent (only 'importanceFraction'
        and 'labelAnnotations' are treated as optional).
    """
    crop_hint = data['cropHintsAnnotation']['cropHints'][0]
    corner = crop_hint['boundingPoly']['vertices'][2]
    top_color = data['imagePropertiesAnnotation']['dominantColors']['colors'][0]
    labels = data.get('labelAnnotations')

    row = {
        'vertex_x': corner['x'],
        'vertex_y': corner['y'],
        'bounding_confidence': crop_hint['confidence'],
        'bounding_importance': crop_hint.get('importanceFraction', -1),
        'dominant_blue': top_color['color']['blue'],
        'dominant_green': top_color['color']['green'],
        'dominant_red': top_color['color']['red'],
        'dominant_pixel_frac': top_color['pixelFraction'],
        'dominant_score': top_color['score'],
        'label_description': labels[0]['description'] if labels else 'nothing',
        'label_score': labels[0]['score'] if labels else -1,
    }
    return row, bool(labels)


def load_image_metadata(pet_ids, folder):
    """Collect the metadata features of the first image of every pet.

    Parameters
    ----------
    pet_ids : iterable of str
        Pet identifiers; the file read for each one is ``folder + pet_id + '-1.json'``.
    folder : str
        Directory prefix (including the trailing slash) holding the JSON files.

    Returns
    -------
    (dict, int, int)
        A dict mapping each name in ``METADATA_COLUMNS`` to a list of values in the
        order of ``pet_ids``; the number of pets whose file was not found; and the
        number of pets whose file had no label annotations. Pets without a file
        get -1 in every column except 'label_description', which gets 'nothing'.
    """
    columns = {name: [] for name in METADATA_COLUMNS}
    not_found = 0
    no_label = 0
    for pet in pet_ids:
        try:
            with open(folder + pet + '-1.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            not_found += 1
            row = dict.fromkeys(METADATA_COLUMNS, -1)
            row['label_description'] = 'nothing'
        else:
            row, has_label = parse_image_metadata(data)
            if not has_label:
                no_label += 1
        for name in METADATA_COLUMNS:
            columns[name].append(row[name])
    return columns, not_found, no_label


train_meta, nf_count, nl_count = load_image_metadata(
    train_id, '/kaggle/input/petfinder-adoption-prediction/train_metadata/')
print(nf_count)
print(nl_count)
for meta_col in METADATA_COLUMNS:
    train[meta_col] = train_meta[meta_col]

test_meta, nf_count, nl_count = load_image_metadata(
    test_id, '/kaggle/input/petfinder-adoption-prediction/test_metadata/')
print(nf_count)
for meta_col in METADATA_COLUMNS:
    test[meta_col] = test_meta[meta_col]

In [None]:
# Sanity check: shapes of train and test after adding the image-metadata features.
print(train.shape, test.shape)

In [None]:
# Drop every column of dtype object. The list is derived from train and applied to both frames.

colObject = [name for name, dtype in train.dtypes.items() if dtype == "object"]

train = train.drop(columns=colObject)
test = test.drop(columns=colObject)

print(f"Variables eliminadas: {colObject}")

In [None]:
# Sanity check: shapes of train and test after dropping the object-typed columns.
print(train.shape, test.shape)

In [None]:
# Cast the categorical variables to the "category" dtype (convenient for the LightGBM algorithm)

# Columns that stay numeric; every other column is treated as categorical.
# "desc_length" is listed here: the list used to repeat "name_length" instead,
# which silently turned the description length into a categorical feature.
numeric_cols = ['Age', 'Quantity', 'Fee', 'VideoAmt', 'PhotoAmt', 'AdoptionSpeed',
                'doc_sent_mag', 'doc_sent_score', 'dominant_score', 'dominant_pixel_frac',
                'dominant_red', 'dominant_green', 'dominant_blue', 'bounding_importance',
                'bounding_confidence', 'vertex_x', 'vertex_y', 'label_score', "name_length", "desc_length"] +\
               [col for col in train.columns if col.startswith('pic') or col.startswith('svd')]
numeric_set = set(numeric_cols)
cat_cols = [col for col in train.columns if col not in numeric_set]

# Use astype with a mapping so the dtype really changes. Assigning through
# `df.loc[:, cols] = ...` writes into the existing columns and, on pandas >= 2.0,
# keeps their original dtype, so no column would end up categorical.
category_dtypes = {col: 'category' for col in cat_cols}
train = train.astype(category_dtypes)
test = test.astype(category_dtypes)
print(train.shape)
print(test.shape)

# Positions of the categorical variables
catVal = train.dtypes
cat_feature_names = catVal[catVal == "category"]
# The positions are computed on the columns WITHOUT the target, because the model
# is trained on the frame with "AdoptionSpeed" removed; counting the target would
# shift every categorical column located after it by one.
feature_cols = [col for col in train.columns if col != "AdoptionSpeed"]
cat_features_idx = [pos for pos, col in enumerate(feature_cols) if col in cat_feature_names.index]

### Helper functions -------------------

In [None]:
def quadratic_weighted_kappa(y_true, y_pred):
    """Return Cohen's kappa between ``y_true`` and ``y_pred`` using quadratic weights."""
    kappa = cohen_kappa_score(y_true, y_pred, weights='quadratic')
    return kappa

def get_kappa(model, X_train, X_valid, y_train, y_valid):
    """Fit ``model`` on the training split and score it on the validation split.

    Returns a tuple ``(kappa, probabilities)``: the quadratic-weighted Cohen's kappa
    of ``model.predict(X_valid)`` against ``y_valid``, and the output of
    ``model.predict_proba(X_valid)``.
    """
    model.fit(X_train, y_train)
    valid_labels = model.predict(X_valid)
    valid_proba = model.predict_proba(X_valid)
    kappa = cohen_kappa_score(y_valid, valid_labels, weights='quadratic')
    return kappa, valid_proba

def qwk_eval(preds, train_data):
    """Custom evaluation function: quadratic-weighted kappa of the rounded predictions.

    ``train_data`` must expose ``get_label()``. Returns the triple
    ``('qwk', value, True)``; the final ``True`` marks the metric as
    "higher is better".
    """
    labels = train_data.get_label()
    rounded = np.round(preds)
    score = cohen_kappa_score(rounded, labels, weights='quadratic')
    return 'qwk', score, True

In [None]:
## Optimización de los cortes. 

## Estuve tratando de resolver este tema con el OptimizedRounder (minimización de función objetivo para 
## encontrar los cortes óptimos a partir de una condición inicial) pero no tuve muy buenos resultados. 
## Luego probé cortar utilizando la distribución del conjunto de entrenamiento y los resultados mejoraron sustancialmente.

def get_thresholds_from_dist(y_true, y_pred):
    """Derive cut points for raw predictions that reproduce the label distribution.

    The cumulative class counts of ``y_true`` (all classes but the last) are
    rescaled to the length of ``y_pred`` and used as positions into the sorted
    predictions; the values found there are returned as thresholds.
    """
    class_counts = np.bincount(y_true)
    boundaries = np.cumsum(class_counts)[:-1]
    # Rescale the boundaries in case y_pred and y_true differ in length.
    positions = (boundaries * y_pred.size / y_true.size).astype(int)
    ordered_preds = np.sort(y_pred)
    return ordered_preds[positions]

def allocate_to_rate(y_pred, thresholds):
    """Allocate raw predictions to integer adoption rates.

    Parameters
    ----------
    y_pred : array-like of float
        Raw (continuous) predictions.
    thresholds : sequence of float
        Cut points, expected in ascending order. Any number of thresholds is
        accepted (the function used to assume exactly four).

    Returns
    -------
    numpy.ndarray of int
        One rate per prediction: 0 when the prediction is below every threshold,
        otherwise ``i + 1`` for the last threshold ``i`` that it reaches.
    """
    y_pred = np.asarray(y_pred)
    rates = np.zeros(y_pred.size, dtype=int)
    # Later thresholds overwrite earlier ones, so with ascending thresholds each
    # prediction ends up with the highest rate whose cut point it reaches.
    for i, threshold in enumerate(thresholds):
        rates[y_pred >= threshold] = i + 1
    return rates

def map_to_int(y_true, y_pred, preds):
    """Discretise ``preds`` with thresholds fitted on ``(y_true, y_pred)``.

    The thresholds come from ``get_thresholds_from_dist(y_true, y_pred)`` and are
    applied to ``preds`` through ``allocate_to_rate``.
    """
    return allocate_to_rate(preds, get_thresholds_from_dist(y_true, y_pred))

### Modelado -----------------------------

In [None]:
## Split the labelled data into train and test_p (a 15% hold-out, stratified by the target)

X = train.drop(columns="AdoptionSpeed")
y = train["AdoptionSpeed"]

X_train, X_test_p, y_train, y_test_p = train_test_split(
    X, y, test_size=0.15, random_state=0, stratify=y
)

for split_label, split_part in (
    ("X_train", X_train),
    ("X_test_pre", X_test_p),
    ("y_train", y_train),
    ("y_test_pre", y_test_p),
):
    print(f"{split_label}: {split_part.shape}")

In [None]:
## Definition of the hyper-parameter search space.
## Each trial draws one random configuration; the seeds below make the draws reproducible.

hyperparameter_combinations = []
num_trials = 100

random.seed(10)
np.random.seed(10)

for _ in range(num_trials):
    # The np.random calls are evaluated in key order, so reordering the sampled
    # keys would change the configurations that are drawn.
    params = {
        'objective': 'regression',
        'boosting': 'gbdt',
        'metric': 'rmse',
        'num_leaves': np.random.randint(50, 90),
        'max_depth': np.random.randint(3, 15),
        'learning_rate': np.random.uniform(0.001, 0.5),
        'bagging_fraction': np.random.uniform(0.50, 0.95),
        # LightGBM only performs bagging when bagging_freq is non-zero; without it
        # the sampled bagging_fraction above has no effect on training.
        'bagging_freq': 1,
        'feature_fraction': np.random.uniform(0.40, 0.90),
        'min_split_gain': np.random.uniform(0.008, 0.03),
        'min_child_samples': np.random.randint(40, 200),
        'min_child_weight': np.random.uniform(0.008, 0.03),
        'lambda_l1': np.random.uniform(0.01, 0.06),
        'lambda_l2': np.random.uniform(0.01, 0.06),
        'verbosity': -1,
        'data_random_seed': 999,
        'early_stopping_rounds': np.random.randint(100, 400),
        #'device': 'gpu',    ## ------> Using the GPU cuts the run time by roughly 27%.
        'deterministic': True
    }

    hyperparameter_combinations.append(params)

In [None]:
## Training using:
# - a cross-validation strategy on X_train / y_train,
# - hyper-parameter optimisation through Random Search over the search space defined above,
# - thresholds for discretising the regression output taken from the label distribution of each fold's training part,
# - the competition's own metric (quadratic weighted kappa) on the validation part.

CANT_FOLDS_CV = 5

best_score = -np.inf
best_params = None
best_thresh = None
matrix_best_param = None

start = time.time()

for j, params in enumerate(hyperparameter_combinations):

    print(f"Trial número {j+1}: \n")
    print("Seleccionado hiperparámetros...\n")
    print(f"Los hiperparámetros seleccionados son {params}\n")

    folds = StratifiedKFold(n_splits=CANT_FOLDS_CV, random_state=123, shuffle=True).split(X_train, y_train)
    coef_all = np.zeros((CANT_FOLDS_CV, 4))
    qwk_all = np.zeros(CANT_FOLDS_CV)

    for i, (train_indexFold, valid_indexFold) in enumerate(folds):

        print(f"Entrenando el fold número {i+1}\n")

        # Train/validation split for the current fold. The fold indices are
        # positions inside X_train / y_train (that is what was passed to split()),
        # so they must be applied to those objects with .iloc. Indexing the full
        # X / y instead selects different rows and lets hold-out rows leak into
        # the cross-validation.
        X_trainFold = X_train.iloc[train_indexFold]
        X_valFold = X_train.iloc[valid_indexFold]
        y_trainFold = y_train.iloc[train_indexFold]
        y_valFold = y_train.iloc[valid_indexFold]

        d_train = lgb.Dataset(X_trainFold,
                              label=y_trainFold,
                              categorical_feature=cat_features_idx)

        d_valid = lgb.Dataset(X_valFold,
                              label=y_valFold,
                              reference=d_train)

        watchlist = [d_train, d_valid]

        # Model training
        model = lgb.train(params,
                          train_set=d_train,
                          num_boost_round=10000,
                          valid_sets=watchlist,
                          feval=qwk_eval,
                          callbacks=[lgb.early_stopping(stopping_rounds=params['early_stopping_rounds']),
                                     lgb.log_evaluation(100)])

        # Predictions on the fold's train and validation parts
        y_trainFold_pred = model.predict(X_trainFold, num_iteration=model.best_iteration)
        y_valFold_pred = model.predict(X_valFold, num_iteration=model.best_iteration)

        # Thresholds are fitted on the training predictions only
        thresh = get_thresholds_from_dist(y_trainFold, y_trainFold_pred)
        coef_all[i, :] = thresh

        # Discretise the validation predictions with those thresholds
        y_valFold_pred_adj = allocate_to_rate(y_valFold_pred, thresh)

        # Competition metric on the validation part
        qwk_fold = quadratic_weighted_kappa(y_valFold, y_valFold_pred_adj)
        qwk_all[i] = qwk_fold
        print(f"La métrica QWK sobre validación del fold {i+1} es:{qwk_fold}\n")

    qwk_avg = qwk_all.mean()
    avg_coef = coef_all.mean(axis=0)

    if qwk_avg > best_score:
        matrix_best_param = coef_all
        best_thresh = avg_coef
        best_score = qwk_avg
        best_params = params

    # These are per-trial averages over all folds, so they are labelled with the trial number.
    print(f"El QWK promedio del trial {j+1} es: {qwk_avg}\n")
    print(f"El th promedio del trial {j+1} es: {avg_coef}\n")
    print(f"El QWK a superar es: {best_score}\n")

finish = time.time()
print('tiempo ejecución: {:.2f} \n'.format(finish - start))

In [None]:
# Report the winning trial of the random search: per-fold thresholds,
# their average and the hyper-parameters that produced them.
avg_coef = best_thresh  # thresholds averaged over the folds of the best trial
print(
    "Matriz de cortes por cada fold:",
    matrix_best_param,
    "Sacando el promedio de los coeficientes...",
    sep="\n",
)
print(
    f"Los cortes promedio son: {avg_coef}",
    f"Los hiperparámetros ganadores son: {best_params}",
    sep="\n",
)

In [None]:
## Refit on the whole X_train split using the hyper-parameters that performed best during validation.
## NOTE(review): X_test_p is passed as a validation set here, so early stopping is driven by it;
## any metric computed on X_test_p with this model is therefore not measured on fully unseen data.

final_train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features_idx)
final_test_data = lgb.Dataset(X_test_p, label=y_test_p, reference=final_train_data)

watchlist = [final_train_data, final_test_data]
final_callbacks = [
    lgb.early_stopping(stopping_rounds=best_params['early_stopping_rounds']),
    lgb.log_evaluation(100),
]

model = lgb.train(
    best_params,
    train_set=final_train_data,
    num_boost_round=10000,
    valid_sets=watchlist,
    feval=qwk_eval,
    callbacks=final_callbacks,
)

In [None]:
# Prediction on test_p (the part of the labelled data that was held out from the cross-validation stage),
# discretised with the averaged thresholds and scored with the competition metric.

test_p_pred = model.predict(X_test_p, num_iteration=model.best_iteration)
testPre_pred_discr = allocate_to_rate(test_p_pred, avg_coef)

holdout_qwk = quadratic_weighted_kappa(y_test_p, testPre_pred_discr)
print(f"El QWK en el conjunto de testeo es de: {holdout_qwk}")

In [None]:
# Predictions on the competition test set for the submission:
# raw regression output first, then discretised with the averaged thresholds.

test_pred = model.predict(test, num_iteration=model.best_iteration)
test_pred_discr = allocate_to_rate(y_pred=test_pred, thresholds=avg_coef)

In [None]:
# Wrap the discretised predictions in a Series named after the target and indexed like test.
test_labels = pd.Series(data=test_pred_discr, index=test.index, name="AdoptionSpeed")

test_labels

In [None]:
# Write the predictions to submission.csv and preview the first lines of the file.
test_labels.to_csv("submission.csv")
!head submission.csv

In [None]:
# Plot the feature importances of the final model, limited to 20 features.
lgb.plot_importance(model, max_num_features = 20)