In [2]:
import os

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [3]:
# Notebook configuration: random seed plus input/output locations.

# Seed reused by every stochastic step (data split, model training).
seed = 700001

# Input data: Kaggle location vs. local checkout; `path` is the one in use.
kaggle_path = '/kaggle/input/petfinder-adoption-prediction/'
local_path = '../data/'
path = local_path

# Output folder for generated submission files; `out_path` is the one in use.
local_out_path = '../submissions/'
out_path = local_out_path

In [4]:
# Load the datasets: read the train, test and sample-submission CSV files
# from the active data folder (`path`).
df_train = pd.read_csv(f"{path}train/train.csv")
df_test = pd.read_csv(f"{path}test/test.csv")
sample_sub = pd.read_csv(f"{path}test/sample_submission.csv")


# Ver Dataset


In [10]:
# Preview the first five rows of the sample submission to see the expected format.
sample_sub.head(n=5)

Unnamed: 0,PetID,AdoptionSpeed
0,e2dfc2935,0
1,f153b465f,0
2,3c90f3f54,0
3,e02abc8a3,0
4,09f0df7d1,0


In [11]:
# Compare the column labels of train and test.
# A notebook cell only echoes its final expression, so both indexes are printed
# explicitly; otherwise the train columns would be evaluated and never shown.
print(df_train.columns)
print(df_test.columns)

Index(['Type', 'Name', 'Age', 'Breed1', 'Breed2', 'Gender', 'Color1', 'Color2',
       'Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health', 'Quantity', 'Fee', 'State', 'RescuerID',
       'VideoAmt', 'Description', 'PetID', 'PhotoAmt'],
      dtype='object')

In [12]:
# Comparar la estructura de train y test
def compare_columns (df_train, df_test):
    """Print a structural comparison of two DataFrames.

    Reports the number of columns of each frame and whether the counts match.
    If the column labels are identical (same labels, same order) the
    per-column dtypes are compared and any mismatches are listed. Otherwise
    the labels that exist in only one of the frames are listed.

    Args:
        df_train: first DataFrame (the training set in this notebook).
        df_test: second DataFrame (the test set in this notebook).

    Returns:
        None. Every result is written to stdout.
    """
    n_train, n_test = df_train.shape[1], df_test.shape[1]
    print('Columnas en train', n_train, " Columnas en test", n_test)

    if n_train == n_test:
        print("Misma cantidad de columnas")
    else:
        print('Cantidad de columnas diferentes')

    if not df_train.columns.equals(df_test.columns):
        # Index.difference is the set operation. The `-` operator on an Index
        # is element-wise arithmetic and raises TypeError for string labels.
        only_train = df_train.columns.difference(df_test.columns).tolist()
        only_test = df_test.columns.difference(df_train.columns).tolist()
        print("Columnas solo en train:", only_train)
        print("Columnas solo en test:", only_test)
        if n_train == n_test and not only_train and not only_test:
            # Same labels on both sides, so only their order differs.
            print("Mismas columnas en distinto orden")
        return

    print('Mismo nombre de columnas')
    # Labels are identical here, so the two dtype Series align one-to-one.
    dtype_diff = (df_train.dtypes != (df_test.dtypes))
    if dtype_diff.any():
        print("Tipos de datos diferentes:")
        print(df_train.dtypes[dtype_diff].to_frame("train_dtype").join(
              df_test.dtypes[dtype_diff].to_frame("test_dtype")))
    else:
        print('Mismos tipos de datos. Fin de la funcion')

In [13]:
# Compare feature columns only: the target column exists in train but not in test.
compare_columns(
    df_train=df_train.drop(columns=['AdoptionSpeed']),
    df_test=df_test,
)

Columnas en train 23  Columnas en test 23
Misma cantidad de columnas
Mismo nombre de columnas
Mismos tipos de datos. Fin de la funcion


# Exploración de las columnas

In [7]:
# Show the dtype of every training column, followed by the first rows.
print(df_train.dtypes, df_train.head(), sep="\n")

Type               int64
Name              object
Age                int64
Breed1             int64
Breed2             int64
Gender             int64
Color1             int64
Color2             int64
Color3             int64
MaturitySize       int64
FurLength          int64
Vaccinated         int64
Dewormed           int64
Sterilized         int64
Health             int64
Quantity           int64
Fee                int64
State              int64
RescuerID         object
VideoAmt           int64
Description       object
PetID             object
PhotoAmt         float64
AdoptionSpeed      int64
dtype: object
   Type         Name  Age  Breed1  Breed2  Gender  Color1  Color2  Color3  \
0     2       Nibble    3     299       0       1       1       7       0   
1     2  No Name Yet    1     265       0       1       1       2       0   
2     1       Brisco    1     307       0       1       2       7       0   
3     1         Miko    4     307       0       2       1       2       0   
4

In [8]:
# TODO: convert Type to dummy variables
# Count how many rows carry each distinct Quantity value.
df_train.Quantity.value_counts()


Quantity
1     11565
2      1422
3       726
4       531
5       333
6       185
7        84
8        52
9        33
10       19
20       12
11       10
12        6
15        4
17        3
16        3
14        2
13        2
18        1
Name: count, dtype: int64

# Preparación de Datos

In [5]:
# Start with the numeric columns only.
# The feature list is derived once from train (minus the target) and applied
# to both frames, so train and test are guaranteed to expose the same features
# in the same order when they reach the model.
feature_cols = df_train.select_dtypes(include='number').columns.drop('AdoptionSpeed')

df_train_num = df_train[feature_cols].copy()
df_test_num = df_test[feature_cols].copy()

# Fail early if a feature that is numeric in train was parsed differently in test.
non_numeric_in_test = df_test_num.select_dtypes(exclude='number').columns.tolist()
assert not non_numeric_in_test, f"Non-numeric feature columns in test: {non_numeric_in_test}"

In [5]:
# Preview the numeric training features. AdoptionSpeed was dropped when
# df_train_num was built, so it should not appear in this output.
df_train_num.head(n=5)

Unnamed: 0,Type,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,FurLength,Vaccinated,Dewormed,Sterilized,Health,Quantity,Fee,State,VideoAmt,PhotoAmt,AdoptionSpeed
0,2,3,299,0,1,1,7,0,1,1,2,2,2,1,1,100,41326,0,1.0,2
1,2,1,265,0,1,1,2,0,2,2,3,3,3,1,1,0,41401,0,2.0,0
2,1,1,307,0,1,2,7,0,2,2,1,1,2,1,1,0,41326,0,7.0,3
3,1,4,307,0,2,1,2,0,2,1,1,1,2,1,1,150,41401,0,8.0,2
4,1,1,307,0,1,1,0,0,2,1,2,2,2,1,1,0,41326,0,3.0,2


# Modelo LightGBM

In [6]:
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [7]:
# Split the training data into train and validation parts (20% held out),
# using the notebook-wide seed for the shuffle.
x, y = df_train_num, df_train['AdoptionSpeed']

x_train, x_val, y_train, y_val = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=seed,
)


In [8]:


# Target variable name (not referenced again inside this cell).
label="AdoptionSpeed"

# Build the LightGBM datasets.
# The validation set is tied to the training set through `reference`, which is
# what the LightGBM documentation asks for when a Dataset is used for validation.
train_data = lgb.Dataset(x_train, label=y_train)
valid_data = lgb.Dataset(x_val, label=y_val, reference=train_data)

# Model parameters
params = {
    'objective': 'multiclass',
    'num_class': 5,  # target classes 0-4
    'metric': 'multi_logloss',
    'seed': seed
}

model = lgb.train(params,
                  train_data,
                  valid_sets=[valid_data])

# Predict on the validation set; taking the argmax along axis 1 turns each
# row of per-class scores into a single predicted class.
y_pred = model.predict(x_val)
y_pred_classes = y_pred.argmax(axis=1)

# Evaluate the model
acc = accuracy_score(y_val, y_pred_classes)
print("Validation Accuracy:", acc)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 440
[LightGBM] [Info] Number of data points in the train set: 11994, number of used features: 19
[LightGBM] [Info] Start training from score -3.633260
[LightGBM] [Info] Start training from score -1.582620
[LightGBM] [Info] Start training from score -1.312234
[LightGBM] [Info] Start training from score -1.506833
[LightGBM] [Info] Start training from score -1.282636
Validation Accuracy: 0.41613871290430143


ValueError: Data must be 1-dimensional, got ndarray of shape (3972, 5) instead

In [10]:

# Score the test set. NOTE: df_test_num must be prepared the same way as the
# training features (x_train) for the predictions to be meaningful.
y_test_pred = model.predict(df_test_num)
# Turn the per-class probabilities into a single predicted class per row.
y_test_pred_classes = np.argmax(y_test_pred, axis=1)


In [11]:
# Build the submission table: the PetID of each test row with its predicted class.
submission = pd.DataFrame({
    "PetID": df_test["PetID"],
    "AdoptionSpeed": y_test_pred_classes
})

# to_csv does not create missing folders, so make sure the output directory
# exists first (skipped when out_path is empty, i.e. the current directory).
if out_path:
    os.makedirs(out_path, exist_ok=True)
submission.to_csv(os.path.join(out_path, "submission.csv"), index=False)
