In [1]:
# Importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

import warnings
warnings.filterwarnings('ignore')
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)
import logging
logging.getLogger('lightgbm').setLevel(logging.INFO)
logging.getLogger('lightgbm').setLevel(logging.ERROR)

In [2]:
train_data = pd.read_csv('train.csv')
train_data.shape

(20758, 18)

In [3]:
test_data = pd.read_csv('test.csv')

In [4]:
num_cols = list(train_data.select_dtypes(exclude=['object']).columns)
cat_cols = list(train_data.select_dtypes(include=['object']).columns)

num_cols_test = list(test_data.select_dtypes(exclude=['object']).columns)
cat_cols_test = list(test_data.select_dtypes(include=['object']).columns)

num_cols_test = [col for col in num_cols_test if col not in ['id']]

In [7]:
from sklearn.preprocessing import StandardScaler

# Let's say 'target' is the name of your target column
target_column = 'target'

# Ensure the target column is not in the num_cols list
num_cols = [col for col in num_cols if col != target_column]

# Now, fit the scaler on the training data without the target column
scaler = StandardScaler()
train_data[num_cols] = scaler.fit_transform(train_data[num_cols])

# Assuming num_cols_test is the same as num_cols, but for safety, let's make sure:
num_cols_test = num_cols  # This should have the same features as num_cols, without the target

# Now transform the test data
test_data[num_cols_test] = scaler.transform(test_data[num_cols_test])

In [6]:
print(train_data[num_cols].shape[1])  # This should print the number of features in the training data
print(test_data[num_cols_test].shape[1])  # This should print the number of features in the test data

9
8


In [8]:
from sklearn.preprocessing import LabelEncoder

labelencoder = LabelEncoder()
object_columns = train_data.select_dtypes(include='object').columns.difference(['NObeyesdad'])

for col_name in object_columns:
    if train_data[col_name].dtypes=='object':
        train_data[col_name]=labelencoder.fit_transform(train_data[col_name])
        
for col_name in test_data.columns:
    if test_data[col_name].dtypes=='object':
        test_data[col_name]=labelencoder.fit_transform(test_data[col_name])

In [39]:
X = train_data.drop(['NObeyesdad', 'id'], axis=1)
y = train_data['NObeyesdad']
y = labelencoder.fit_transform(y)
X_test = test_data.drop(["id"],axis=1)

In [40]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [41]:
param = {"objective": "multiclass",          
    "metric": "multi_logloss",          
    "verbosity": -1,                    
    "boosting_type": "gbdt",            
    "random_state": 42,       
    "num_class": 7,                     
    'learning_rate': 0.030962211546832760,  
    'n_estimators': 500,                
    'lambda_l1': 0.009667446568254372,  
    'lambda_l2': 0.04018641437301800,   
    'max_depth': 10,                    
    'colsample_bytree': 0.40977129346872643,  
    'subsample': 0.9535797422450176,   
    'min_child_samples': 26}

model_lgb = lgb.LGBMClassifier(**param,verbose=100)
model_lgb.fit(X_train, y_train)
pred_lgb = model_lgb.predict(X_val)
pred_proba = model_lgb.predict_proba(X_val)

In [42]:
import optuna

def objective(trial):
    # Define the thresholds for each class
    thresholds = {}
    for i in range(num_classes):
        thresholds[f'threshold_{i}'] = trial.suggest_uniform(f'threshold_{i}', 0.0, 1.0)

    # Apply the thresholds to convert probabilities to predictions
    y_pred = apply_thresholds(pred_proba, thresholds)

    # Calculate accuracy
    accuracy = accuracy_score(y_val, y_pred)
    return accuracy  

def apply_thresholds(y_proba, thresholds):
    # Apply the specified thresholds to convert probabilities to predicted labels
    y_pred_labels = np.argmax(y_proba, axis=1)
    for i in range(y_proba.shape[1]):
        y_pred_labels[y_proba[:, i] > thresholds[f'threshold_{i}']] = i

    return y_pred_labels

In [15]:
num_classes = 7
pred_proba = pred_proba  # Example: replace with actual y_pred_proba
y_val = y_val  # Example: replace with actual y_val

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# # Get the best thresholds
best_thresholds = study.best_params
print("Best Thresholds:", best_thresholds)

[I 2024-02-26 10:28:45,365] A new study created in memory with name: no-name-e9e2e764-90de-4ce9-8be1-a77985ef18df
[I 2024-02-26 10:28:45,371] Trial 0 finished with value: 0.8480250481695568 and parameters: {'threshold_0': 0.9832547187343998, 'threshold_1': 0.7229045547337107, 'threshold_2': 0.14180424275646142, 'threshold_3': 0.16553733085213274, 'threshold_4': 0.8124109112371432, 'threshold_5': 0.3490552231412465, 'threshold_6': 0.04381327922670353}. Best is trial 0 with value: 0.8480250481695568.
[I 2024-02-26 10:28:45,374] Trial 1 finished with value: 0.8586223506743738 and parameters: {'threshold_0': 0.4979586092133763, 'threshold_1': 0.023031439548229726, 'threshold_2': 0.09696745510568261, 'threshold_3': 0.4887868277306643, 'threshold_4': 0.21952580451592618, 'threshold_5': 0.7102637202800242, 'threshold_6': 0.6423393216267461}. Best is trial 1 with value: 0.8586223506743738.
[I 2024-02-26 10:28:45,378] Trial 2 finished with value: 0.8104527938342967 and parameters: {'threshold_0

[I 2024-02-26 10:28:45,787] Trial 21 finished with value: 0.9089595375722543 and parameters: {'threshold_0': 0.3432635522456178, 'threshold_1': 0.47304423849625044, 'threshold_2': 0.9253613772186982, 'threshold_3': 0.9012836152843675, 'threshold_4': 0.36390759739881584, 'threshold_5': 0.44756806956556006, 'threshold_6': 0.3904561976340475}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:45,819] Trial 22 finished with value: 0.9067919075144508 and parameters: {'threshold_0': 0.27853582569224666, 'threshold_1': 0.46857439673168993, 'threshold_2': 0.9768072856663043, 'threshold_3': 0.71001110526328, 'threshold_4': 0.38176791269190163, 'threshold_5': 0.4900013747912717, 'threshold_6': 0.3425010731077164}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:45,847] Trial 23 finished with value: 0.8995664739884393 and parameters: {'threshold_0': 0.4165063511492821, 'threshold_1': 0.37996375793030135, 'threshold_2': 0.8491301349273784, 'threshold_3': 0.5

[I 2024-02-26 10:28:46,463] Trial 42 finished with value: 0.903179190751445 and parameters: {'threshold_0': 0.5294870193584108, 'threshold_1': 0.30531341985918514, 'threshold_2': 0.8310829526874891, 'threshold_3': 0.9492961389195183, 'threshold_4': 0.44294166601120183, 'threshold_5': 0.36877724678196855, 'threshold_6': 0.4468234030462257}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:46,502] Trial 43 finished with value: 0.9087186897880539 and parameters: {'threshold_0': 0.43712950265840234, 'threshold_1': 0.5784311493865765, 'threshold_2': 0.6260594705435507, 'threshold_3': 0.758020742516632, 'threshold_4': 0.5078648442348535, 'threshold_5': 0.42341323487904914, 'threshold_6': 0.54012691593726}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:46,537] Trial 44 finished with value: 0.9082369942196532 and parameters: {'threshold_0': 0.43863279404529737, 'threshold_1': 0.5968101450317648, 'threshold_2': 0.6007133010750284, 'threshold_3': 0.6798

[I 2024-02-26 10:28:47,267] Trial 64 finished with value: 0.9077552986512524 and parameters: {'threshold_0': 0.2724739646669075, 'threshold_1': 0.636145703065249, 'threshold_2': 0.9359061395594783, 'threshold_3': 0.6592466789687336, 'threshold_4': 0.37172702347776393, 'threshold_5': 0.9684703519418176, 'threshold_6': 0.9631128719726949}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:47,307] Trial 65 finished with value: 0.9039017341040463 and parameters: {'threshold_0': 0.38206181531128625, 'threshold_1': 0.4056743988476445, 'threshold_2': 0.6581686275826073, 'threshold_3': 0.7286067042360744, 'threshold_4': 0.41547561361271024, 'threshold_5': 0.36701462680272867, 'threshold_6': 0.3260047043540171}. Best is trial 21 with value: 0.9089595375722543.
[I 2024-02-26 10:28:47,342] Trial 66 finished with value: 0.9082369942196532 and parameters: {'threshold_0': 0.29087346958587346, 'threshold_1': 0.4986861429070706, 'threshold_2': 0.7934682539727442, 'threshold_3': 0.95

[I 2024-02-26 10:28:48,011] Trial 85 finished with value: 0.9094412331406551 and parameters: {'threshold_0': 0.46140844643753576, 'threshold_1': 0.6102347007070603, 'threshold_2': 0.9166991815699468, 'threshold_3': 0.5132409952001119, 'threshold_4': 0.4778697563485248, 'threshold_5': 0.6370230529733687, 'threshold_6': 0.39234764999418487}. Best is trial 83 with value: 0.9104046242774566.
[I 2024-02-26 10:28:48,045] Trial 86 finished with value: 0.9084778420038536 and parameters: {'threshold_0': 0.43498060244078146, 'threshold_1': 0.6136817387431568, 'threshold_2': 0.9046982645037985, 'threshold_3': 0.5034260371786228, 'threshold_4': 0.49547424523053274, 'threshold_5': 0.6061992907163749, 'threshold_6': 0.4977203384858596}. Best is trial 83 with value: 0.9104046242774566.
[I 2024-02-26 10:28:48,077] Trial 87 finished with value: 0.9092003853564548 and parameters: {'threshold_0': 0.4608913855159716, 'threshold_1': 0.6845915104864284, 'threshold_2': 0.8552106688411686, 'threshold_3': 0.49

Best Thresholds: {'threshold_0': 0.4348146122930608, 'threshold_1': 0.596682170682707, 'threshold_2': 0.9282961600842092, 'threshold_3': 0.496202312498077, 'threshold_4': 0.4360979327425641, 'threshold_5': 0.4200727115691642, 'threshold_6': 0.39239844233136467}


In [16]:
threshold1= {'threshold_0': 0.724201213234911, 'threshold_1': 0.6161299800571379, 'threshold_2': 0.29138887902587174, 'threshold_3': 0.3145837593497076, 'threshold_4': 0.8469398340837189, 'threshold_5': 0.6800824438387787, 'threshold_6': 0.35886959729223455}

In [45]:
threshold2 = {'threshold_0': 0.4348146122930608, 'threshold_1': 0.596682170682707, 'threshold_2': 0.9282961600842092, 'threshold_3': 0.496202312498077, 'threshold_4': 0.4360979327425641, 'threshold_5': 0.4200727115691642, 'threshold_6': 0.39239844233136467}

In [46]:
test_label = model_lgb.predict_proba(X_test)
test_label = apply_thresholds(test_label, threshold2)

In [48]:
pred = labelencoder.inverse_transform(test_label)
submission = pd.DataFrame({'id': test_data.id, 'NObeyesdad': pred})
submission.to_csv('submission2.csv', index=False)