In [4]:
import numpy as np, pandas as pd
import os
import json
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.metrics import log_loss, mean_squared_error

In [5]:
#path = '/kaggle/input/bench-tab-v1/'
path = './datasets/extra_datasets/'

# Load the benchmark metadata. Context managers are used so that each JSON
# file handle is closed as soon as it has been parsed, instead of being left
# open for the lifetime of the kernel.
with open(f'{path}train/dict_task.json') as task_file:
    # task id -> {'features': ..., 'target': ...}
    tasks = json.load(task_file)
with open(f'{path}train/dict_suit.json') as suit_file:
    # task id -> list of dataset (suit) ids
    suits = json.load(suit_file)

tasks

{'336': {'features': 'num', 'target': 'num'},
 '337': {'features': 'num', 'target': 'cat'},
 '335': {'features': 'num+cat', 'target': 'num'},
 '334': {'features': 'num+cat', 'target': 'cat'}}

In [6]:
# Show the dataset (suit) ids registered under task '334'
# (per `tasks`: 'num+cat' features, 'cat' target).
suits['334']

[361110, 361111, 361113, 361282, 361283, 361285, 361286]

In [31]:
DEBUG = True
# change here to get the dataset
if DEBUG:
    # Restrict the run to a single dataset of task '334'.
    suits = {'334': [361285]}
else:
    # Reload the full mapping from disk instead of reusing the current `suits`:
    # a previous run of this cell with DEBUG=True has already overwritten it
    # with the one-dataset subset, so `else suits` would silently keep the
    # debug subset after flipping the flag.
    with open(f'{path}train/dict_suit.json') as suit_file:
        suits = json.load(suit_file)
# Number of CatBoost models (one per random seed) trained per dataset.
n_seeds = 5

In [32]:
def prepare_sub(data, identifier):
    """Build a two-column submission frame from single-column predictions.

    Parameters
    ----------
    data : anything accepted by ``pd.DataFrame`` that yields exactly one column.
    identifier : str
        Prefix used for the row ids.

    Returns
    -------
    pd.DataFrame
        Columns ``['ID', 'target']`` where ``ID`` is
        ``'<identifier>_<row index>'``.
    """
    frame = pd.DataFrame(data)
    frame.columns = ['target']
    # Row ids are the identifier joined to the stringified index labels.
    row_ids = identifier + '_' + frame.index.astype(str)
    frame['ID'] = row_ids
    return frame[['ID', 'target']]

In [33]:
# https://docs.google.com/spreadsheets/d/159UsoK3q2x-wXKoYEY-zHlZhiIcDgjCFbDW69QMmUnk/edit#gid=1822039730
# Lookup table: dataset (suit) id, as a string, -> human-readable dataset name.
# The name is used as the file prefix when datasets are exported to CSV.
suit_to_name = {
    '361060': 'electricity',
    '361061': 'covertype',
    '361062': 'pol',
    '361063': 'house_16H',
    '361065': 'MagicTelescope',
    '361066': 'bank-marketing',
    '361276': 'Bioresponse',
    '361068': 'MiniBooNE',
    '361275': 'default-of-credit-card-clients',
    '361069': 'Higgs',
    '361070': 'eye_movements',
    '361273': 'Diabetes130US',
    '361274': 'jannis',
    '361278': 'heloc',
    '361055': 'credit',
    '361277': 'california',

    # datasets with mixed numerical + categorical features
    '361110': 'electricity_mixed',
    '361111': 'eye_movements_mixed',
    '361113': 'covertype_mixed',
    '361282': 'albert',
    '361285': 'road-safety',
}



In [34]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score

import numpy as np


# Per-dataset results, keyed by suit id:
#   res_acc  -> test accuracy for classification targets
#   res_gbdt -> test RMSE for regression targets
res_gbdt = {}
sub = []

res_acc = {}

for t in suits:
    print(t)
    for suit in suits[t]:

        identifier = f'{t}_{suit}'
        task_type = tasks[t]
        var_type = task_type['features']
        target_type = task_type['target']
        is_regression = target_type == 'num'

        suit_dir = f'{path}train/{identifier}'

        X = pd.read_parquet(f'{suit_dir}/data_train.parquet')
        #X_test = pd.read_parquet(f'{path}train/{identifier}/data_test.parquet')
        y = pd.read_parquet(f'{suit_dir}/target_train.parquet')

        # Context managers close the metadata files right after parsing.
        with open(f'{suit_dir}/attribute_names.json') as names_file:
            names = json.load(names_file)
        with open(f'{suit_dir}/categorical_indicator.json') as indicator_file:
            cat_indicator = json.load(indicator_file)

        print(names)
        #if suit_to_name[str(suit)] == "electricity":
        #    # drop date
        #    X = X.drop(columns=['date'])
        #
        #    # delete 'date' from names and cat_indicator
        #    names = names[1:]
        #    cat_indicator = cat_indicator[1:]

        # Not every suit id has an entry in `suit_to_name`; fall back to the
        # numeric id so the run does not abort with a KeyError.
        dataset_name = suit_to_name.get(str(suit), str(suit))
        out_prefix = f'./datasets/{dataset_name}'

        dataset_full = pd.concat([X, y], axis=1)
        dataset_full.to_csv(f'{out_prefix}.csv', index=False)

        # Split the dataset into training (70%), validation (15%), and test (15%) sets
        X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
        X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

        # save the dataset files
        X_train.to_csv(f'{out_prefix}_Xtrain.csv', index=False)
        X_val.to_csv(f'{out_prefix}_Xvalidation.csv', index=False)
        X_test.to_csv(f'{out_prefix}_Xtest.csv', index=False)

        y_train.to_csv(f'{out_prefix}_Ytrain.csv', index=False)
        y_val.to_csv(f'{out_prefix}_Yvalidation.csv', index=False)
        y_test.to_csv(f'{out_prefix}_Ytest.csv', index=False)

        # Checking the shape of the splits
        split_shapes = {
            "Training Features": X_train.shape,
            "Training Labels": y_train.shape,
            "Validation Features": X_val.shape,
            "Validation Labels": y_val.shape,
            "Test Features": X_test.shape,
            "Test Labels": y_test.shape
        }

        print(split_shapes)

        # Column names flagged as categorical; identical for every seed, so
        # computed once outside the seed loop.
        cat_features = X_train.columns[cat_indicator].values

        preds = []

        for seed in range(n_seeds):

            if is_regression:
                gbdt_model = CatBoostRegressor(max_depth=10, random_seed=seed)
            else:
                gbdt_model = CatBoostClassifier(random_seed=seed)
            gbdt_model = gbdt_model.fit(X_train, y_train.values, cat_features=cat_features, silent=True)

            if is_regression:
                pred = gbdt_model.predict(X_test)
            else:
                # Hard 0/1 vote of this seed's model (positive-class prob >= 0.5).
                pred = (gbdt_model.predict_proba(X_test)[:, 1] >= 0.5).astype(int)
            preds.append(pred)

        # Average over seeds: mean prediction for regression, vote share for
        # classification.
        preds = np.mean(np.array(preds), axis=0)

        if is_regression:
            # Continuous targets must not be thresholded to 0/1 and scored
            # with accuracy; report RMSE instead.
            rmse = mean_squared_error(y_test['target'].values, preds) ** 0.5
            print(f'RMSE: {rmse}')
            res_gbdt[suit] = rmse
        else:
            # Majority vote across seeds (ties count as class 1).
            preds = (preds >= 0.5).astype(int)
            accuracy = accuracy_score(y_test['target'].values, preds)
            print(f'Accuracy: {accuracy}')
            res_acc[suit] = accuracy

        # Align the final predictions with the original row labels of X_test.
        preds = pd.DataFrame(preds)
        preds.index = X_test.index
        

334
['Vehicle_Reference_df_res', 'Vehicle_Type', 'Vehicle_Manoeuvre', 'Vehicle_Location-Restricted_Lane', 'Hit_Object_in_Carriageway', 'Hit_Object_off_Carriageway', 'Was_Vehicle_Left_Hand_Drive?', 'Age_of_Driver', 'Age_Band_of_Driver', 'Engine_Capacity_(CC)', 'Propulsion_Code', 'Age_of_Vehicle', 'Location_Easting_OSGR', 'Location_Northing_OSGR', 'Longitude', 'Latitude', 'Police_Force', 'Number_of_Vehicles', 'Number_of_Casualties', 'Local_Authority_(District)', '1st_Road_Number', '2nd_Road_Number', 'Urban_or_Rural_Area', 'Vehicle_Reference_df', 'Casualty_Reference', 'Sex_of_Casualty', 'Age_of_Casualty', 'Age_Band_of_Casualty', 'Pedestrian_Location', 'Pedestrian_Movement', 'Casualty_Type', 'Casualty_IMD_Decile']
{'Training Features': (46939, 32), 'Training Labels': (46939, 1), 'Validation Features': (10059, 32), 'Validation Labels': (10059, 1), 'Test Features': (10059, 32), 'Test Labels': (10059, 1)}
Accuracy: 0.7939158962123471


In [30]:
# Display the held-out label frame left over from the last train/validation/test split.
y_test

Unnamed: 0,target
8072,0
48093,1
4312,0
52098,1
39033,1
...,...
5934,0
41030,1
20801,0
52313,1


In [33]:
# Class balance of the held-out labels.
# NOTE(review): the saved output below sums to 1857 rows, while the training
# cell reports 10059 test rows — this output looks stale (from an earlier run
# on a different dataset); re-run after the split cell to refresh it.
y_test.value_counts()

target
1         958
0         899
dtype: int64