In [1]:
import os
import time
from pathlib import Path
import sys
import pickle
import joblib

#Import data manipulation libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#import plotting libraries
import matplotlib.pyplot as plt
import seaborn as sns

#Splitting the data and cross-validation
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score,KFold, GroupKFold, StratifiedKFold

#Metrics
#from sklearn.metrics import classification_report,accuracy_score , roc_auc_score
from sklearn.metrics import mean_squared_error, roc_auc_score
#Classifiers
import xgboost as xgb
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier

#Feature engineering
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer, make_column_transformer

#Encoding

import category_encoders as ce

In [2]:
#Optuna optimization

import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [3]:
# Directory holding the competition CSVs. Defined once so the notebook can be
# pointed at another copy of the data by editing a single line.
DATA_DIR = Path("/kaggle/input/tabular-playground-series-mar-2021")

# Raw train/test frames as read from disk.
train_import = pd.read_csv(DATA_DIR / "train.csv")
test_import = pd.read_csv(DATA_DIR / "test.csv")

In [4]:
def _smallest_int_dtype(mn, mx):
    """Return the narrowest NumPy integer dtype whose range contains [mn, mx].

    Unsigned types are tried when ``mn`` is non-negative, signed types
    otherwise. Returns ``None`` when none of the candidates fits.
    """
    if mn >= 0:
        candidates = (np.uint8, np.uint16, np.uint32, np.uint64)
    else:
        candidates = (np.int8, np.int16, np.int32, np.int64)
    for candidate in candidates:
        limits = np.iinfo(candidate)
        # Inclusive bounds: a column whose max is exactly 255 still fits uint8.
        if limits.min <= mn and mx <= limits.max:
            return candidate
    return None


def reduce_mem_usage(props):
    """Downcast the numeric columns of ``props`` to smaller dtypes.

    The DataFrame is modified in place (columns are reassigned on ``props``)
    and the same object is returned. Non-numeric columns are left untouched.

    For every numeric column:
      * missing values are replaced by a sentinel equal to
        ``(smallest finite value) - 1``;
      * if every value is (within 0.01 in total absolute deviation) a whole
        number, the column is cast to the narrowest integer dtype that holds
        its range, including the sentinel;
      * otherwise the column is cast to ``float32``.

    Columns without any finite value are reported in the returned list but
    keep their dtype, since no sentinel can be derived for them.

    Parameters
    ----------
    props : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    (pandas.DataFrame, list)
        ``props`` itself and the names of the columns that contained
        non-finite values (NaN or +/-inf).
    """
    start_mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Columns in which non-finite values were found.
    for col in props.columns:
        column = props[col]
        # Only numeric data can be downcast; strings, categories, datetimes
        # and the like are skipped.
        if not pd.api.types.is_numeric_dtype(column.dtype):
            continue

        print("******************************")
        print("Column: ",col)
        print("dtype before: ",column.dtype)

        finite = np.isfinite(column)
        if not finite.all():
            NAlist.append(col)

        # Range of the finite values only, so NaN/inf cannot distort it.
        mn = column[finite].min()
        mx = column[finite].max()

        if pd.isna(mn):
            # No finite value at all: nothing to base a sentinel or a dtype on.
            print("dtype after: ",props[col].dtype)
            print("******************************")
            continue

        if column.isna().any():
            # Integer dtypes cannot hold NaN, so fill with a sentinel below the
            # observed minimum. The minimum is updated too, otherwise a column
            # starting at 0 would be cast to an unsigned type and the negative
            # sentinel would wrap around.
            mn = mn - 1
            column = column.fillna(mn)

        # A column holding +/-inf can only be stored as float.
        IsInt = False
        if not np.isinf(column).any():
            asint = column.astype(np.int64)
            # Absolute deviations: signed ones can cancel out (0.5 and -0.5)
            # and make a fractional column look integral.
            IsInt = bool((column - asint).abs().sum() < 0.01)

        if IsInt:
            target = _smallest_int_dtype(mn, mx)
            # When no integer dtype fits, keep the dtype and only store the fill.
            props[col] = column if target is None else column.astype(target)
        else:
            props[col] = column.astype(np.float32)

        print("dtype after: ",props[col].dtype)
        print("******************************")

    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [5]:
#reduce_mem_usage(train_import)

In [6]:
# Column names grouped by the role they play in the model.

# Name of the label column.
target_label  = "target"

# Model inputs: every column except the label and the row identifier.
all_feature = [name for name in train_import.columns if name not in ("target", "id")]

# Inputs stored as 64-bit floats are treated as numeric features ...
all_numeric = [name for name in all_feature if train_import[name].dtype == "float64"]
# ... and inputs stored with object dtype as categorical features.
all_categorical = [name for name in all_feature if train_import[name].dtype == "object"]

# The categorical column singled out as high-cardinality (a single name, not a list).
high_cardinal = "cat10"

# The remaining categorical columns, cat0..cat18 without cat10.
low_cardinal = [
    'cat0', 'cat1', 'cat2', 'cat3', 'cat4', 'cat5',
    'cat6', 'cat7', 'cat8', 'cat9',
    'cat11', 'cat12', 'cat13', 'cat14',
    'cat15', 'cat16', 'cat17', 'cat18',
]

In [7]:
# Build the training feature matrix, the training labels and the test feature
# matrix.
#
# Only a 5% sample of the training data is used. The seed is fixed so that
# Restart & Run All draws the same rows every time; without it each run would
# train and validate on a different subset and scores would not be comparable.
# reset_index() keeps the original row labels in an "index" column.
train = train_import.sample(frac=0.05, random_state=42).reset_index()
train_X = train.loc[:, all_feature]
train_y = train.loc[:, target_label].values
test_X = test_import.loc[:, all_feature]
train_X.shape, train_y.shape, test_X.shape

((15000, 30), (15000,), (200000, 30))

In [8]:
# Independent copies of the features and labels that the folds are cut from,
# so the fold logic can later be pointed at different data.
# The labels are wrapped in a single-column DataFrame.
train_split_X = train_X.copy(deep=True)
train_split_y = pd.DataFrame(np.array(train_y))

In [9]:
# Validation scheme: stratified k-fold without shuffling.
n_folds = 5
splitter = StratifiedKFold(n_splits=n_folds, shuffle=False)

In [10]:
# Continuous features go through a standard scaler.
preprocess_continuous = Pipeline([("scaler", StandardScaler())])

# Categorical features are one-hot encoded with category_encoders.
preprocess_categorical = Pipeline(
    [("encoder", ce.OneHotEncoder(cols=all_categorical))]
)

In [11]:
# Route each column group through its own preprocessing pipeline; any column
# not listed in either group is passed through unchanged.
ct = ColumnTransformer(
    [
        ("continuous", preprocess_continuous, all_numeric),
        ("categorical", preprocess_categorical, all_categorical),
    ],
    remainder="passthrough",
)

In [12]:
# XGBoost classifier with a logistic objective and a fixed seed.
model = xgb.XGBClassifier(
    random_state=42,
    objective="binary:logistic",
)

In [13]:
#model = LogisticRegression()

In [14]:
# Per-fold ROC AUC scores (plain Python lists, one entry appended per fold).
train_list, val_list = [], []

for trn, val in splitter.split(train_split_X, train_split_y):
    # trn / val are positional row indices for this fold.
    Xtrain, Xval = train_split_X.iloc[trn], train_split_X.iloc[val]
    ytrain, yval = train_split_y.iloc[trn], train_split_y.iloc[val]

    # The preprocessing is fitted on the training part only and then applied
    # unchanged to the validation part.
    Xtrain_prepocessed = ct.fit_transform(Xtrain, ytrain)
    Xval_preprocessed = ct.transform(Xval)

    model.fit(Xtrain_prepocessed, ytrain)

    # Second predict_proba column -- presumably the positive class.
    train_pred = model.predict_proba(Xtrain_prepocessed)[:, 1]
    valid_pred = model.predict_proba(Xval_preprocessed)[:, 1]

    train_score = roc_auc_score(ytrain, train_pred)
    valid_score = roc_auc_score(yval, valid_pred)

    train_list.append(train_score)
    val_list.append(valid_score)
    # TODO (author's note): also collect the per-fold importance scores.

print(f"The average training ROC is {np.mean(train_list):.3f}")
print(f"The average validation ROC is {np.mean(val_list):.3f}")

The average training ROC is 0.990
The average validation ROC is 0.872


In [15]:
# Show the mean validation score across the folds stored in val_list.
print(f"The average validation ROC is {np.mean(val_list):.3f}")

The average validation ROC is 0.872


In [16]:
#Ideas: separate lists for binary, ordinal and nominal categories