In [1]:
import os
import numpy as np
import pandas as pd
import time
from pathlib import Path
import sys


from sklearn import model_selection
from sklearn.model_selection import GroupKFold
from sklearn import metrics
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score

#Models

import xgboost as xgb
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
#Feature engineerring
from sklearn.preprocessing import FunctionTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer

#Splitting the data
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score

# Hyperparameter optimization

import optuna


import warnings
warnings.filterwarnings("ignore")


In [2]:
#Special packages
!pip install feature_engine
from feature_engine.encoding import WoEEncoder

!git clone https://github.com/analokmaus/kuma_utils.git
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer

Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.4.0
[0mCloning into 'kuma_utils'...
remote: Enumerating objects: 915, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 915 (delta 104), reused 102 (delta 96), pack-reused 795[K
Receiving objects: 100% (915/915), 679.99 KiB | 2.84 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [3]:
# The config file class

class Config:
    path = "../input/tabular-playground-series-aug-2022"
    target = "failure"

In [4]:
# def read_data(path):
#     data_dir = Path(path)

#     train_import = pd.read_csv(data_dir / "train.csv")
#     test_import = pd.read_csv(data_dir / "test.csv")
#     submission_df = pd.read_csv(data_dir / "sample_submission.csv")

#     print(f"train data: Rows={train_import.shape[0]}, Columns={train_import.shape[1]}")
#     print(f"test data : Rows={test_import.shape[0]}, Columns={test_import.shape[1]}")
#     return train_import, test_import, submission_df

In [5]:
data_dir = Path(Config.path)

train_import = pd.read_csv(data_dir / "train.csv", index_col="id")
test_import = pd.read_csv(data_dir / "test.csv", index_col="id")
submission_df = pd.read_csv(data_dir / "sample_submission.csv")

In [6]:
train_import.head()

Unnamed: 0_level_0,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,...,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,...,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,...,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,...,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,...,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,...,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


## Initial data cleaning and re-casting of values

In [7]:
# This step is not needed for this competition

## Feature engineering pipeline

In [8]:
# This will be a custom step for each competition

In [9]:
def preprocessing(df_train = train_import, df_test = test_import):

 #Inputs: The total training and test data set
     
    # Missing indicator
    for df in [df_train, df_test]:
        #Add a 1 whereever there was a missing value
        df["m_3_missing"] = df["measurement_3"].isnull().astype(int)
        df["m_5_missing"] = df["measurement_5"].isnull().astype(int)

        
    # Missing value imputation
    imptr = LGBMImputer(n_iter=50) # Setup the LGBM imputer
    def impute_nans(df_train, df_test):
        nan_features = [col for col in df_train.columns if df_train[col].isnull().any()] # Selects the columns with nan values
        for pc in df_train["product_code"].unique(): # Loops through the unique values for the product code
            #Selects the rows for each unique feature (similar to groupby) and fit_transform the nan values
            df_train.loc[df_train["product_code"]==pc, nan_features] = imptr.fit_transform(df_train.loc[df_train["product_code"]==pc, nan_features])
        
        #Apply the same process to the test set
        nan_features = [col for col in df_test.columns if df_test[col].isnull().any()]
        for pc in df_test["product_code"].unique():
            df_test.loc[df_test["product_code"]==pc, nan_features] = imptr.fit_transform(df_test.loc[df_test["product_code"]==pc, nan_features])

        return df_train, df_test
    
    df_train, df_test = impute_nans(df_train, df_test)
    
    # Area
    for df in [df_train, df_test]: # Loop through the train and test data set and create a new feature by calculation the area
        df["attribute_2*3"] = df["attribute_2"] * df["attribute_3"]
    
    
    # Aggregations
    # Automated way to create additional features by using a list comprehension and f-strings - Output is a list
    meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
    meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
    # loop through the train and test data set and  
    for df in [df_train, df_test]:
        df["meas_gr1_avg"] = np.mean(df[meas_gr1_cols], axis=1) # Calculate the mean of those columns
        df["meas_gr1_std"] = np.std(df[meas_gr1_cols], axis=1) # # Calculate the std, dev of those columns
        df["meas_gr2_avg"] = np.mean(df[meas_gr2_cols], axis=1) # Calculate the mean for the second list of features
    
    # Create a new feature by looping through the train and test set and dividing measure_17 by the average of meas_2_avg
    for df in [df_train, df_test]:
        df["meas17/meas_gr2_avg"] = df["measurement_17"] / df["meas_gr2_avg"]
        
        #Applies the Weight of Evidence encoder to the categorical feature "attribute_0"
        # WoE encoding
    woe_encoder = WoEEncoder(variables=["attribute_0"])
    df_train["attribute_0"] = woe_encoder.fit_transform(df_train["attribute_0"].to_frame(), df_train["failure"])
    df_test["attribute_0"] = woe_encoder.transform(df_test["attribute_0"].to_frame())
    
    
    features = ["attribute_0", "measurement_0", "measurement_1", "measurement_2", "m_3_missing", "m_5_missing",
               "meas_gr1_avg", "meas_gr1_std", "attribute_2*3", "loading", "measurement_17", "meas17/meas_gr2_avg"] \
#                 + list(ohe_att_0_att_23.columns) + ["gb_attribute_2*3_mean_loading"] + list(ohe_att_23.columns)
    
#     del df_all, ohe_att_23, ohe_att_0_att_23
#     gc.collect()
    
    return train_import, test_import, features

In [10]:
# Take-aways: Use list comprehension (including if statements) to select columns for transformation, Use a for loop to apply the transformation to the train and test set

In [11]:
df_train_proc, df_test_proc, features = preprocessing(train_import, test_import)

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [12]:
# Define the target and the feature matrix
df_train_X = df_train_proc.drop(Config.target, axis = 1)

df_train_y = df_train_proc["failure"]




In [13]:
df_train_X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26570 entries, 0 to 26569
Data columns (total 31 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   product_code         26570 non-null  object 
 1   loading              26570 non-null  float64
 2   attribute_0          26570 non-null  float64
 3   attribute_1          26570 non-null  object 
 4   attribute_2          26570 non-null  int64  
 5   attribute_3          26570 non-null  int64  
 6   measurement_0        26570 non-null  int64  
 7   measurement_1        26570 non-null  int64  
 8   measurement_2        26570 non-null  int64  
 9   measurement_3        26570 non-null  float64
 10  measurement_4        26570 non-null  float64
 11  measurement_5        26570 non-null  float64
 12  measurement_6        26570 non-null  float64
 13  measurement_7        26570 non-null  float64
 14  measurement_8        26570 non-null  float64
 15  measurement_9        26570 non-null 

In [14]:
# Define the cross-validation approach

# Initialise lists that store the results from the different folds

auc_list = []
importance_list = []
prediction_list = []

# initialise the cross-validation "class"

KFOLD = GroupKFold(n_splits=5)


for train_index, val_index in KFOLD.split(X = df_train_X, y = df_train_y, groups = df_train_X["product_code"]):
    #Define the train and validation data set
    X_train = df_train_X.iloc[train_index]
    X_val = df_train_X.iloc[val_index]
    y_train = df_train_y.iloc[train_index]
    y_val = df_train_y.iloc[val_index]
    
    # Quick fix: Remove the product code variable
    X_train = X_train.drop(["product_code", "attribute_1"], axis = 1).copy()
    X_val = X_val.drop(["product_code", "attribute_1"], axis = 1).copy()
    # Instaniate the model
    
    model = LogisticRegression(max_iter = 200, C=0.05, penalty='l1', solver='liblinear')
    
    #Fit the model
    model.fit(X_train, y_train)
    
    # Make predictions on the validation set
    val_pred_results = model.predict_proba(X_val)[:,1]
    # Score the prediction by using a scoring function. The true value are the first argument, the predictions the second argument
    score = roc_auc_score( y_val,val_pred_results)
    print(f"The ROC is {score:.4f}")
    # Append the ROC result from each fold to a list
    auc_list.append(score)
    # Append the results from the importance score to a list
    importance_list.append(model.coef_.ravel())
    
    print(f"The average ROC is {np.mean(auc_list)}")

The ROC is 0.5814
The average ROC is 0.581399844902523
The ROC is 0.5840
The average ROC is 0.5826921070337372
The ROC is 0.5888
The average ROC is 0.5847159884700762
The ROC is 0.5944
The average ROC is 0.5871375983669456
The ROC is 0.5908
The average ROC is 0.5878649423106839


## Ideas

In [15]:
# Change fit_transform in the feature engineering process to transform for the test set
# Understand why he is doing the aggregations
# Understand when the to_frame method is required when using an encoder