In [1]:
import numpy as np
import pandas as pd
pd.options.display.max_columns = 30

from tqdm.notebook import tqdm

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Data

In [2]:
df_train = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv')
print(df_train.shape)
df_train.head()

(26570, 26)


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,failure
0,0,A,80.1,material_7,material_8,9,5,7,8,4,18.04,12.518,15.748,19.292,11.739,20.155,10.672,15.859,17.594,15.193,15.029,,13.034,14.684,764.1,0
1,1,A,84.89,material_7,material_8,9,5,14,3,3,18.213,11.54,17.717,17.893,12.748,17.889,12.448,17.947,17.915,11.755,14.732,15.425,14.395,15.631,682.057,0
2,2,A,82.43,material_7,material_8,9,5,12,1,5,18.057,11.652,16.738,18.24,12.718,18.288,12.715,15.607,,13.798,16.711,18.631,14.094,17.946,663.376,0
3,3,A,101.07,material_7,material_8,9,5,13,2,6,17.295,11.188,18.576,18.339,12.583,19.06,12.471,16.346,18.377,10.02,15.25,15.562,16.154,17.172,826.282,0
4,4,A,188.06,material_7,material_8,9,5,9,2,8,19.346,12.95,16.99,15.746,11.306,18.093,10.337,17.082,19.932,12.428,16.182,12.76,13.153,16.412,579.885,0


In [3]:
df_valid = pd.read_csv('../input/tabular-playground-series-aug-2022/test.csv')
print(df_valid.shape)
df_valid.head()

(20775, 25)


Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,measurement_7,measurement_8,measurement_9,measurement_10,measurement_11,measurement_12,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17
0,26570,F,119.57,material_5,material_6,6,4,6,9,6,19.305,10.178,17.534,18.168,11.598,18.654,10.802,15.909,18.07,13.772,13.659,16.825,13.742,17.71,634.612
1,26571,F,113.51,material_5,material_6,6,4,11,8,0,17.883,11.927,17.228,16.033,11.179,19.368,12.032,13.998,,12.473,17.468,16.708,14.776,14.102,537.037
2,26572,F,112.16,material_5,material_6,6,4,8,12,4,18.475,10.481,16.619,18.189,12.126,17.774,11.743,17.046,18.086,10.907,13.363,15.737,17.065,16.021,658.995
3,26573,F,112.72,material_5,material_6,6,4,8,11,10,16.518,10.888,15.293,18.592,11.304,18.948,11.79,18.165,16.163,10.933,15.501,15.667,12.62,16.111,594.301
4,26574,F,208.0,material_5,material_6,6,4,14,16,8,17.808,12.693,17.678,15.814,13.431,19.141,12.37,14.578,17.849,11.941,16.07,16.183,13.324,17.15,801.044


## Preprocessing

In [4]:
FEATURE_COLUMNS = df_train.columns.tolist()[2:-1]
CATEGORICAL_COLUMNS = ['attribute_0', 'attribute_1']
NUMERIC_COLUMNS = list(set(FEATURE_COLUMNS) - set(CATEGORICAL_COLUMNS))
x_train = df_train[FEATURE_COLUMNS]
x_valid = df_valid[FEATURE_COLUMNS]
y_train = df_train['failure']

In [5]:
numeric_pipeline = lambda: Pipeline([
    ('imputer', SimpleImputer(strategy='median')), ('scaler', StandardScaler())
])
categorical_pipeline = lambda: Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')), ('encoder', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

preprocessor = lambda: ColumnTransformer([
    ('numeric', numeric_pipeline(), NUMERIC_COLUMNS), ('categorical', categorical_pipeline(), CATEGORICAL_COLUMNS)
])

# Adversarial Validation

In [6]:
y_adv_train = pd.Series([0 for _ in range(len(x_train))], name='is_valid_set')
y_adv_valid = pd.Series([1 for _ in range(len(x_valid))], name='is_valid_set')

x_adv = pd.concat([x_train, x_valid])
y_adv = pd.concat([y_adv_train, y_adv_valid])

x_adv.shape, y_adv.shape

((47345, 23), (47345,))

In [7]:
model_adv = Pipeline([('preprocessor', preprocessor()), ('estimator', LogisticRegression(random_state=0))])
preds_adv = cross_val_predict(estimator=model_adv, X=x_adv, y=y_adv, cv=10, method='predict_proba')[:, 1]
roc_auc_score(y_adv, preds_adv)

1.0

**There is clear seperation between the distribution on training data and validation data**

# Cross Validation Predictions

In [8]:
skf = StratifiedKFold(n_splits=10)

preds_oof = pd.Series([-1 for _ in range(len(x_train))], name='oof_predictions')
preds_val = []

for train_index, valid_index in tqdm(skf.split(x_train, y_train), total=10):
    x_tr, x_vl, x_ts = x_train.iloc[train_index], x_train.iloc[valid_index], x_valid
    y_tr, y_vl = y_train.iloc[train_index], y_train.iloc[valid_index]
    
    model = Pipeline([('preprocessor', preprocessor()), ('estimator', LogisticRegression(random_state=0))])
    model.fit(x_tr, y_tr)
    
    preds_oof.iloc[valid_index] = model.predict_proba(x_vl)[:, 1]
    preds_val.append(model.predict_proba(x_ts)[:, 1])

preds_val = pd.DataFrame(np.array(preds_val).T, columns=[f'model_{i}' for i in range(10)])
roc_auc_score(y_train, preds_oof)

  0%|          | 0/10 [00:00<?, ?it/s]

0.585361690200943

# Pseudo Labelling

In [9]:
preds_val_stats = pd.DataFrame()
preds_val_stats['mean'] = preds_val.mean(axis=1)
preds_val_stats['std'] = preds_val.std(axis=1)
preds_val_stats['mean_conf'] = preds_val_stats['mean'].apply(lambda x: 1 - x if x < 0.5 else x)

preds_val_pseudo = preds_val_stats['mean'].round().astype(int)
preds_val_pseudo.name = 'failure'

## Based on Mean Confidence

In [10]:
thresh = 0.87
pseudo_index = preds_val_stats[preds_val_stats['mean_conf'] > thresh].index
x_pseudo = pd.concat([x_train, x_valid.iloc[pseudo_index]]).reset_index(drop=True)
y_pseudo = pd.concat([y_train, preds_val_pseudo.iloc[pseudo_index]]).reset_index(drop=True)

x_pseudo.shape, y_pseudo.shape

((26799, 23), (26799,))

In [11]:
skf = StratifiedKFold(n_splits=10)

preds_oof = pd.Series([-1 for _ in range(len(x_pseudo))], name='oof_predictions')
preds_val = []

for train_index, valid_index in tqdm(skf.split(x_pseudo, y_pseudo), total=10):
    x_tr, x_vl, x_ts = x_pseudo.iloc[train_index], x_pseudo.iloc[valid_index], x_valid
    y_tr, y_vl = y_pseudo.iloc[train_index], y_pseudo.iloc[valid_index]
    
    model = Pipeline([('preprocessor', preprocessor()), ('estimator', LogisticRegression(random_state=0))])
    model.fit(x_tr, y_tr)
    
    preds_oof.iloc[valid_index] = model.predict_proba(x_vl)[:, 1]
    preds_val.append(model.predict_proba(x_ts)[:, 1])

preds_val = pd.DataFrame(np.array(preds_val).T, columns=[f'model_{i}' for i in range(10)])
roc_auc_score(y_pseudo, preds_oof)

  0%|          | 0/10 [00:00<?, ?it/s]

0.5839692960154876

## Based on Strandard Deviation

In [12]:
thresh = 0.0023
pseudo_index = preds_val_stats[preds_val_stats['std'] < thresh].index
x_pseudo = pd.concat([x_train, x_valid.iloc[pseudo_index]]).reset_index(drop=True)
y_pseudo = pd.concat([y_train, preds_val_pseudo.iloc[pseudo_index]]).reset_index(drop=True)

x_pseudo.shape, y_pseudo.shape

((26794, 23), (26794,))

In [13]:
skf = StratifiedKFold(n_splits=10)

preds_oof = pd.Series([-1 for _ in range(len(x_pseudo))], name='oof_predictions')
preds_val = []

for train_index, valid_index in tqdm(skf.split(x_pseudo, y_pseudo), total=10):
    x_tr, x_vl, x_ts = x_pseudo.iloc[train_index], x_pseudo.iloc[valid_index], x_valid
    y_tr, y_vl = y_pseudo.iloc[train_index], y_pseudo.iloc[valid_index]
    
    model = Pipeline([('preprocessor', preprocessor()), ('estimator', LogisticRegression(random_state=0))])
    model.fit(x_tr, y_tr)
    
    preds_oof.iloc[valid_index] = model.predict_proba(x_vl)[:, 1]
    preds_val.append(model.predict_proba(x_ts)[:, 1])

preds_val = pd.DataFrame(np.array(preds_val).T, columns=[f'model_{i}' for i in range(10)])
roc_auc_score(y_pseudo, preds_oof)

  0%|          | 0/10 [00:00<?, ?it/s]

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.5819741803354687