#### Please upvote if you find the notebook interesting/useful :)

## OK, as we can see from version 4, we can achieve 0.81420 without pseudolabelling.

# Install [AutoWoe](https://github.com/sberbank-ai-lab/AutoMLWhitebox) library

This library is part of the [LightAutoML](https://github.com/sberbank-ai-lab/LightAutoML) framework and is used in its Whitebox preset, but here we show how to use it on its own.

In [1]:
!pip install -U autowoe

Collecting autowoe
  Downloading AutoWoE-1.2.5-py3-none-any.whl (204 kB)
[K     |████████████████████████████████| 204 kB 895 kB/s 
Collecting sphinx
  Downloading Sphinx-3.5.4-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 4.6 MB/s 
Collecting sphinxcontrib-devhelp
  Downloading sphinxcontrib_devhelp-1.0.2-py2.py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 1.9 MB/s 
Collecting imagesize
  Downloading imagesize-1.2.0-py2.py3-none-any.whl (4.8 kB)
Collecting sphinxcontrib-serializinghtml
  Downloading sphinxcontrib_serializinghtml-1.1.4-py2.py3-none-any.whl (89 kB)
[K     |████████████████████████████████| 89 kB 3.6 MB/s 
[?25hCollecting sphinxcontrib-htmlhelp
  Downloading sphinxcontrib_htmlhelp-1.0.3-py2.py3-none-any.whl (96 kB)
[K     |████████████████████████████████| 96 kB 43 kB/s 
[?25hCollecting sphinxcontrib-applehelp
  Downloading sphinxcontrib_applehelp-1.0.2-py2.py3-none-any.whl (121 kB)
[K     |

# Imports 

In [2]:
%matplotlib inline

import pandas as pd
import numpy as np

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from matplotlib import pyplot as plt

from autowoe import AutoWoE, ReportDeco

# Data loading

In [3]:
# Directory holding the competition CSV files; every read below is relative to it.
INPUT_PATH = '../input/tabular-playground-series-apr-2021/'
# Labelled training set (includes the 'Survived' target column).
train_data = pd.read_csv(INPUT_PATH + 'train.csv')
train_data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.00,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.00,0,0,427635,7.76,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
99995,99995,1,2,"Bell, Adele",female,62.00,0,0,PC 15008,14.86,D17243,C
99996,99996,0,2,"Brown, Herman",male,66.00,0,0,13273,11.15,,S
99997,99997,0,3,"Childress, Charles",male,37.00,0,0,,9.95,,S
99998,99998,0,3,"Caughlin, Thomas",male,51.00,0,1,458654,30.92,,S


In [4]:
# Unlabelled test set: same columns as train except that 'Survived' is absent.
test_data = pd.read_csv(INPUT_PATH + 'test.csv')
test_data

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,100000,3,"Holliday, Daniel",male,19.0,0,0,24745,63.01,,S
1,100001,3,"Nguyen, Lorraine",female,53.0,0,0,13264,5.81,,S
2,100002,1,"Harris, Heather",female,19.0,0,0,25990,38.91,B15315,C
3,100003,2,"Larsen, Eric",male,25.0,0,0,314011,12.93,,S
4,100004,1,"Cleary, Sarah",female,17.0,0,2,26203,26.89,B22515,C
...,...,...,...,...,...,...,...,...,...,...,...
99995,199995,3,"Cash, Cheryle",female,27.0,0,0,7686,10.12,,Q
99996,199996,1,"Brown, Howard",male,59.0,1,0,13004,68.31,,S
99997,199997,3,"Lightfoot, Cameron",male,47.0,0,0,4383317,10.87,,S
99998,199998,1,"Jacobsen, Margaret",female,49.0,1,2,PC 26988,29.68,B20828,C


In [5]:
# Submission template; its 'Survived' column is overwritten with the final labels
# in the "Create submissions" section.
submission = pd.read_csv(INPUT_PATH + 'sample_submission.csv')

In [6]:
# Mean of the 0/1 target, i.e. the share of positive ('Survived' == 1) rows in train.
print('TRAIN TARGET MEAN = {:.3f}'.format(train_data['Survived'].mean()))

TRAIN TARGET MEAN = 0.428


# Load OOFs and Test predictions

In [7]:
import joblib

# Weight of model 1 when blending the two stored prediction sets (model 2 gets 1 - weight).
PREV_BLEND_W = 0.48

# NOTE: joblib.load unpickles the file, and unpickling can execute arbitrary code,
# so only load artefacts from a source you trust.
# The tuple holds, in order: OOF and test predictions of model 1, then of model 2.
oof_preds_1, test_preds_1, oof_preds_2, test_preds_2 = joblib.load('../input/lightautoml-interpretable-model-autowoe/both_preproc_preds.pkl')
oofs = PREV_BLEND_W * oof_preds_1 + (1 - PREV_BLEND_W) * oof_preds_2
test_preds = PREV_BLEND_W * test_preds_1 + (1 - PREV_BLEND_W) * test_preds_2

In [8]:
# Attach the blended stored predictions as an extra column 'preds':
# out-of-fold values for train rows, test predictions for test rows.
train_data['preds'] = oofs
test_data['preds'] = test_preds

# Extra feature creation functions

In [9]:
def create_extra_features_1(data_0):
    """Build the first engineered feature set from the raw passenger table.

    The input frame is copied, so ``data_0`` itself is never modified.

    Transformations applied to the copy:

    * ``Cabin`` is reduced to its first character; a missing cabin becomes
      ``'n'`` (the first letter of ``str(nan)``).
    * ``Ticket1`` is the first whitespace-separated token of ``Ticket`` when
      the ticket has more than one token, otherwise NaN.
    * ``Ticket2`` is ``Ticket`` with dots, digits and spaces removed; a ticket
      that ends up empty becomes ``'X'``; a missing ticket stays NaN.
    * ``FamilySize = SibSp + Parch + 1``.
    * ``FirstName`` is the text before the first comma of ``Name`` and
      ``Surname`` is the second comma-separated field (any leading space is
      kept). Note that for names written as ``"Oconnor, Frankie"`` this means
      ``FirstName`` holds the part *before* the comma.
    * ``Counter_Name``, ``Counter_FirstName``, ``Counter_Surname`` give, for
      each row, how many rows with a non-null ``PassengerId`` share that value.
    * ``Name``, ``Surname`` and ``Ticket`` are dropped.
    * Nine ``<A>_<B>`` columns concatenate the string forms of two columns.

    Args:
        data_0: DataFrame with at least the columns ``PassengerId``, ``Name``,
            ``Ticket``, ``Cabin``, ``SibSp``, ``Parch`` and ``Embarked``.

    Returns:
        A new DataFrame with the columns described above.
    """
    def _second_name_field(name):
        # A name without a comma has no second field; report it as missing
        # instead of raising IndexError.
        fields = str(name).split(',')
        return fields[1] if len(fields) > 1 else np.nan

    data = data_0.copy()
    data.Cabin = data.Cabin.map(lambda x: str(x)[0].strip())

    data['Ticket1'] = data.Ticket.map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else np.nan)
    # Raw strings keep the regex escapes (\. and \d) from being read as
    # invalid Python string escapes.
    data['Ticket2'] = (
        data.Ticket.str.replace(r'\.', '', regex=True)
        .str.replace(r'(\d+)', '', regex=True)
        .str.replace(' ', '', regex=True)
        .replace(r'^\s*$', 'X', regex=True)
    )

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    data['FirstName'] = data.Name.map(lambda x: str(x).split(',')[0])
    data['Surname'] = data.Name.map(_second_name_field)

    for col in ['Name', 'FirstName', 'Surname']:
        data['Counter_' + col] = data[col].map(data.groupby(col)['PassengerId'].count().to_dict())

    data = data.drop(columns=['Name', 'Surname', 'Ticket'])

    for left, right in [('Ticket1', 'Cabin'), ('Ticket2', 'Cabin'),
                        ('Cabin', 'Parch'), ('Ticket1', 'Parch'), ('Ticket2', 'Parch'),
                        ('Cabin', 'Embarked'), ('Ticket1', 'Embarked'), ('Ticket2', 'Embarked'), ('Embarked', 'Parch')
                        ]:
        data[left + '_' + right] = data[left].astype(str) + '_' + data[right].astype(str)

    return data

def create_extra_features_2(data_0):
    """Build the second, smaller engineered feature set from the raw passenger table.

    The input frame is copied, so ``data_0`` itself is never modified.

    Transformations applied to the copy:

    * ``Cabin`` is reduced to its first character; a missing cabin becomes
      ``'n'`` (the first letter of ``str(nan)``).
    * ``Ticket`` is replaced by its first whitespace-separated token when the
      ticket has more than one token, otherwise by NaN.
    * ``FamilySize = SibSp + Parch + 1``.
    * ``FirstName`` is the text before the first comma of ``Name`` and
      ``Surname`` is the second comma-separated field (any leading space is
      kept). Note that for names written as ``"Oconnor, Frankie"`` this means
      ``FirstName`` holds the part *before* the comma.
    * ``Counter_Name``, ``Counter_FirstName``, ``Counter_Surname`` give, for
      each row, how many rows with a non-null ``PassengerId`` share that value.
    * ``Name`` and ``Surname`` are dropped.

    Args:
        data_0: DataFrame with at least the columns ``PassengerId``, ``Name``,
            ``Ticket``, ``Cabin``, ``SibSp`` and ``Parch``.

    Returns:
        A new DataFrame with the columns described above.
    """
    def _second_name_field(name):
        # A name without a comma has no second field; report it as missing
        # instead of raising IndexError.
        fields = str(name).split(',')
        return fields[1] if len(fields) > 1 else np.nan

    data = data_0.copy()
    data.Cabin = data.Cabin.map(lambda x: str(x)[0].strip())
    data.Ticket = data.Ticket.map(lambda x: str(x).split()[0] if len(str(x).split()) > 1 else np.nan)

    data['FamilySize'] = data['SibSp'] + data['Parch'] + 1

    data['FirstName'] = data.Name.map(lambda x: str(x).split(',')[0])
    data['Surname'] = data.Name.map(_second_name_field)

    for col in ['Name', 'FirstName', 'Surname']:
        data['Counter_' + col] = data[col].map(data.groupby(col)['PassengerId'].count().to_dict())

    data = data.drop(columns=['Name', 'Surname'])

    return data


# Train and test stacked into one frame with a fresh 0..N-1 index; test rows have no
# 'Survived' value. The feature functions are later applied to this combined frame,
# so their Counter_* columns count occurrences across both sets.
all_df = pd.concat([train_data, test_data]).reset_index(drop = True)
print(all_df.shape)

(200000, 13)


# Train 2 separate models for Sex on full train and pseudolabelled test

In [10]:
# Name of the target column: passed to AutoWoE.fit and used for every accuracy check below.
TARGET = 'Survived'

def get_model():
    """Return a new, unfitted AutoWoE instance with this notebook's fixed settings.

    Every call builds an independent object, so each CV fold gets its own model.
    """
    settings = dict(
        monotonic=False,
        vif_th=20.,
        imp_th=0,
        th_const=32,
        force_single_split=True,
        min_bin_size=0.01,
        oof_woe=True,
        n_folds=10,
        n_jobs=4,
        regularized_refit=True,
        verbose=2,
    )
    return AutoWoE(**settings)

def fit_autowoe(data, model_name):
    """Fit a fresh AutoWoE model on ``data`` and print its in-sample accuracy.

    Args:
        data: DataFrame containing the ``TARGET`` column and a ``PassengerId``
            column; ``PassengerId`` is removed before fitting.
        model_name: Label used only in the printed accuracy line.

    Returns:
        The fitted AutoWoE model.
    """
    fitted = get_model()
    fit_frame = data.drop('PassengerId', axis = 1)
    fitted.fit(fit_frame, target_name=TARGET)

    # Scored on the very rows the model was fitted on (full frame passed, as for fitting input).
    in_sample_proba = fitted.predict_proba(data)
    in_sample_labels = (in_sample_proba > 0.5).astype(int)
    in_sample_acc = accuracy_score(data[TARGET], in_sample_labels)
    print('\t{}: ACCURACY ON TRAIN = {:.5f}'.format(model_name, in_sample_acc))
    return fitted

def cv_autowoe(data, 
               test_data, 
               n_folds = 5, 
               sex = 'Unknown'):
    """Run stratified K-fold cross-validation of AutoWoE on ``data``.

    For every fold a new model is fitted on the training part, its
    probabilities for the held-out part are written into the out-of-fold
    vector, and its probabilities for ``test_data`` are added to a running
    average over folds. Progress and accuracies (threshold 0.5) are printed.

    Args:
        data: Labelled DataFrame containing the ``TARGET`` column.
        test_data: DataFrame to score with every fold model.
        n_folds: Number of stratified folds (shuffled, ``random_state=13``).
        sex: Label used only in printed messages and model names.

    Returns:
        Tuple ``(models, oof_pred, test_pred)``: the list of fold models, the
        out-of-fold probabilities aligned positionally with ``data``, and the
        fold-averaged probabilities for ``test_data``.
    """
    print('=' * 50)
    print('Start model for sex = {}'.format(sex))

    oof_pred = np.zeros(len(data))
    test_pred = np.zeros(len(test_data))
    models = []

    splitter = StratifiedKFold(n_splits = n_folds, shuffle = True, random_state = 13)
    labels = data[TARGET]
    # The split only needs the labels for stratification; indices are positional.
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(labels, labels)):
        print('\tStart model sex = {}, fold = {}'.format(sex, fold))
        model_name = 'Model_{}_{}'.format(sex, fold)

        model = fit_autowoe(data.iloc[fit_rows, :], model_name)
        models.append(model)

        holdout = data.iloc[holdout_rows, :]
        holdout_proba = model.predict_proba(holdout)
        oof_pred[holdout_rows] = holdout_proba
        holdout_acc = accuracy_score(holdout[TARGET], (holdout_proba > 0.5).astype(int))
        print('\t{}: ACCURACY ON VALID = {:.5f}'.format(model_name, holdout_acc))

        # Each fold contributes 1/n_folds of the final test prediction.
        test_pred += model.predict_proba(test_data) / n_folds
        print('\t' + '*' * 50)

    oof_acc = accuracy_score(labels, (oof_pred > 0.5).astype(int))
    print('ACCURACY ON OOF = {:.5f}'.format(oof_acc))

    return models, oof_pred, test_pred

def build_model_split_by_sex(train_data, test_data, n_folds):
    """Cross-validate one AutoWoE model family per sex and merge their predictions.

    The male and female subsets of ``train_data`` are cross-validated
    separately; both runs score the *full* ``test_data``. Out-of-fold values
    are written back to the positions of the corresponding training rows
    (rows whose ``Sex`` is neither ``'male'`` nor ``'female'`` keep 0.0).
    For the test set, rows with ``Sex == 'male'`` take the male prediction and
    all other rows take the female prediction.

    Args:
        train_data: Labelled DataFrame with a ``Sex`` column.
        test_data: DataFrame with a ``Sex`` column to predict for.
        n_folds: Number of CV folds used for each sex.

    Returns:
        Tuple ``(oof_preds, test_preds)`` of 1-D arrays aligned positionally
        with ``train_data`` and ``test_data``.
    """
    is_male_train = train_data['Sex'] == 'male'
    is_female_train = train_data['Sex'] == 'female'

    _, male_oof, male_test = cv_autowoe(train_data[is_male_train], test_data, n_folds, 'male')
    print('=' * 50 + '\n' + '=' * 50)
    _, female_oof, female_test = cv_autowoe(train_data[is_female_train], test_data, n_folds, 'female')

    oof_preds = np.zeros(len(train_data))
    oof_preds[is_male_train] = male_oof
    oof_preds[is_female_train] = female_oof

    test_preds = np.where(test_data['Sex'] == 'male', male_test, female_test)
    return oof_preds, test_preds

In [11]:
# Preview of the combined frame, which now carries the 'preds' column.
all_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,preds
0,0,1.0,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S,0.239244
1,1,0.0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S,0.093614
2,2,0.0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S,0.105634
3,3,0.0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S,0.079921
4,4,1.0,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S,0.093814


In [12]:
%%time

# Feature set 1 is computed on train+test together, then split back by position:
# the first len(train_data) rows are train, the remainder is test.
# Note that train_data / test_data are rebound to the engineered frames here.
gen_feats = create_extra_features_1(all_df)
train_data, test_data = gen_feats[:len(train_data)], gen_feats[len(train_data):]
print(train_data.shape, test_data.shape)
# 10-fold CV per sex. This overwrites the oof_preds_1 / test_preds_1 loaded from disk earlier.
oof_preds_1, test_preds_1 = build_model_split_by_sex(train_data, test_data, 10)

(100000, 27) (100000, 27)
Start model for sex = male
	Start model sex = male, fold = 0
 features ['Sex'] contain too many nans or identical values
[LightGBM] [Info] Number of positive: 8291, number of negative: 32110
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4942
[LightGBM] [Info] Number of data points in the train set: 40401, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.205218 -> initscore=-1.353997
[LightGBM] [Info] Start training from score -1.353997
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	val_set's auc: 0.776074
 features ['Cabin'] have low importance
Pclass processing...
SibSp processing...
Fare processing...
preds processing...
Parch processing...
Embarked processing...
Age processing...
Ticket1 processing...
Ticket2 processing...
FirstName processing...
Counter_FirstName processing...
Ticket1_Cabin processing...
FamilySize processing...
Count

In [13]:
# Out-of-fold accuracy of the feature-set-1 models on the full training set (threshold 0.5).
print('ACCURACY = {:.5f}'.format(accuracy_score(train_data[TARGET], (oof_preds_1 > 0.5).astype(int))))

ACCURACY = 0.77967


In [14]:
%%time

# Feature set 2 is computed from the same combined frame and split back by position;
# train_data / test_data are rebound again (their length is unchanged, so the split point is the same).
gen_feats = create_extra_features_2(all_df)
train_data, test_data = gen_feats[:len(train_data)], gen_feats[len(train_data):]
print(train_data.shape, test_data.shape)
# 10-fold CV per sex. This overwrites the oof_preds_2 / test_preds_2 loaded from disk earlier.
oof_preds_2, test_preds_2 = build_model_split_by_sex(train_data, test_data, 10)

(100000, 17) (100000, 17)
Start model for sex = male
	Start model sex = male, fold = 0
 features ['Sex'] contain too many nans or identical values
[LightGBM] [Info] Number of positive: 8291, number of negative: 32110
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4093
[LightGBM] [Info] Number of data points in the train set: 40401, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.205218 -> initscore=-1.353997
[LightGBM] [Info] Start training from score -1.353997
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[18]	val_set's auc: 0.77031
 features [] have low importance
Pclass processing...
Age processing...
SibSp processing...
Parch processing...
Ticket processing...
Fare processing...
Cabin processing...
Embarked processing...
preds processing...
FamilySize processing...
FirstName processing...
Counter_Name

In [15]:
# Out-of-fold accuracy of the feature-set-2 models on the full training set (threshold 0.5).
print('ACCURACY = {:.5f}'.format(accuracy_score(train_data[TARGET], (oof_preds_2 > 0.5).astype(int))))

ACCURACY = 0.77953


In [16]:
# Grid-search the blend weight w over 0.00 .. 1.00 (step 0.01): w goes to model 1,
# (1 - w) to model 2, and each blend is scored by OOF accuracy at threshold 0.5.
best_score, best_w = -1, None
for w in np.arange(0, 1.01, 0.01):
    comb_pred = w * oof_preds_1 + (1 - w) * oof_preds_2
    score = accuracy_score(train_data[TARGET], (comb_pred > 0.5).astype(int))
    print('{:.2f} ACCURACY = {:.5f}'.format(w, score))

    # Strict '>' means the smallest w wins among equally good weights.
    if score > best_score:
        best_score, best_w = score, w

0.00 ACCURACY = 0.77953
0.01 ACCURACY = 0.77954
0.02 ACCURACY = 0.77951
0.03 ACCURACY = 0.77954
0.04 ACCURACY = 0.77953
0.05 ACCURACY = 0.77948
0.06 ACCURACY = 0.77948
0.07 ACCURACY = 0.77958
0.08 ACCURACY = 0.77959
0.09 ACCURACY = 0.77958
0.10 ACCURACY = 0.77958
0.11 ACCURACY = 0.77960
0.12 ACCURACY = 0.77961
0.13 ACCURACY = 0.77960
0.14 ACCURACY = 0.77956
0.15 ACCURACY = 0.77957
0.16 ACCURACY = 0.77957
0.17 ACCURACY = 0.77957
0.18 ACCURACY = 0.77960
0.19 ACCURACY = 0.77959
0.20 ACCURACY = 0.77960
0.21 ACCURACY = 0.77962
0.22 ACCURACY = 0.77962
0.23 ACCURACY = 0.77963
0.24 ACCURACY = 0.77963
0.25 ACCURACY = 0.77969
0.26 ACCURACY = 0.77969
0.27 ACCURACY = 0.77969
0.28 ACCURACY = 0.77968
0.29 ACCURACY = 0.77966
0.30 ACCURACY = 0.77967
0.31 ACCURACY = 0.77967
0.32 ACCURACY = 0.77966
0.33 ACCURACY = 0.77963
0.34 ACCURACY = 0.77964
0.35 ACCURACY = 0.77961
0.36 ACCURACY = 0.77962
0.37 ACCURACY = 0.77958
0.38 ACCURACY = 0.77958
0.39 ACCURACY = 0.77955
0.40 ACCURACY = 0.77955
0.41 ACCURACY = 

In [17]:
# Report the weight with the highest OOF accuracy found by the grid search above.
print('BEST W = {:.2f}, BEST ACCURACY = {:.5f}'.format(best_w, best_score))

BEST W = 0.92, BEST ACCURACY = 0.77970


In [18]:
# Blend the two models' test predictions with the weight selected on the OOF predictions.
preds = best_w * test_preds_1 + (1 - best_w) * test_preds_2

# Create submissions

In [19]:
# bound = the 34,911th-highest blended test prediction; it is used as the decision threshold.
# NOTE(review): 34911 is a hard-coded count whose origin is not shown in this notebook —
# presumably a target number of predicted positives; confirm before reusing.
bound = pd.Series(preds).sort_values(ascending = False).head(34911).values[-1]
bound

0.4177912594459845

In [20]:
# Rows scoring strictly above `bound` are labelled 1; rows equal to `bound` get 0.
submission['Survived'] = (preds > bound).astype(int)
submission.to_csv('1_AutoWoE_submission_combo.csv', index = False)

In [21]:
# Share of test rows labelled 1 in the submission.
submission['Survived'].mean()

0.34876

# Appendix

In [22]:
import joblib
# Persist both models' OOF and test predictions as one tuple, in the same order
# (oof_1, test_1, oof_2, test_2) and under the same file name that the
# "Load OOFs and Test predictions" cell unpacks.
joblib.dump((oof_preds_1, test_preds_1, oof_preds_2, test_preds_2), 'both_preproc_preds.pkl')

['both_preproc_preds.pkl']