In [1]:
!pip install feature_engine

Collecting feature_engine
  Downloading feature_engine-1.4.0-py2.py3-none-any.whl (276 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.4/276.4 kB[0m [31m915.8 kB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature_engine
Successfully installed feature_engine-1.4.0
[0m

In [2]:

import pandas as pd 
import sys
import numpy as np
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.ensemble import VotingClassifier
import gc

In [3]:
train = pd.read_csv('../input/tabular-playground-series-aug-2022/train.csv')
train.drop('id', axis=1, inplace=True)
test = pd.read_csv('../input/tabular-playground-series-aug-2022/test.csv')
test.drop('id', axis=1, inplace=True)

In [4]:
train['m_3_missing'] = train.measurement_3.isna()
train['m_5_missing'] = train.measurement_5.isna()

test['m_3_missing'] = test.measurement_5.isna()
test['m_5_missing'] = test.measurement_5.isna()

In [5]:
!git clone https://github.com/analokmaus/kuma_utils.git
sys.path.append("kuma_utils/")
from kuma_utils.preprocessing.imputer import LGBMImputer
df_A = train[train['product_code']=='A']
df_B = train[train['product_code']=='B']
df_C = train[train['product_code']=='C']
df_D = train[train['product_code']=='D']
df_E = train[train['product_code']=='E']

df_F_t = test[test['product_code']=='F']
df_G_t = test[test['product_code']=='G']
df_H_t = test[test['product_code']=='H']
df_I_t = test[test['product_code']=='I']

Cloning into 'kuma_utils'...
remote: Enumerating objects: 915, done.[K
remote: Counting objects: 100% (120/120), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 915 (delta 104), reused 102 (delta 96), pack-reused 795[K
Receiving objects: 100% (915/915), 679.99 KiB | 1.87 MiB/s, done.
Resolving deltas: 100% (592/592), done.


In [6]:
target = train.pop('failure')
float_cols = [col for col in train.columns if train[col].dtypes == 'float64']
object_cols = [col for col in train.columns if train[col].dtypes == 'object']
int_object_cols = [col for col in train.columns[1:-1] if (train[col].dtypes == 'object' or train[col].dtypes == 'int64')]
nullValue_cols = [col for col in train.columns if train[col].isnull().sum()!=0]

In [7]:
lgbm_imtr = LGBMImputer(cat_features=object_cols, n_iter=50)

# train dataset
train_iterimp_A = lgbm_imtr.fit_transform(df_A[nullValue_cols])
train_iterimp_B = lgbm_imtr.fit_transform(df_B[nullValue_cols])
train_iterimp_C = lgbm_imtr.fit_transform(df_C[nullValue_cols])
train_iterimp_D = lgbm_imtr.fit_transform(df_D[nullValue_cols])
train_iterimp_E = lgbm_imtr.fit_transform(df_E[nullValue_cols])

# test dataset
test_iterimp_F = lgbm_imtr.fit_transform(df_F_t[nullValue_cols])
test_iterimp_G = lgbm_imtr.fit_transform(df_G_t[nullValue_cols])
test_iterimp_H = lgbm_imtr.fit_transform(df_H_t[nullValue_cols])
test_iterimp_I = lgbm_imtr.fit_transform(df_I_t[nullValue_cols])

  0%|          | 0/16 [00:00<?, ?it/s]



  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

In [8]:
none_na_cols = [col for col in train.columns if col not in nullValue_cols]
df_train = train[none_na_cols]
df_test = test[none_na_cols]

train_ = pd.concat([train_iterimp_A, train_iterimp_B,train_iterimp_C,train_iterimp_D,train_iterimp_E], axis=0)
train = pd.concat([df_train, train_], axis=1)

test_ = pd.concat([test_iterimp_F, test_iterimp_G,test_iterimp_H,test_iterimp_I], axis=0)
test = pd.concat([df_test, test_], axis=1)

In [9]:
print("Missing values in train dataset after pre-peocessing is: ", format(train.isna().sum().sum()))

Missing values in train dataset after pre-peocessing is:  0


In [10]:
train['attribute_2*3'] = train['attribute_2'] * train['attribute_3']
test['attribute_2*3'] = test['attribute_2'] * test['attribute_3']

In [11]:
meas_gr1_cols = [f"measurement_{i:d}" for i in list(range(3, 5)) + list(range(9, 17))]
train['meas_gr1_avg'] = np.mean(train[meas_gr1_cols], axis=1)
train['meas_gr1_std'] = np.std(train[meas_gr1_cols], axis=1)

test['meas_gr1_avg'] = np.mean(test[meas_gr1_cols], axis=1)
test['meas_gr1_std'] = np.std(test[meas_gr1_cols], axis=1) 

meas_gr2_cols = [f"measurement_{i:d}" for i in list(range(5, 9))]
train['meas_gr2_avg'] = np.mean(train[meas_gr2_cols], axis=1)
test['meas_gr2_avg'] = np.mean(test[meas_gr2_cols], axis=1)

In [12]:
train['meas17/meas_gr2_avg'] = train['measurement_17'] / train['meas_gr2_avg']
test['meas17/meas_gr2_avg'] = test['measurement_17'] / test['meas_gr2_avg']

In [13]:
from feature_engine.encoding import WoEEncoder, RareLabelEncoder
woe_encoder = WoEEncoder(variables=['attribute_0'])
woe_encoder.fit(train, target)
train_t = woe_encoder.transform(train)
test_t = woe_encoder.transform(test)

In [14]:
cols_to_use = ['attribute_0', 'measurement_0', 'measurement_1', 'measurement_2','m_3_missing', 'm_5_missing',
               'meas_gr1_avg', 'meas_gr1_std', 'attribute_2*3', 'loading', 'measurement_17', 'meas17/meas_gr2_avg']

In [15]:
train = train_t[cols_to_use]
test = test_t[cols_to_use]

In [16]:
train['m_5_missing'] = train['m_5_missing'].apply(lambda x: 0 if x==False else 1)
train['m_3_missing'] = train['m_3_missing'].apply(lambda x: 0 if x==False else 1)
test['m_5_missing'] = test['m_5_missing'].apply(lambda x: 0 if x==False else 1)
test['m_3_missing'] = test['m_3_missing'].apply(lambda x: 0 if x==False else 1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [17]:
scalerModel = StandardScaler().fit(train)
trainScaled = scalerModel.transform(train)
trainScaled = pd.DataFrame(trainScaled, columns=train.columns)
trainScaled['failure'] = target
trainScaled

Unnamed: 0,attribute_0,measurement_0,measurement_1,measurement_2,m_3_missing,m_5_missing,meas_gr1_avg,meas_gr1_std,attribute_2*3,loading,measurement_17,meas17/meas_gr2_avg,failure
0,0.496233,-0.101025,-0.055370,-0.681939,-0.120615,-0.161575,-0.937590,-1.818287,-0.273292,-1.228381,0.510988,0.435814,0
1,0.496233,1.599402,-1.246039,-0.984141,-0.120615,-0.161575,-0.198715,-1.089189,-0.273292,-1.105089,-0.158590,-0.250610,0
2,0.496233,1.113565,-1.722306,-0.379737,-0.120615,-0.161575,1.753041,-0.818522,-0.273292,-1.168408,-0.311052,-0.398966,0
3,0.496233,1.356484,-1.484172,-0.077535,-0.120615,-0.161575,-0.238142,-0.398015,-0.273292,-0.688625,1.018474,0.825942,0
4,0.496233,0.384811,-1.484172,0.526868,-0.120615,-0.161575,-0.060721,0.598962,-0.273292,1.550451,-0.992447,-0.840783,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26565,0.496233,-0.343944,1.849699,-0.681939,-0.120615,-0.161575,-0.525275,0.148032,0.621896,0.801176,0.225595,0.444453,0
26566,0.496233,0.627729,0.897164,0.526868,-0.120615,-0.161575,-1.874823,0.691949,0.621896,0.468364,1.244069,1.162477,0
26567,0.496233,-1.558534,0.420897,-1.588544,-0.120615,-0.161575,-1.976488,-0.337936,0.621896,-0.314115,0.398884,0.399680,0
26568,0.496233,-1.315616,0.182763,-0.681939,-0.120615,-0.161575,-0.628781,-0.891440,0.621896,-0.551948,0.233960,0.210397,0


In [18]:
scalerModel = StandardScaler().fit(test)
testScaled = scalerModel.transform(test)
testScaled = pd.DataFrame(testScaled, columns=test.columns)
testScaled

Unnamed: 0,attribute_0,measurement_0,measurement_1,measurement_2,m_3_missing,m_5_missing,meas_gr1_avg,meas_gr1_std,attribute_2*3,loading,measurement_17,meas17/meas_gr2_avg
0,-0.986469,-0.340067,0.008674,-0.033091,-0.15832,-0.15832,-0.283700,0.419450,-1.500282,-0.207270,-0.518728,-0.605249
1,-0.986469,0.829695,-0.222055,-1.597307,-0.15832,-0.15832,-0.481834,-1.253372,-1.500282,-0.362852,-1.272125,-1.272714
2,-0.986469,0.127838,0.700860,-0.554497,-0.15832,-0.15832,-0.532550,0.275554,-1.500282,-0.397512,-0.330461,-0.294049
3,-0.986469,0.127838,0.470131,1.009719,-0.15832,-0.15832,-1.617229,-0.690996,-1.500282,-0.383135,-0.829977,-0.802303
4,-0.986469,1.531552,1.623774,0.488314,-0.15832,-0.15832,-0.285125,-1.464837,-1.500282,2.063048,0.766329,0.780649
...,...,...,...,...,...,...,...,...,...,...,...,...
20770,1.013716,-1.743781,-1.144969,0.749016,-0.15832,-0.15832,0.697276,-0.253587,-0.203307,0.438935,-0.041140,-0.001311
20771,1.013716,-0.807972,-0.222055,0.227611,-0.15832,-0.15832,-1.187916,-0.603452,-0.203307,-1.363610,-0.683676,-0.711755
20772,1.013716,0.595742,0.470131,-1.075902,-0.15832,-0.15832,0.336321,1.168676,-0.203307,-1.538191,0.629702,0.458330
20773,1.013716,0.127838,1.623774,1.270422,-0.15832,-0.15832,-0.394115,0.330102,-0.203307,-0.038338,0.335223,0.165333


In [19]:
def getScore(model, yval, yvalPred):
    valScore = roc_auc_score(yval, yvalPred)
    print("Model -> {}, Validation Score -> {}".format(model, valScore))

In [20]:
def score(X, y, model, cv):
    scoring = ["roc_auc"]
    scores = cross_validate(
        model, X, y, scoring=scoring, cv=cv, return_train_score=True,
    )
    scores = pd.DataFrame(scores).T
    return scores.assign(
        mean = lambda x: x.mean(axis=1),
        std = lambda x: x.std(axis=1),
    )
def get_models():
    models = list()
    models.append(('lr', LogisticRegression(max_iter=500, C=0.0001, penalty='l2', solver='newton-cg')))
    models.append(('bayes', GaussianNB(var_smoothing=0.5, priors=[len(trainScaled.failure[trainScaled.failure == 0]) / len(trainScaled.failure), len(trainScaled.failure[trainScaled.failure == 1])/len(trainScaled.failure)])))
    return models
def evaluate_models(models, X_train, X_val, y_train, y_val):
    scores = list()
    for name, model in models:
        model.fit(X_train, y_train)
        yhat = model.predict(X_val)
        acc = accuracy_score(y_val, yhat)
        scores.append(acc)
    return scores
X_train_full, X_test, y_train_full, y_test = train_test_split(trainScaled.drop('failure', axis=1), trainScaled.failure, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)
models = get_models()
scores = evaluate_models(models, X_train, X_val, y_train, y_val)
print(scores)
ensemble = VotingClassifier(estimators=models, voting='soft', weights=scores)
ensemble.fit(X_train_full, y_train_full)

yhat = ensemble.predict(X_test)
score = accuracy_score(y_test, yhat)
print('Weighted Avg Accuracy: %.3f' % (score*100))
submission = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv")
sub = pd.DataFrame({'id': submission.id, 'failure': ensemble.predict_proba(testScaled)[:,1]})
sub.to_csv("submission.csv", index=False)

[0.7859830667920978, 0.7737535277516463]
Weighted Avg Accuracy: 79.733


In [21]:
# def inference(X_trainFold, y_trainFold, X_validFold, y_validFold, X_test, iterations):
#     pred_list = []
#     for i in range(iterations):
#         X_train = X_trainFold.sample(int(0.8 * len(X_trainFold)))
#         y_train = y_trainFold.loc[X_train.index]
        
#         model = LogisticRegression(C = 0.0001, penalty = 'l2', random_state=i, tol = 1e-2, max_iter = 1000)
#         model.fit(X_train, y_train)
#         #getScore('Logistic Regression', y_validFold, y_pred)
#         testPred = model.predict_proba(X_test)[:,1]

#         pred_list.append(testPred)    
#     pred_df = pd.DataFrame(pred_list).T
#     pred_df = pred_df.rank()
#     pred_df["mean"] = pred_df.mean(axis=1)    
#     return pred_df['mean']

In [22]:
predictProbs = pd.DataFrame()

In [23]:
# n_splits = 5
# skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
# for n, (train_index, val_index) in enumerate(skf.split(trainScaled, trainScaled.failure)):
#     trainScaled.loc[val_index, 'fold'] = int(n)
    
# for fold in range(n_splits):
#     print(f"===== fold{fold} =====")
#     XtrainFold = trainScaled[trainScaled['fold'] != fold].drop(['failure', 'fold'], axis=1)
#     XvalidFold = trainScaled[trainScaled['fold'] == fold].drop(['failure', 'fold'], axis=1)
    
#     ytrainFold = trainScaled[trainScaled['fold'] != fold]['failure']
#     yvalidFold = trainScaled[trainScaled['fold'] == fold]['failure']
    
#         # Logistic Regression
    
#     XtestPred = inference(XtrainFold, ytrainFold, XvalidFold, yvalidFold, testScaled, iterations = 500)
#     predictProbs[f'LR_{fold}'] = XtestPred
    

In [24]:
# predictProbs['mean'] = predictProbs.mean(axis=1)

In [25]:
# submission = pd.read_csv("/kaggle/input/tabular-playground-series-aug-2022/sample_submission.csv")
# submission.failure = predictProbs['mean']
# submission.to_csv("submission.csv", index=False)