### Import

In [107]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.stats import rankdata
import itertools
import pickle

### Load Data

In [108]:
# Read the three competition CSV files from the working directory in one pass.
train_df, test_df, submission_df = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print(f'train {train_df.shape}, test {test_df.shape}')

train (26570, 26), test (20775, 25)


### Data Type

In [109]:
# Group the training columns by dtype family (float / int / object).
total_feature = train_df.columns
featType_float, featType_int, featType_object = (
    train_df.select_dtypes(dtype_family).columns
    for dtype_family in (float, int, object)
)

# Show each group, in the same float -> int -> object order.
for column_group in (featType_float, featType_int, featType_object):
    print(column_group)

Index(['loading', 'measurement_3', 'measurement_4', 'measurement_5',
       'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9',
       'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13',
       'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17'],
      dtype='object')
Index(['id', 'attribute_2', 'attribute_3', 'measurement_0', 'measurement_1',
       'measurement_2', 'failure'],
      dtype='object')
Index(['product_code', 'attribute_0', 'attribute_1'], dtype='object')


### Encode: attribute_0 & attribute_1

In [110]:
# Integer-encode every object column except product_code.
label_encoder = LabelEncoder()
train_size = train_df.shape[0]
encode_object = featType_object.drop(['product_code'])

train_df_encode, test_df_encode = train_df.copy(), test_df.copy()
for column in encode_object:
    # Fit on train + test stacked together so both share one code mapping,
    # then split the encoded values back at the train/test boundary.
    stacked_codes = label_encoder.fit_transform(
        pd.concat([train_df[column], test_df[column]])
    )
    train_df_encode[column] = stacked_codes[:train_size]
    test_df_encode[column] = stacked_codes[train_size:]

# From here on train_df / test_df refer to the encoded copies.
train_df, test_df = train_df_encode, test_df_encode

### pre-process data

In [111]:
# Stack train and test (index labels are kept as-is, so they repeat) and derive:
#   - m3_missing / m5_missing: int8 flags marking a null measurement_3 / measurement_5
#     (from kaggle discussion -> these are important features)
#   - loading: replaced by log1p(loading), i.e. log(1 + loading)
#     (from kaggle discussion -> suggest to log(loading))
data_t = pd.concat([train_df, test_df]).assign(
    m3_missing=lambda frame: frame['measurement_3'].isnull().astype(np.int8),
    m5_missing=lambda frame: frame['measurement_5'].isnull().astype(np.int8),
    loading=lambda frame: np.log1p(frame['loading']),
)

### correlation: failure

In [112]:
# Absolute correlation of every candidate column with `failure`,
# computed on the training rows only (the first len(train_df) rows of data_t).
data_feat = data_t.drop(['id', 'product_code', 'failure'], axis=1).columns
train_rows = data_t[:len(train_df)]
corr_failure = {
    col: np.abs(train_rows[col].corr(train_df['failure']))
    for col in list(data_feat)
}

# Rank columns from strongest to weakest absolute correlation.
# np.array over (name, value) pairs yields an array of strings.
sort_corr_failure = np.array(
    sorted(corr_failure.items(), key=lambda pair: pair[1], reverse=True)
)
corr_fail_feature = list(sort_corr_failure[:, 0])
sort_corr_failure


array([['loading', '0.1272022939661035'],
       ['measurement_17', '0.033905002567120944'],
       ['attribute_3', '0.01922213433101689'],
       ['measurement_5', '0.018078821477476657'],
       ['measurement_8', '0.01711945861678688'],
       ['measurement_7', '0.01678667610884866'],
       ['m5_missing', '0.01651857999116315'],
       ['measurement_2', '0.015807565750564224'],
       ['m3_missing', '0.015477595735098788'],
       ['attribute_0', '0.014829601287774169'],
       ['measurement_6', '0.014791019365985439'],
       ['attribute_1', '0.011999161647528154'],
       ['measurement_1', '0.01080998860865806'],
       ['measurement_4', '0.010488110572352235'],
       ['measurement_0', '0.009645933201494657'],
       ['attribute_2', '0.006336975021853562'],
       ['measurement_14', '0.0062108925877654595'],
       ['measurement_11', '0.0048014263775955105'],
       ['measurement_12', '0.004398103485910207'],
       ['measurement_9', '0.0035874245134301787'],
       ['measurement

### Predict measurement_17 & fill null values

In [113]:
# For each product code, pick the float columns most correlated (in absolute
# value) with measurement_17; they are used later as predictors for it.

at_mots = 4 #(previous: 5 -> worse)
code_measure = {}
for code in data_t.product_code.unique():
    # Rows of this product code; computed once per code instead of once per column.
    data_code = data_t[data_t.product_code == code]
    target = data_code['measurement_17']

    correlation = {}
    for col in featType_float:
        # Leave the target out explicitly instead of relying on its
        # self-correlation sorting into first place and being sliced away.
        if col == 'measurement_17':
            continue
        value = np.abs(data_code[col].corr(target))
        # An undefined (NaN) correlation cannot be ranked and would make the
        # sort order unreliable, so such columns are skipped.
        if not np.isnan(value):
            correlation[col] = value

    # sort the correlation (strongest first) and keep at most `at_mots` columns
    sort_corr = sorted(correlation.items(), key=lambda item: item[1], reverse=True)
    top_f = sort_corr[:at_mots]
    # only columns whose absolute correlation exceeds 0.1 are kept
    corr_feature = [feat for feat, val in top_f if val > 0.1]
    code_measure[code] = corr_feature
    print(f'code {code}:\n{corr_feature}')

code A:
['measurement_8', 'measurement_5', 'measurement_6', 'measurement_7']
code B:
['measurement_7', 'measurement_4', 'measurement_5', 'measurement_9']
code C:
['measurement_8', 'measurement_5', 'measurement_7', 'measurement_9']
code D:
['measurement_6', 'measurement_5', 'measurement_8', 'measurement_7']
code E:
['measurement_6', 'measurement_8', 'measurement_5', 'measurement_4']
code F:
['measurement_6', 'measurement_4', 'measurement_7', 'measurement_5']
code G:
['measurement_6', 'measurement_4', 'measurement_8', 'measurement_9']
code H:
['measurement_5', 'measurement_9', 'measurement_4', 'measurement_8']
code I:
['measurement_8', 'measurement_3', 'measurement_7', 'measurement_9']


In [114]:
# Impute missing values per product code, working on a copy of data_t:
#   1. predict missing measurement_17 with a HuberRegressor where every
#      selected predictor is present,
#   2. fill whatever is still missing in the feature columns with a KNNImputer.
data = data_t.copy()
feature = [f for f in data.columns if f == 'loading' or f.startswith('measurement')]

for code in data.product_code.unique():
    # Boolean masks over the whole frame are used for every selection and
    # assignment below; data_t was concatenated without resetting the index,
    # so index labels are not unique and label-based selection is avoided.
    in_code = data.product_code == code
    code_rows = data[in_code]
    measurement = code_measure[code]

    # training rows: predictors and target all present
    tmp_train = code_rows[measurement + ['measurement_17']].dropna(how='any')

    # rows to predict: every predictor present, only measurement_17 missing
    index = in_code & data[measurement].notnull().all(axis=1) & data['measurement_17'].isnull()
    tmp_test = data[index]

    print(f"code {code} : {len(tmp_test)} null values of measurement_17")

    # use HuberRegressor to predict the column 'measurement_17'
    hr_model = HuberRegressor()
    hr_model.fit(tmp_train[measurement], tmp_train['measurement_17'])

    # write the predictions into the rows where 'measurement_17' is null
    hr_pred = hr_model.predict(tmp_test[measurement])
    data.loc[index, 'measurement_17'] = hr_pred

    # fill every remaining null value in this code's feature columns
    knn_model = KNNImputer(n_neighbors=5)
    print(f"-- KNN imputing code {code}...")
    knn_pred = knn_model.fit_transform(data.loc[in_code, feature])
    data.loc[in_code, feature] = knn_pred

code A : 374 null values of measurement_17
-- KNN imputing code A...
code B : 397 null values of measurement_17
-- KNN imputing code B...
code C : 391 null values of measurement_17
-- KNN imputing code C...
code D : 398 null values of measurement_17
-- KNN imputing code D...
code E : 429 null values of measurement_17
-- KNN imputing code E...
code F : 420 null values of measurement_17
-- KNN imputing code F...
code G : 373 null values of measurement_17
-- KNN imputing code G...
code H : 378 null values of measurement_17
-- KNN imputing code H...
code I : 358 null values of measurement_17
-- KNN imputing code I...


In [115]:
# Check that no feature column still contains a null value after imputation.
# Nulls are counted per column and then totalled. The previous form summed the
# per-row null counts and then called isnull() on those counts, which is never
# null, so that assertion could not fail.
n_missing = int(data[feature].isnull().sum().sum())
assert n_missing == 0, f'{n_missing} missing values remain in the feature columns'

### Split Data

In [116]:
def scaler_model(train_data, val_data, test_data, feature):
    """Standardize the `feature` columns of three frames with one shared scaler.

    The StandardScaler is fitted on `train_data[feature]` only and then applied
    to the training, validation and test frames alike.

    Args:
        train_data: frame the scaler is fitted on.
        val_data: validation frame, transformed with the fitted scaler.
        test_data: test frame, transformed with the fitted scaler.
        feature: list of column names to scale.

    Returns:
        A tuple (train, val, test) of copies of the inputs in which the
        `feature` columns hold the scaled values; the inputs are not modified.
    """
    scaler = StandardScaler().fit(train_data[feature])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        scaled = frame.copy()  # keep the caller's frame untouched
        scaled[feature] = scaler.transform(frame[feature])
        scaled_frames.append(scaled)

    return tuple(scaled_frames)

In [117]:
# Split the imputed frame back into its train / test parts by position:
# the first train_df.shape[0] rows are the training rows.
n_train = train_df.shape[0]
train_data, test_data = data.iloc[:n_train], data.iloc[n_train:]

assert len(train_data) + len(test_data) == len(data)
for part_name, part in (('train', train_data), ('test', test_data)):
    print(f'{part_name} data: {part.shape}')

train data: (26570, 28)
test data: (20775, 28)


In [118]:
# Separate the target from the predictors; the test part only drops the
# `failure` column.
Yt = train_data['failure'].astype(int)
Xt = train_data.drop(columns=['failure'])
test = test_data.drop(columns=['failure'])

### Train model

In [119]:
# Feature subsets to train on: one round of cross-validated models per subset.
round_feature = [
    [
        'loading', 'measurement_17',
        'm3_missing', 'm5_missing',
        'measurement_1', 'measurement_2',
    ],
    [
        'loading', 'measurement_17',
        'measurement_1', 'measurement_2',
    ],
    [
        'loading', 'measurement_17',
        'm3_missing', 'm5_missing',
        'measurement_2',
    ],
]

In [120]:
# Train one round of cross-validated logistic regressions per feature subset.
# For every subset the out-of-fold AUC / accuracy are averaged over the folds,
# the test-set probabilities are averaged over the fold models and stored in
# submission_df as column lr<idx>, and the fitted fold models are kept in `models`.
models = []
# NOTE(review): C_tmp is not referenced anywhere in this cell; presumably a list
# of candidate C values -- confirm whether it is still needed.
C_tmp = [0.0001, 0.00005, 0.00001, 0.000005, 0.000001]
# Single source of truth for the fold count: it sets the splitter and every
# per-fold averaging weight below.
n_splits = 5

# The loop variable is named feat_cols (not `feature`) so it does not overwrite
# the notebook-level `feature` list used by the imputation and null-check cells.
for idx, feat_cols in enumerate(round_feature):
    lr_test = np.zeros(len(test))
    lr_auc, lr_acc = 0, 0
    tmp_model = []
    print(f'#{idx}: features{feat_cols}')

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for fold_idx, (train_idx, val_idx) in enumerate(kf.split(Xt, Yt)):
        x_train, x_val = Xt.iloc[train_idx], Xt.iloc[val_idx]
        y_train, y_val = Yt.iloc[train_idx], Yt.iloc[val_idx]

        # scaler_model fits the scaler on the training fold only and returns
        # scaled copies, so `test` can be passed directly without copying it first.
        x_train, x_val, x_test = scaler_model(x_train, x_val, test, feat_cols)

        lr_model = LogisticRegression(max_iter=1000,
                                      C=0.000005,#0.000005,
                                      penalty='l2',
                                      solver='newton-cg')
        lr_model.fit(x_train[feat_cols], y_train)

        # accumulate fold averages of the validation metrics
        val_preds = lr_model.predict_proba(x_val[feat_cols])[:, 1]
        lr_auc += roc_auc_score(y_val, val_preds) / n_splits
        y_preds = lr_model.predict(x_val[feat_cols])
        lr_acc += accuracy_score(y_val, y_preds) / n_splits

        # average the positive-class probability on the test set over the folds
        lr_test += lr_model.predict_proba(x_test[feat_cols])[:, 1] / n_splits
        tmp_model.append(lr_model)
    models.append(tmp_model)
    print(f"\tAverage auc = {np.round(lr_auc, 5)}, Average acc = {np.round(lr_acc, 5)}")
    submission_df[f'lr{idx}'] = lr_test


#0: features['loading', 'measurement_17', 'm3_missing', 'm5_missing', 'measurement_1', 'measurement_2']
	Average auc = 0.59119, Average acc = 0.78739
#1: features['loading', 'measurement_17', 'measurement_1', 'measurement_2']
	Average auc = 0.59036, Average acc = 0.78739
#2: features['loading', 'measurement_17', 'm3_missing', 'm5_missing', 'measurement_2']
	Average auc = 0.59097, Average acc = 0.78739


In [121]:
# Show each round's fold models, then persist every fitted model.
# Each model is written as its own consecutive pickle record (round by round,
# fold by fold), so a reader has to call pickle.load once per model.
for list_model in models:
    print(list_model)

with open("models.pckl", "wb") as f:
    for model in itertools.chain.from_iterable(models):
        pickle.dump(model, f)

[LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg')]
[LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg')]
[LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg'), LogisticRegression(C=5e-06, max_iter=1000, solver='newton-cg')]


In [122]:
# submission_df.head()

In [123]:
# submission_df['rank0'] = rankdata(submission_df['lr0'])
# submission_df['rank1'] = rankdata(submission_df['lr1'])
# submission_df['rank2'] = rankdata(submission_df['lr2'])

In [124]:
#submission_df['failure'] = submission_df['rank0']*(0.3) + \
#                          submission_df['rank1']*0.3 + \
#                          submission_df['rank2']*0.4

In [125]:
#submission_df.head()

In [126]:
#submission_df[['id', 'failure']].to_csv('submission.csv', index=False)