### Import

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.model_selection import StratifiedKFold, GroupKFold, train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from scipy.stats import rankdata
import itertools
import pickle

### Load Model

In [36]:
# Load the pre-trained models. The file is read as consecutive pickle records,
# grouped into lists of `num_model` models; `num_result` such groups are expected.
# NOTE(security): pickle.load can execute arbitrary code while deserialising --
# only open a models.pckl that comes from a trusted source.
models = []
num_result, num_model = 3, 5
leftover = 0  # models read for a group that hit end-of-file before it was complete
with open("models.pckl", "rb") as f:
    while True:
        group = []
        try:
            for _ in range(num_model):
                group.append(pickle.load(f))
        except EOFError:
            # End of file: remember a partial group so it is reported, not dropped.
            leftover = len(group)
            break
        models.append(group)
assert leftover == 0, (
    f"models.pckl ends with an incomplete group ({leftover} of {num_model} models)"
)
assert len(models) == num_result, (
    f"expected {num_result} model groups in models.pckl, found {len(models)}"
)

### Load Data

In [37]:
# Read the train set, the test set and the sample submission from the
# working directory, then report the train/test shapes.
train_df, test_df, submission_df = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)
print(f'train {train_df.shape}, test {test_df.shape}')

train (26570, 26), test (20775, 25)


### Data Type

In [38]:
# Column names of the training frame: all of them, then grouped by dtype
# (float, int and object columns respectively).
total_feature = train_df.columns
featType_float, featType_int, featType_object = (
    train_df.select_dtypes(include=kind).columns
    for kind in (float, int, object)
)

### Encode: attribute0 & 1

In [39]:
# Integer-encode every object column except product_code. Each column is
# fitted on train and test stacked together so both frames share one mapping.
label_encoder = LabelEncoder()
train_size = train_df.shape[0]
train_df_encode, test_df_encode = train_df.copy(), test_df.copy()

encode_object = featType_object.drop(['product_code'])
for col in encode_object:
    stacked = pd.concat([train_df[col], test_df[col]])
    codes = label_encoder.fit_transform(stacked)
    # the first `train_size` codes belong to train, the remainder to test
    train_df_encode[col], test_df_encode[col] = codes[:train_size], codes[train_size:]

# from here on the notebook works with the encoded frames
train_df = train_df_encode
test_df = test_df_encode

### Pre-process Data

In [40]:
# Stack train and test so the engineered columns are built once for both.
data_t = pd.concat([train_df, test_df])

# from kaggle discussion -> m3_missing & m5_missing are important features:
# 0/1 flags marking rows where the source measurement is absent
for source_col, flag_col in (('measurement_3', 'm3_missing'),
                             ('measurement_5', 'm5_missing')):
    data_t[flag_col] = data_t[source_col].isna().astype(np.int8)

# from kaggle discussion -> suggest to log(loading); log1p computes log(1 + x)
data_t['loading'] = np.log1p(data_t['loading'])

### Predict measurement_17 & fill null column

In [41]:
# For every product code, choose the float features most correlated with
# measurement_17: at most `at_mots` of them, and only those whose absolute
# correlation exceeds 0.1. The result maps product code -> list of feature names.

at_mots = 4 #(previous: 5 -> worse)
code_measure = {}
for code in data_t.product_code.unique():
    # Rows of this product code; filtered once instead of once per column.
    data_code = data_t[data_t.product_code == code]
    target = data_code['measurement_17']

    correlation = {}
    for col in featType_float:
        if col == 'measurement_17':
            # Exclude the target by name rather than assuming its
            # self-correlation always sorts into first place.
            continue
        value = np.abs(data_code[col].corr(target))
        if np.isnan(value):
            # Undefined correlation: a NaN key would make the sort order
            # unpredictable, and it could never pass the 0.1 threshold anyway.
            continue
        correlation[col] = value

    # strongest correlation first; the sort is stable, so ties keep column order
    sort_corr = sorted(correlation.items(), key=lambda item: item[1], reverse=True)
    code_measure[code] = [feat for feat, val in sort_corr[:at_mots] if val > 0.1]

In [42]:
# Fill missing values separately for each product code:
#   1. regress measurement_17 on its selected features (HuberRegressor) for
#      rows where only measurement_17 is missing,
#   2. KNN-impute every gap that is still left in the feature columns.
data = data_t.copy()
feature = [ f for f in data.columns if f=='loading' or f.startswith('measurement') ]

for code in data.product_code.unique():
    in_code = data.product_code == code
    measurement = code_measure[code]

    # training rows: drop the row if any predictor or the target is null
    tmp_train = data.loc[in_code, measurement + ['measurement_17']].dropna(how='any')

    # rows to fill: every predictor present, only measurement_17 is null
    index = in_code & data[measurement].notnull().all(axis=1) & data['measurement_17'].isnull()

    # Skip the regression when it cannot run: the estimator needs at least one
    # predictor column and one training row, and predict() needs at least one
    # row to score. Such rows are then handled by the KNN step below.
    if measurement and len(tmp_train) > 0 and index.any():
        # use HuberRegressor to predict the column 'measurement_17'
        hr_model = HuberRegressor()
        hr_model.fit(tmp_train[measurement], tmp_train['measurement_17'])
        data.loc[index, 'measurement_17'] = hr_model.predict(data.loc[index, measurement])

    # fill the rows of this product code that still have any null value
    knn_model = KNNImputer(n_neighbors=5)
    data.loc[in_code, feature] = knn_model.fit_transform(data.loc[in_code, feature])

In [43]:
# Check that no null value remains in any feature column. The nulls are counted
# directly: a per-row null count is itself never null, so testing that count
# with isnull() would pass even when values are still missing.
assert data[feature].isnull().sum().sum() == 0

### Split Data

In [44]:
def scaler_model(train_data, val_data, test_data, feature):
    """Standardise the `feature` columns of three frames with one scaler.

    A StandardScaler is fitted on `train_data[feature]` only and then applied
    to the train, validation and test frames.

    Parameters:
        train_data: frame the scaler is fitted on.
        val_data: validation frame, transformed with the fitted scaler.
        test_data: test frame, transformed with the fitted scaler.
        feature: list of column names to standardise.

    Returns:
        (train, val, test) copies of the inputs whose `feature` columns hold
        the scaled values; all other columns are unchanged and the input
        frames are not modified.
    """
    scaler = StandardScaler().fit(train_data[feature])

    def _standardised(frame):
        # work on a copy so the caller's frame keeps its original values
        scaled = frame.copy()
        scaled[feature] = scaler.transform(frame[feature])
        return scaled

    return _standardised(train_data), _standardised(val_data), _standardised(test_data)

In [45]:
# Undo the earlier train/test concatenation by position: the first
# train_df.shape[0] rows of `data` are the train rows, the rest are test rows.
n_train_rows = train_df.shape[0]
train_data, test_data = data.iloc[:n_train_rows], data.iloc[n_train_rows:]

assert train_data.shape[0] + test_data.shape[0] == data.shape[0]
print(f'train data: {train_data.shape}')
print(f'test data: {test_data.shape}')

train data: (26570, 28)
test data: (20775, 28)


In [46]:
# Separate the predictors from the integer 'failure' target; the test frame
# keeps the same predictor columns.
Xt = train_data.drop(columns=['failure'])
Yt = train_data['failure'].astype(int)
test = test_data.drop(columns=['failure'])

### Predict model

In [47]:
# Feature subsets, one per saved model group (paired positionally with `models`).
_core = ['loading', 'measurement_17']
_missing_flags = ['m3_missing', 'm5_missing']
round_feature = [
    _core + _missing_flags + ['measurement_1', 'measurement_2'],
    _core + ['measurement_1', 'measurement_2'],
    _core + _missing_flags + ['measurement_2'],
]

In [48]:
# Score the test set with every saved model group. For group `idx` the
# positive-class probabilities of its fold models are averaged and stored in
# submission_df['lr{idx}'].
n_splits = 5  # number of CV folds; each model group must hold one model per fold
for idx, (feature_set, model_list) in enumerate(zip(round_feature, models)):
    # `feature_set` (not `feature`) so the notebook-level `feature` list used by
    # the null check is not overwritten by this loop.
    if len(model_list) != n_splits:
        # zip() below would silently drop folds and skew the average
        raise ValueError(
            f'model group {idx} holds {len(model_list)} models, expected {n_splits}'
        )
    print(f'#{idx+1}: features{feature_set}')

    lr_test = np.zeros(len(test))
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)
    for (train_idx, val_idx), model in zip(kf.split(Xt, Yt), model_list):
        # Re-fit the scaler on this fold's training rows and apply it to the
        # test set (presumably mirroring how each fold model was trained --
        # confirm against the training notebook). scaler_model returns copies,
        # so `test` itself stays unscaled for the next fold.
        _, _, x_test = scaler_model(Xt.iloc[train_idx], Xt.iloc[val_idx], test, feature_set)

        lr_test += model.predict_proba(x_test[feature_set])[:, 1] / n_splits
    submission_df[f'lr{idx}'] = lr_test

#1: features['loading', 'measurement_17', 'm3_missing', 'm5_missing', 'measurement_1', 'measurement_2']
#2: features['loading', 'measurement_17', 'measurement_1', 'measurement_2']
#3: features['loading', 'measurement_17', 'm3_missing', 'm5_missing', 'measurement_2']


In [49]:
# Preview the first five rows, including the per-group lr{idx} columns.
submission_df.head(5)

Unnamed: 0,id,failure,lr0,lr1,lr2
0,26570,0.0,0.212382,0.212388,0.212396
1,26571,0.0,0.211852,0.211857,0.211847
2,26572,0.0,0.21211,0.212116,0.212179
3,26573,0.0,0.212225,0.212231,0.212276
4,26574,0.0,0.214334,0.21434,0.214478


In [53]:
# Rank-blend the three model groups: turn each probability column into ranks,
# then combine the ranks with fixed weights into the final 'failure' score.
blend_plan = (
    ('lr0', 'rank0', 0.35),
    ('lr1', 'rank1', 0.35),
    ('lr2', 'rank2', 0.3),
)
for prob_col, rank_col, _ in blend_plan:
    submission_df[rank_col] = rankdata(submission_df[prob_col])

# weighted sum, accumulated in the order rank0, rank1, rank2
submission_df['failure'] = sum(
    submission_df[rank_col] * weight for _, rank_col, weight in blend_plan
)
submission_df.head()

Unnamed: 0,id,failure,lr0,lr1,lr2,rank0,rank1,rank2
0,26570,8708.25,0.212382,0.212388,0.212396,8717.0,8716.0,8689.0
1,26571,4665.5,0.211852,0.211857,0.211847,4718.0,4682.0,4585.0
2,26572,6662.0,0.21211,0.212116,0.212179,6531.0,6511.0,6991.0
3,26573,7515.05,0.212225,0.212231,0.212276,7435.0,7422.0,7717.0
4,26574,20102.85,0.214334,0.21434,0.214478,20021.0,20086.0,20218.0


In [51]:
# Write only the id and the blended failure score, without the index column.
submission_df.loc[:, ['id', 'failure']].to_csv('109550006_sub.csv', index=False)