In [35]:
import numpy as np
import pandas as pd
import joblib
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

In [36]:
# Read the test set from the working directory and report its size
# before any column is removed.
df_test = pd.read_csv("test.csv")
print(df_test.shape)

# Remove the 'id' column so it is not part of the frame used in later cells.
df_test = df_test.drop(columns='id')

(20775, 25)


In [37]:
# Numeric feature columns: every 'measurement_*' column plus 'loading'.
col_mea_load = [f for f in df_test.columns if f.startswith('measurement') or f == 'loading']

# Per product code, the columns used as predictors when regressing
# 'measurement_17' (described by the author as the columns highly
# correlated with 'measurement_17').
fill_dict = {
    'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
    'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
    'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
    'I': ['measurement_3','measurement_7','measurement_8', 'measurement_9']
}

for code in df_test.product_code.unique():
    # Raises KeyError for a product code that has no entry in fill_dict.
    cur_column = fill_dict[code]
    is_code = df_test.product_code == code

    # Training rows: this product code, with every predictor and
    # 'measurement_17' present.
    cur_train = df_test.loc[is_code, cur_column + ['measurement_17']].dropna(how='any')

    # Rows to fill by regression: this product code, all predictors present,
    # 'measurement_17' missing. The mask is built once and reused for both the
    # predictor selection and the assignment so the two always line up.
    to_predict = (
        is_code
        & df_test[cur_column].notnull().all(axis=1)
        & df_test['measurement_17'].isnull()
    )

    # scikit-learn estimators reject 0-sample input, so skip the regression
    # when there is nothing to fit on or nothing to predict; the KNN step
    # below still fills whatever is left.
    if to_predict.any() and not cur_train.empty:
        mode_HR = HuberRegressor()
        mode_HR.fit(cur_train[cur_column], cur_train['measurement_17'])
        df_test.loc[to_predict, 'measurement_17'] = mode_HR.predict(df_test.loc[to_predict, cur_column])

    # Fill every remaining missing value in the numeric feature columns with
    # a KNNImputer (5 neighbours) fitted on this product code's rows only.
    impute_number = KNNImputer(n_neighbors=5)
    df_test.loc[is_code, col_mea_load] = impute_number.fit_transform(df_test.loc[is_code, col_mea_load])

In [38]:
# LabelEncoder: convert object-dtype columns into integer codes.
# NOTE(review): the encoder is re-fitted on the test data for each column;
# confirm the resulting codes match the encoding used when the model was trained.
label_encode = LabelEncoder()

col_obj = df_test.select_dtypes(object).columns

# Encode each object column (from the untouched df_test), then swap all of
# them into a copy of the frame in one step.
encoded = {}
for col in col_obj:
    encoded[col] = label_encode.fit_transform(df_test[col])

df_test_cpy = df_test.assign(**encoded)

df_test = df_test_cpy

In [39]:
# Load the persisted model object from the file 'my_model' (path relative to
# the working directory).
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code
# while loading — only load files from a trusted source.
load_model = joblib.load('my_model')

In [40]:
# Predict probabilities for every row of the preprocessed test frame.
# Column index 1 of the result is what gets written out as 'failure' later.
# NOTE(review): assumes df_test has the same columns, in the same order, as
# the data the model was fitted on — confirm against the training notebook.
y_pred = load_model.predict_proba(df_test)

In [41]:
# Start from the sample submission and overwrite 'failure' with the second
# column of the predicted probabilities.
# NOTE(review): rows are matched by position, not by id — assumes
# sample_submission.csv is in the same row order as test.csv.
df_sub = pd.read_csv("sample_submission.csv").assign(failure=y_pred[:, 1])

# Write the submission file without the index column, then display the frame.
df_sub.to_csv("submission.csv", index=False)
df_sub

Unnamed: 0,id,failure
0,26570,0.192887
1,26571,0.175995
2,26572,0.185518
3,26573,0.181445
4,26574,0.337172
...,...,...
20770,47340,0.230777
20771,47341,0.144326
20772,47342,0.150269
20773,47343,0.210797
