# Import

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import roc_auc_score, accuracy_score


from sklearn.linear_model import HuberRegressor
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler, PowerTransformer

import pickle

from imblearn.over_sampling import SMOTE
import imblearn

import matplotlib.pylab as plt

# model 
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge, \
    TweedieRegressor,  SGDRegressor, RANSACRegressor, TheilSenRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.neighbors import KNeighborsRegressor, \
    RadiusNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor

# Load data

In [2]:
# Read both CSVs, then stack them (train rows first) so the preprocessing
# below is applied to the two sets together.
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))
data = pd.concat([train, test])

# Data analysis

> 資料分布不均勻

In [3]:
# Count the rows labelled failure (1) and not failure (0).
failure_labels = data['failure']
failure_count = (failure_labels == 1).sum()
not_failure_count = (failure_labels == 0).sum()
print(f'failure: {failure_count}, notFailureCount: {not_failure_count}')

failure: 5649, notFailureCount: 20921


> 資料中有NaN

In [4]:
# Count the missing (NaN) values in each column.
data.isna().sum()

id                    0
product_code          0
loading             473
attribute_0           0
attribute_1           0
attribute_2           0
attribute_3           0
measurement_0         0
measurement_1         0
measurement_2         0
measurement_3       710
measurement_4       947
measurement_5      1184
measurement_6      1420
measurement_7      1657
measurement_8      1894
measurement_9      2131
measurement_10     2367
measurement_11     2604
measurement_12     2841
measurement_13     3077
measurement_14     3314
measurement_15     3551
measurement_16     3788
measurement_17     4024
failure           20775
dtype: int64

# Data Preprocessing

### Remove the data that is not going to be used

In [5]:
# Drop the attribute_* columns: product_code alone is used to tell the
# different products apart.
attribute_columns = [col for col in data.columns if col.startswith('attribute')]
data = data.drop(columns=attribute_columns)
data.columns

Index(['id', 'product_code', 'loading', 'measurement_0', 'measurement_1',
       'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5',
       'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9',
       'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13',
       'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17',
       'failure'],
      dtype='object')

### Fill NaN using the most correlated column; where that is not possible, use KNN instead.

> Find correlation coefficients

In [6]:
# Measurement columns that may need imputing. measurement_0..2 contain no
# nulls, so they are skipped.
candidate_columns = [col for col in data.columns
                     if col.startswith('measurement_') and
                     int(col.split('measurement_')[1]) > 2]

# How many of the most correlated columns to keep for each target column.
n_correlated = 1

# product_code -> {target column -> list of its most correlated columns}
most_correlated_columns = {}

for product_code in data.product_code.unique():
    productData = data[data.product_code == product_code]

    # The correlation matrix depends only on the product, so build it once
    # per product instead of once per target column.
    product_corr = productData[candidate_columns].corr()

    product_correlation = {}
    for col in candidate_columns:
        # Drop the column's correlation with itself by label rather than
        # assuming it sorts first (that assumption fails when the
        # self-correlation is NaN or tied with another column).
        # NOTE(review): ranking uses the signed correlation, so a strongly
        # negative correlation ranks last - confirm this is intended.
        correlations = product_corr[col].drop(col) \
                                        .sort_values(ascending=False)

        product_correlation[col] = \
            correlations.head(n_correlated).index.tolist()
    most_correlated_columns[product_code] = product_correlation

In [7]:
# Number of product codes for which a correlation map was built.
len(most_correlated_columns)

9

> Fill Null

In [8]:
# Columns handed to the KNN imputer: loading plus every measurement column.
reference_columns = [
    col for col in data.columns
    if col == 'loading' or col.startswith('measurement')
]
# Reference columns that contain at least one null and so need filling.
null_columns = [col for col in reference_columns if data[col].isna().any()]
print(reference_columns)
print(null_columns)

['loading', 'measurement_0', 'measurement_1', 'measurement_2', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']
['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_10', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'measurement_15', 'measurement_16', 'measurement_17']


In [9]:
for product_code in data.product_code.unique():
    # product_code is never modified below, so the row mask for this product
    # can be computed once.
    product_mask = data.product_code == product_code

    # Step 1: fill nulls by regressing each column on its most correlated
    # column(s).
    for current_column, correlated_columns in \
        most_correlated_columns[product_code].items():
        # Re-slice on every iteration so values filled for earlier columns
        # are visible when they serve as predictors here.
        data_part = data.loc[product_mask,
                             correlated_columns + [current_column, 'id']]

        # Rows to learn from: target and all predictors present.
        complete_data_part = data_part.dropna(how='any')
        # Rows to fill: target is null but every predictor is present.
        na_data_part = data_part[data_part[current_column].isnull() &
                                 data_part[correlated_columns]
                                    .notnull().all(axis=1)]

        # Nothing to fill, or nothing to fit on: fitting/predicting on a
        # 0-row frame raises, so leave this column to the KNN step below.
        if na_data_part.empty or complete_data_part.empty:
            continue

        # Use HuberRegressor to predict the missing values.
        predictor = HuberRegressor(epsilon=2)
        predictor.fit(complete_data_part[correlated_columns],
                      complete_data_part[current_column])

        # Rows are matched back by id; the assignment relies on `data` and
        # `na_data_part` listing those rows in the same order.
        data.loc[data.id.isin(na_data_part.id), current_column] = \
            predictor.predict(na_data_part[correlated_columns])

    # Step 2: whatever is still null in the reference columns for this
    # product is filled with KNN imputation.
    na_data_index = (product_mask, reference_columns)
    knn = KNNImputer(n_neighbors=3)
    data.loc[na_data_index] = knn.fit_transform(data.loc[na_data_index])

In [10]:
# After imputation no feature column should contain nulls. `failure` is not
# among the imputed columns, so any nulls it had are still reported here.
data.isnull().sum()

id                    0
product_code          0
loading               0
measurement_0         0
measurement_1         0
measurement_2         0
measurement_3         0
measurement_4         0
measurement_5         0
measurement_6         0
measurement_7         0
measurement_8         0
measurement_9         0
measurement_10        0
measurement_11        0
measurement_12        0
measurement_13        0
measurement_14        0
measurement_15        0
measurement_16        0
measurement_17        0
failure           20775
dtype: int64

# Prepare trainX and trainY

In [11]:
# Undo the concat: the first len(train) rows are the training rows and the
# remainder are the test rows.
n_train_rows = train.shape[0]
df_train = data.iloc[:n_train_rows]
df_test = data.iloc[n_train_rows:]

In [12]:
# Measurement indices ordered by importance to the regressor (most important
# first); 'loading' ranks ahead of all of them.
measurement_rank = (17, 1, 2, 11, 13, 4, 7, 3, 5, 0, 9, 12, 6, 10, 16, 8, 14, 15)
features = ['loading'] + [f'measurement_{idx}' for idx in measurement_rank]
# Train on the four most important features only.
features = features[:4]

In [13]:
# Select the chosen feature columns from each split, and the `failure`
# column as the training target.
train_x, test_x = df_train[features], df_test[features]
train_y = df_train['failure']

## Training

In [14]:
# Model that is fitted in the training cell (fixed seed).
clf = LinearSVR(random_state=0)

# Oversampler used to deal with the unbalanced input (fixed seed).
balancer = SMOTE(random_state=42)

In [15]:
# NOTE: this cell rebinds train_x / train_y / test_x, so re-running it
# without re-running the cell that builds them scales and resamples twice.
print(f"start training...")

# Standardize: fit the scaler on the training features only, then apply the
# same transform to both splits.
scaler = StandardScaler()
scaler.fit(train_x)
train_x, test_x = scaler.transform(train_x), scaler.transform(test_x)

# Rebalance the training set with the configured oversampler.
train_x, train_y = balancer.fit_resample(train_x, train_y)

# Fit the model on the scaled, rebalanced training set.
clf.fit(train_x, train_y)
print(f"finish!")

start training...
finish!


> Save model

In [16]:
# Persist the fitted model to model.pkl as a pickle byte stream.
with open('model.pkl', 'wb') as model_file:
    serialized_model = pickle.dumps(clf)
    model_file.write(serialized_model)