## Importing Libraries

In [2]:
# %pip install feature_engine

In [3]:
# Use these imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, HuberRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
# from feature_engine.encoding import WoEEncoder


from fast_ml.model_development import train_valid_test_split


## Data Extraction

In [4]:
# Relative paths to the competition CSV files read by the cells below.
TRAIN_PATH = r'../../Datasets/train.csv'
TEST_PATH = r'../../Datasets/test.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/sample_submission.csv"

In [5]:
# Load the raw train and test tables from TRAIN_PATH / TEST_PATH.
train_dat = pd.read_csv(TRAIN_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [6]:
train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

## Data Prep


In [7]:
# #drop id
# train_dat = train_dat.drop(columns='id')
# test_dat = test_dat.drop(columns='id')


In [8]:
# Keep the label and the product code as separate Series, then remove the
# label from the training frame before it is stacked with the test frame
# inside preprocessing().
# NOTE(review): because the drop is in place, re-running this cell without
# reloading train_dat raises a KeyError on 'failure'.
target, groups = train_dat['failure'], train_dat['product_code']
train_dat.drop('failure',axis=1, inplace = True)

In [9]:
def preprocessing(df_train, df_test):
    """Impute missing measurements and derive the modelling features.

    Train and test are stacked into one frame so that imputation and feature
    engineering see both. Missing values are filled per product code in two
    passes: a HuberRegressor per selected measurement column (using the
    columns most correlated with it as predictors), then a KNNImputer for
    whatever is still missing in ``loading`` and the measurement columns.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training rows without the label column.
    df_test : pandas.DataFrame
        Test rows with the same columns as ``df_train``.
        Both frames must contain ``product_code``, ``loading``,
        ``attribute_0``, ``attribute_2``, ``attribute_3`` and
        ``measurement_0`` .. ``measurement_17``; every product code must be
        one of the keys of the hand-written ``measurement_17`` table below.

    Returns
    -------
    tuple
        ``(df_train, df_test, features)``: independent copies of the processed
        train and test rows (original row order and index labels) and the
        list of feature column names intended for modelling.

    Notes
    -----
    Prints a progress report and calls ``display`` (available in a
    notebook/IPython session) to show the correlation ranking.
    """
    data = pd.concat([df_train, df_test])
    n_train = df_train.shape[0]

    # Missingness flags have to be recorded before anything is imputed.
    data['m3_missing'] = data['measurement_3'].isnull().astype(np.int8)
    data['m5_missing'] = data['measurement_5'].isnull().astype(np.int8)
    data['area'] = data['attribute_2'] * data['attribute_3']

    # full_fill_dict[target_column][product_code] -> predictor columns used
    # to regress target_column. Only measurement_17 is selected by hand.
    full_fill_dict = {}
    full_fill_dict['measurement_17'] = {
        'A': ['measurement_5','measurement_6','measurement_8'],
        'B': ['measurement_4','measurement_5','measurement_7'],
        'C': ['measurement_5','measurement_7','measurement_8','measurement_9'],
        'D': ['measurement_5','measurement_6','measurement_7','measurement_8'],
        'E': ['measurement_4','measurement_5','measurement_6','measurement_8'],
        'F': ['measurement_4','measurement_5','measurement_6','measurement_7'],
        'G': ['measurement_4','measurement_6','measurement_8','measurement_9'],
        'H': ['measurement_4','measurement_5','measurement_7','measurement_8','measurement_9'],
        'I': ['measurement_3','measurement_7','measurement_8']
    }

    # Columns left out of the correlation analysis.
    excluded = [name for name in df_test.columns if 'measurement' not in name] + ['loading','m3_missing','m5_missing']

    # Rank measurement_3 .. measurement_16 by the sum of their three strongest
    # absolute correlations. The matrix does not change inside the loop, so it
    # is computed once.
    abs_corr = np.absolute(data.drop(excluded, axis=1).corr())
    names = []
    totals = []
    for x in range(3, 17):
        corr = abs_corr[f'measurement_{x}'].sort_values(ascending=False)
        # Entry 0 is skipped: the top entry is the column's correlation with itself.
        totals.append(np.round(np.sum(corr.iloc[1:4]), 3))
        names.append(f'measurement_{x}')
    ranking = pd.DataFrame({'Selected columns': names, 'correlation total': totals})
    ranking = ranking.sort_values(by='correlation total', ascending=False).reset_index(drop=True)
    print('Columns selected by correlation sum of the 3 first rows : ')
    display(ranking.head(10))

    # For the ten best-ranked columns, pick per product code the four columns
    # most correlated with them. One correlation matrix per product code is
    # enough because no value has been imputed yet.
    product_codes = data.product_code.unique()
    abs_corr_by_code = {
        code: np.absolute(data[data.product_code == code].drop(excluded, axis=1).corr())
        for code in product_codes
    }
    for i in range(10):
        measurement_col = ranking.iloc[i, 0]
        full_fill_dict[measurement_col] = {
            code: abs_corr_by_code[code][measurement_col]
            .sort_values(ascending=False)
            .iloc[1:5]
            .index.tolist()
            for code in product_codes
        }

    feature = [f for f in data.columns if f.startswith('measurement') or f=='loading']
    nullValue_cols = [name for name in df_train.columns if df_train[name].isnull().sum()!=0]

    for code in product_codes:
        total_na_filled_by_linear_model = 0
        print(f'\n-------- Product code {code} ----------\n')
        print('filled by linear model :')
        is_code = data.product_code == code
        for measurement_col, predictors_by_code in full_fill_dict.items():
            column = predictors_by_code[code]
            # Re-slice on every pass: values filled for an earlier column are
            # used as training data / predictors for the later ones.
            code_rows = data[is_code]
            tmp_train = code_rows[column+[measurement_col]].dropna(how='any')
            # Rows of this product whose target is missing but whose
            # predictors are all present.
            to_fill = is_code & data[column].notnull().all(axis=1) & data[measurement_col].isnull()
            n_to_fill = int(to_fill.sum())

            # Skip fitting when there is nothing to fill; predicting on zero
            # rows would raise.
            if n_to_fill:
                model = HuberRegressor(epsilon=1.9)
                model.fit(tmp_train[column], tmp_train[measurement_col])
                data.loc[to_fill, measurement_col] = model.predict(data.loc[to_fill, column])
            print(f'{measurement_col} : {n_to_fill}')
            total_na_filled_by_linear_model += n_to_fill

        # Whatever is still missing in loading / measurement columns is
        # imputed with KNN, again within the product code only.
        NA = data.loc[is_code, nullValue_cols].isnull().sum().sum()
        model1 = KNNImputer(n_neighbors=3)
        data.loc[is_code, feature] = model1.fit_transform(data.loc[is_code, feature])
        print(f'\n{total_na_filled_by_linear_model} filled by linear model ')
        print(f'{NA} filled by KNN ')

    data['measurement_avg'] = data[[f'measurement_{i}' for i in range(3, 17)]].mean(axis=1)

    # Encode attribute_0 once on the stacked frame so a given category maps to
    # the same integer in train and test even if their category sets differ.
    data['attribute_0'] = LabelEncoder().fit_transform(data['attribute_0'].astype(str))

    # Copies, so callers can add columns without chained-assignment warnings.
    df_train = data.iloc[:n_train, :].copy()
    df_test = data.iloc[n_train:, :].copy()

    features = ['loading', 'attribute_0', 'measurement_17', 'measurement_0', 'measurement_1', 'measurement_2', 'area', 'm3_missing', 'm5_missing', 'measurement_avg']

    return df_train, df_test, features

def scale(train_data, val_data, test_data, feats):
    """Standardise the ``feats`` columns of three frames with one scaler.

    The StandardScaler is fitted on ``train_data[feats]`` only and then
    applied to the validation and test frames. The inputs are not modified.

    Returns
    -------
    tuple
        ``(new_train, new_val, new_test)``: copies of the inputs whose
        ``feats`` columns hold the scaled values.
    """
    scaler = StandardScaler().fit(train_data[feats])

    scaled_frames = []
    for frame in (train_data, val_data, test_data):
        scaled = frame.copy()
        scaled[feats] = scaler.transform(frame[feats])
        scaled_frames.append(scaled)

    new_train, new_val, new_test = scaled_frames
    return new_train, new_val, new_test

# Run the imputation / feature pipeline, then re-attach the label that was
# split off earlier (the assignment aligns on the row index).
train_dat, test_dat, features = preprocessing(train_dat, test_dat)
train_dat['failure'] = target

Columns selected by correlation sum of the 3 first rows : 


Unnamed: 0,Selected columns,correlation total
0,measurement_8,0.448
1,measurement_11,0.395
2,measurement_5,0.376
3,measurement_6,0.359
4,measurement_7,0.33
5,measurement_4,0.328
6,measurement_15,0.301
7,measurement_10,0.3
8,measurement_16,0.252
9,measurement_14,0.225



-------- Product code A ----------

filled by linear model :
measurement_17 : 386
measurement_8 : 167
measurement_11 : 225
measurement_5 : 113
measurement_6 : 146
measurement_7 : 153
measurement_4 : 79
measurement_15 : 273
measurement_10 : 209
measurement_16 : 293
measurement_14 : 237

2281 filled by linear model 
1568 filled by KNN 

-------- Product code B ----------

filled by linear model :
measurement_17 : 418
measurement_8 : 165
measurement_11 : 220
measurement_5 : 83
measurement_6 : 106
measurement_7 : 174
measurement_4 : 80
measurement_15 : 294
measurement_10 : 197
measurement_16 : 358
measurement_14 : 330

2425 filled by linear model 
1550 filled by KNN 

-------- Product code C ----------

filled by linear model :
measurement_17 : 391
measurement_8 : 189
measurement_11 : 231
measurement_5 : 141
measurement_6 : 150
measurement_7 : 140
measurement_4 : 108
measurement_15 : 319
measurement_10 : 262
measurement_16 : 343
measurement_14 : 330

2604 filled by linear model 
1740 fill

In [10]:
target

0        0
1        0
2        0
3        0
4        0
        ..
26565    0
26566    0
26567    0
26568    0
26569    0
Name: failure, Length: 26570, dtype: int64

In [11]:
test_dat[features]

Unnamed: 0,loading,attribute_0,measurement_17,measurement_0,measurement_1,measurement_2,area,m3_missing,m5_missing,measurement_avg
0,119.57,0,634.612,6.0,9.0,6.0,24,0,0,15.423286
1,113.51,0,537.037,11.0,8.0,0.0,24,0,0,15.223815
2,112.16,0,658.995,8.0,12.0,4.0,24,0,0,15.259429
3,112.72,0,594.301,8.0,11.0,10.0,24,0,0,14.892357
4,208.00,0,801.044,14.0,16.0,8.0,24,0,0,15.430714
...,...,...,...,...,...,...,...,...,...,...
20770,144.74,1,696.466,0.0,4.0,9.0,45,0,0,15.732368
20771,74.53,1,613.249,4.0,8.0,7.0,45,0,0,15.118155
20772,67.73,1,783.349,10.0,11.0,2.0,45,0,0,15.764572
20773,126.15,1,745.210,8.0,16.0,11.0,45,0,0,15.512000


### Null Handling

In [281]:
# Fill remaining nulls with each numeric column's own mean (rows are kept,
# nothing is dropped). numeric_only=True is needed because the frames still
# contain string columns here; pandas >= 2.0 raises a TypeError when asked to
# average them.
train_dat = train_dat.fillna(train_dat.mean(numeric_only=True))
test_dat = test_dat.fillna(test_dat.mean(numeric_only=True))

train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26570 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26570 non-null  float64
 11  measurement_4   26570 non-null  float64
 12  measurement_5   26570 non-null  float64
 13  measurement_6   26570 non-null  float64
 14  measurement_7   26570 non-null  float64
 15  measurement_8   26570 non-null  float64
 16  measurement_9   26570 non-null  float64
 17  measurement_10  26570 non-null 

### Outlier Handling

In [282]:
def handle_outliers_iqr(data):
    """Clip values outside the 1.5 * IQR fences to the nearest fence.

    Parameters
    ----------
    data : pandas.Series, pandas.DataFrame or numpy.ndarray
        Numeric values. Missing values are ignored when the quartiles are
        computed and stay missing in the result.

    Returns
    -------
    Same kind of object as ``data``
        A new object in which values below ``Q1 - 1.5 * IQR`` or above
        ``Q3 + 1.5 * IQR`` are replaced by that bound. The input itself is
        never modified, so passing a DataFrame column is safe.
    """
    # No values means no quartiles; hand back an untouched copy.
    if len(data) == 0:
        return data.copy()

    # nan-aware quartiles: a single NaN would otherwise turn both bounds into
    # NaN and silently disable the clipping.
    Q1 = np.nanpercentile(data, 25)
    Q3 = np.nanpercentile(data, 75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    # clip() builds a new object (upcasting integer data when a fractional
    # bound is applied) instead of writing the bounds into the caller's data.
    if isinstance(data, (pd.Series, pd.DataFrame)):
        return data.clip(lower=lower_bound, upper=upper_bound)
    return np.clip(data, lower_bound, upper_bound)

# Clip every numeric column of the training frame (the label is skipped) and
# the same-named column of the test frame to their IQR fences.
# NOTE(review): train and test are clipped independently, each with bounds
# computed from its own values, and the loop assumes every numeric train
# column other than 'failure' also exists in test_dat.
for column in train_dat.select_dtypes(include=np.number):
    if column != 'failure':
        train_dat[column] = handle_outliers_iqr(train_dat[column])
        test_dat[column] = handle_outliers_iqr(test_dat[column])


### Encoding

#### One hot encoding

In [283]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

#### Label Encoding

In [284]:
def label_encoder(train, test, columns):
    """Integer-encode categorical columns consistently across train and test.

    For each column, values are converted to strings and mapped to integer
    codes taken from ONE vocabulary built from both frames (codes follow the
    sorted order of the string labels). Encoding each frame on its own would
    give the same category different codes whenever the two frames do not
    contain exactly the same set of categories.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames containing every column listed in ``columns``. Both are
        modified in place.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple
        ``(train, test)``: the same two frame objects, with encoded columns.
    """
    for col in columns:
        train_values = train[col].astype(str)
        test_values = test[col].astype(str)

        # Shared vocabulary so a label means the same integer in both frames.
        classes = sorted(set(train_values) | set(test_values))
        codes = {label: code for code, label in enumerate(classes)}

        train[col] = train_values.map(codes)
        test[col] = test_values.map(codes)
    return train, test

In [285]:
# Categorical columns to convert to integer codes in both frames.
cat_features = ['product_code','attribute_0', 'attribute_1' ]
train_dat, test_dat = label_encoder(train_dat, test_dat, cat_features)

## Feature Engineering

### Feature Selection

In [286]:
# Feature subset taken from the explore.ipynb notebook.
# The training frame additionally keeps the 'failure' label column.

train_dat = train_dat[['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'failure']]
test_dat = test_dat[['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14']]

## SPLIT TRAIN AND TEST

In [287]:
# Separate the features from the target
full_train = train_dat.drop(columns=["failure"])
full_target = train_dat["failure"]

In [29]:
train_dat.head()

Unnamed: 0,id,product_code,loading,attribute_0,attribute_1,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,...,measurement_13,measurement_14,measurement_15,measurement_16,measurement_17,m3_missing,m5_missing,area,measurement_avg,failure
0,0,A,80.1,1,material_8,9,5,7.0,8.0,4.0,...,15.029,15.495857,13.034,14.684,764.1,0,0,45,15.360918,0
1,1,A,84.89,1,material_8,9,5,14.0,3.0,3.0,...,14.732,15.425,14.395,15.631,682.057,0,0,45,15.446286,0
2,2,A,82.43,1,material_8,9,5,12.0,1.0,5.0,...,16.711,18.631,14.094,17.946,663.376,0,0,45,16.09882,0
3,3,A,101.07,1,material_8,9,5,13.0,2.0,6.0,...,15.25,15.562,16.154,17.172,826.282,0,0,45,15.5995,0
4,4,A,188.06,1,material_8,9,5,9.0,2.0,8.0,...,16.182,12.76,13.153,16.412,579.885,0,0,45,15.194071,0


In [12]:
# Standardise the selected features; scale() fits its scaler on the first
# (training) frame and applies it to the other two.
# NOTE(review): the training frame is passed for both the train and the
# validation slot, so X_val duplicates X_train rather than being a held-out split.
X_train, X_val, X_test = scale(train_dat[features], train_dat[features], test_dat[features], features)

## Oversampling

In [289]:
# NOTE(review): this import sits outside the import cell at the top of the notebook.
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling
# NOTE(review): y_train is read here but is not assigned in any cell shown
# before this one — confirm it exists on a fresh Restart & Run All.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

## FEATURE SCALING

In [290]:
# Min-max scale the features; the scaler is fitted on X_train only and the
# same fitted transform is applied to X_test.
# NOTE(review): a cell above already standardises X_train / X_test through
# scale() — confirm that applying both scalers is intended.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [291]:
print(X_train.shape)
print(X_test.shape)

(29198, 12)
(7971, 12)


#  MODEL

In [13]:
def metrics(y_true, y_pred):
    """Print the ROC AUC of ``y_pred`` measured against the labels ``y_true``."""
    auc = roc_auc_score(y_true, y_pred)
    print("ROC_AUC  :", auc)

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    """Fit every model and print its ROC AUC on the evaluation data.

    Parameters
    ----------
    models : dict
        Keys are unfitted estimator objects; each value is a label that is
        printed next to the estimator's class name.
    X_train, y_train
        Data the estimators are fitted on.
    X_test, y_test
        Data the ROC AUC is computed on. The target is assumed to be binary,
        with the positive class in the second ``predict_proba`` column.

    Notes
    -----
    ROC AUC measures how well samples are ranked, so it is computed from
    continuous scores (``predict_proba``, else ``decision_function``). Hard
    class labels from ``predict`` are used only when an estimator offers
    neither; they collapse the ROC curve to a single operating point.
    """
    for model, description in models.items():
        model.fit(X_train, y_train)

        if hasattr(model, "predict_proba"):
            y_score = model.predict_proba(X_test)[:, 1]
        elif hasattr(model, "decision_function"):
            y_score = model.decision_function(X_test)
        else:
            y_score = model.predict(X_test)

        print(model.__class__.__name__, description)
        metrics(y_test, y_score)

## Tuning KNN

In [293]:
# from sklearn.model_selection import GridSearchCV

# # Load the dataset

# # Define parameter grid
# param_grid = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}

# # Choose a cross-validation strategy
# cv = 5  # 5-fold cross-validation

# # Choose a performance metric
# scoring = 'roc_auc'

# # Initialize KNN classifier
# knn = KNeighborsClassifier()

# # Perform grid search
# grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=cv, scoring=scoring)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Hyperparameters:", best_params)
# print("Best Score:", best_score)


## SVM

In [294]:
# svm_model = SVC()
# svm_model.fit(X_train, y_train)

# y_pred = svm_model.predict(X_test)

# metrics(y_test, y_pred)

## KNN

In [14]:
# Candidate hyperparameters (marked as example values); they are currently
# NOT passed to the classifier.
best_params = {'n_neighbors': 5, 'weights': 'distance'}  # Example values
# To apply them: KNeighborsClassifier(n_neighbors=best_params['n_neighbors'], weights=best_params['weights'])
# The classifier below is created with its default settings and fitted on
# X_train against the full label Series `target`.
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, target)



## Naive Bayes

In [296]:
# Gaussian Naive Bayes baseline, scored with ROC AUC on hard class predictions.
# NOTE(review): y_test is not assigned in any cell shown before this one, and
# y_train only appears in the SMOTE cell — confirm both exist on a fresh
# Restart & Run All.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

metrics(y_test, y_pred)

ROC_AUC  : 0.5413562023823936


### Kaggle Submission


In [25]:
target.unique()

array([0, 1], dtype=int64)

In [27]:
# Fill the sample-submission layout with the KNN class predictions for the
# test features and write it to disk.
# NOTE(review): predict() yields hard 0/1 labels; if the leaderboard metric is
# ROC AUC, submitting predict_proba scores is presumably preferable — confirm.
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# submission['failure'] = nb_model.predict(test_dat)
submission['failure'] = (knn_model.predict(X_test))
submission.to_csv('../submissions/testing_ATM.csv', index=False)


In [28]:
file = pd.read_csv('../submissions/testing_ATM.csv')

In [29]:
count_promosi = file['failure'].value_counts().sort_index()
print(count_promosi)

0    19315
1     1460
Name: failure, dtype: int64


In [299]:
count_promosi = file['failure'].value_counts().sort_index()
print(count_promosi)

0     9129
1    11646
Name: failure, dtype: int64
