## Importing Libraries

In [55]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler


from fast_ml.model_development import train_valid_test_split


## Data Extraction

In [56]:
TRAIN_PATH = r'../../Datasets/train.csv'
TEST_PATH = r'../../Datasets/test.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/sample_submission.csv"

In [57]:
train_dat = pd.read_csv(TRAIN_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [58]:
train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

## Data Prep


In [59]:
#drop id
train_dat = train_dat.drop(columns='id')
test_dat = test_dat.drop(columns='id')


In [62]:
train_dat.isnull().count()

product_code      26570
loading           26570
attribute_0       26570
attribute_1       26570
attribute_2       26570
attribute_3       26570
measurement_0     26570
measurement_1     26570
measurement_2     26570
measurement_3     26570
measurement_4     26570
measurement_5     26570
measurement_6     26570
measurement_7     26570
measurement_8     26570
measurement_9     26570
measurement_10    26570
measurement_11    26570
measurement_12    26570
measurement_13    26570
measurement_14    26570
measurement_15    26570
measurement_16    26570
measurement_17    26570
failure           26570
dtype: int64

### Null Handling

In [30]:
# fill null values with mean
train_dat = train_dat.fillna(train_dat.mean())
test_dat = test_dat.fillna(test_dat.mean())

train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   product_code    26570 non-null  object 
 1   loading         26570 non-null  float64
 2   attribute_0     26570 non-null  object 
 3   attribute_1     26570 non-null  object 
 4   attribute_2     26570 non-null  int64  
 5   attribute_3     26570 non-null  int64  
 6   measurement_0   26570 non-null  int64  
 7   measurement_1   26570 non-null  int64  
 8   measurement_2   26570 non-null  int64  
 9   measurement_3   26570 non-null  float64
 10  measurement_4   26570 non-null  float64
 11  measurement_5   26570 non-null  float64
 12  measurement_6   26570 non-null  float64
 13  measurement_7   26570 non-null  float64
 14  measurement_8   26570 non-null  float64
 15  measurement_9   26570 non-null  float64
 16  measurement_10  26570 non-null  float64
 17  measurement_11  26570 non-null 

### Encoding

#### One hot encoding

In [31]:
# Perform one-hot encoding
train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

In [32]:
test_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20775 entries, 0 to 20774
Data columns (total 27 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   product_code            20775 non-null  object 
 1   loading                 20775 non-null  float64
 2   attribute_2             20775 non-null  int64  
 3   attribute_3             20775 non-null  int64  
 4   measurement_0           20775 non-null  int64  
 5   measurement_1           20775 non-null  int64  
 6   measurement_2           20775 non-null  int64  
 7   measurement_3           20775 non-null  float64
 8   measurement_4           20775 non-null  float64
 9   measurement_5           20775 non-null  float64
 10  measurement_6           20775 non-null  float64
 11  measurement_7           20775 non-null  float64
 12  measurement_8           20775 non-null  float64
 13  measurement_9           20775 non-null  float64
 14  measurement_10          20775 non-null

#### Label Encoding

In [33]:
label_encoder = LabelEncoder()
train_dat['product_code_encoded'] = label_encoder.fit_transform(train_dat['product_code'])
train_dat =train_dat.drop(columns='product_code')
test_dat['product_code_encoded'] = label_encoder.fit_transform(test_dat['product_code'])
test_dat =test_dat.drop(columns='product_code')


In [34]:
test_dat.head()

Unnamed: 0,loading,attribute_2,attribute_3,measurement_0,measurement_1,measurement_2,measurement_3,measurement_4,measurement_5,measurement_6,...,measurement_14,measurement_15,measurement_16,measurement_17,attribute_0_material_5,attribute_0_material_7,attribute_1_material_5,attribute_1_material_6,attribute_1_material_7,product_code_encoded
0,119.57,6,4,6,9,6,19.305,10.178,17.534,18.168,...,16.825,13.742,17.71,634.612,1,0,0,1,0,0
1,113.51,6,4,11,8,0,17.883,11.927,17.228,16.033,...,16.708,14.776,14.102,537.037,1,0,0,1,0,0
2,112.16,6,4,8,12,4,18.475,10.481,16.619,18.189,...,15.737,17.065,16.021,658.995,1,0,0,1,0,0
3,112.72,6,4,8,11,10,16.518,10.888,15.293,18.592,...,15.667,12.62,16.111,594.301,1,0,0,1,0,0
4,208.0,6,4,14,16,8,17.808,12.693,17.678,15.814,...,16.183,13.324,17.15,801.044,1,0,0,1,0,0


# Data Preprocessing

Feature Engineering

Outlier Handling

# SPLIT TRAIN AND TEST

In [35]:
# Memisahkan fitur dengan target
full_train = train_dat.drop(columns=["failure"])
full_target = train_dat["failure"]

In [36]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
                                        full_train,
                                        full_target,
                                        test_size=0.3,
                                        random_state=42,
                                        )

# FEATURE SCALING

In [37]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [38]:
print(X_train.shape)
print(X_test.shape)

(18599, 27)
(7971, 27)


#  MODEL

In [46]:
def metrics(y_true, y_pred):
    print("ROC_AUC  :", roc_auc_score(y_true, y_pred))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

### SVM, KNN, Naive Bayes

## SVM

In [40]:
svm_model = SVC()
svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

metrics(y_test, y_pred)

MSE  : 0.20687492159076654
RMSE : 0.4548350487712733


## KNN

In [47]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

metrics(y_test, y_pred)

ROC_AUC  : 0.5038937732050849


## Naive Bayes

In [48]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

metrics(y_test, y_pred)

ROC_AUC  : 0.5093611708341256


# Kaggle Submission


In [43]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission['failure'] = knn_model.predict(test_dat)
submission.to_csv('../submissions/testing_2.csv', index=False)

In [20]:
file = pd.read_csv('../submissions/submission_1.csv')

In [21]:
count_promosi = file['failure'].value_counts().sort_index()
print(count_promosi)

0     9129
1    11646
Name: failure, dtype: int64


In [24]:
file = pd.read_csv('../submissions/testing_2.csv')
count_promosi = file['failure'].value_counts().sort_index()
print(count_promosi)

0    20774
1        1
Name: failure, dtype: int64
