## Importing Libraries

In [276]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import MinMaxScaler


from fast_ml.model_development import train_valid_test_split


## Data Extraction

In [277]:
TRAIN_PATH = r'../../Datasets/train.csv'
TEST_PATH = r'../../Datasets/test.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/sample_submission.csv"

In [278]:
train_dat = pd.read_csv(TRAIN_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [279]:
train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26320 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26189 non-null  float64
 11  measurement_4   26032 non-null  float64
 12  measurement_5   25894 non-null  float64
 13  measurement_6   25774 non-null  float64
 14  measurement_7   25633 non-null  float64
 15  measurement_8   25522 non-null  float64
 16  measurement_9   25343 non-null  float64
 17  measurement_10  25270 non-null 

## Data Prep


In [280]:
# #drop id
# train_dat = train_dat.drop(columns='id')
# test_dat = test_dat.drop(columns='id')


### Null Handling

In [281]:
# drop all null values
train_dat = train_dat.fillna(train_dat.mean())
test_dat = test_dat.fillna(test_dat.mean())

train_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26570 entries, 0 to 26569
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              26570 non-null  int64  
 1   product_code    26570 non-null  object 
 2   loading         26570 non-null  float64
 3   attribute_0     26570 non-null  object 
 4   attribute_1     26570 non-null  object 
 5   attribute_2     26570 non-null  int64  
 6   attribute_3     26570 non-null  int64  
 7   measurement_0   26570 non-null  int64  
 8   measurement_1   26570 non-null  int64  
 9   measurement_2   26570 non-null  int64  
 10  measurement_3   26570 non-null  float64
 11  measurement_4   26570 non-null  float64
 12  measurement_5   26570 non-null  float64
 13  measurement_6   26570 non-null  float64
 14  measurement_7   26570 non-null  float64
 15  measurement_8   26570 non-null  float64
 16  measurement_9   26570 non-null  float64
 17  measurement_10  26570 non-null 

### Outlier Handling

In [282]:
def handle_outliers_iqr(data):
    # Calculate quartiles
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    
    # Calculate lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Handle outliers
    # Replace outliers with the upper or lower bound
    data[data < lower_bound] = lower_bound
    data[data > upper_bound] = upper_bound
    
    return data

for column in train_dat.select_dtypes(include=np.number):
    if column != 'failure':
        train_dat[column] = handle_outliers_iqr(train_dat[column])
        test_dat[column] = handle_outliers_iqr(test_dat[column])


### Encoding

#### One hot encoding

In [283]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

#### Label Encoding

In [284]:
def label_encoder(train, test, columns):
    for col in columns:
        train[col] = train[col].astype(str)
        test[col] = test[col].astype(str)
        train[col] = LabelEncoder().fit_transform(train[col])
        test[col] = LabelEncoder().fit_transform(test[col])
    return train, test

In [285]:
cat_features = ['product_code','attribute_0', 'attribute_1' ]
train_dat, test_dat = label_encoder(train_dat, test_dat, cat_features)

## Feature Engineering

### Feature Selection

In [286]:
# Dari notebook explore.ipynb

train_dat = train_dat[['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14', 'failure']]
test_dat = test_dat[['loading', 'measurement_3', 'measurement_4', 'measurement_5', 'measurement_6', 'measurement_7', 'measurement_8', 'measurement_9', 'measurement_11', 'measurement_12', 'measurement_13', 'measurement_14']]

## SPLIT TRAIN AND TEST

In [287]:
# Memisahkan fitur dengan target
full_train = train_dat.drop(columns=["failure"])
full_target = train_dat["failure"]

In [288]:
# Train test split
X_train, X_test, y_train, y_test = train_test_split(
                                        full_train,
                                        full_target,
                                        test_size=0.3,
                                        random_state=42,
                                        )

## Oversampling

In [289]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

## FEATURE SCALING

In [290]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [291]:
print(X_train.shape)
print(X_test.shape)

(29198, 12)
(7971, 12)


#  MODEL

In [292]:
def metrics(y_true, y_pred):
    print("ROC_AUC  :", roc_auc_score(y_true, y_pred))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## Tuning KNN

In [293]:
# from sklearn.model_selection import GridSearchCV

# # Load the dataset

# # Define parameter grid
# param_grid = {'n_neighbors': [3, 5, 7, 9, 11], 'weights': ['uniform', 'distance']}

# # Choose a cross-validation strategy
# cv = 5  # 5-fold cross-validation

# # Choose a performance metric
# scoring = 'roc_auc'

# # Initialize KNN classifier
# knn = KNeighborsClassifier()

# # Perform grid search
# grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=cv, scoring=scoring)
# grid_search.fit(X_train, y_train)

# # Get the best hyperparameters
# best_params = grid_search.best_params_
# best_score = grid_search.best_score_

# print("Best Hyperparameters:", best_params)
# print("Best Score:", best_score)


## SVM

In [294]:
# svm_model = SVC()
# svm_model.fit(X_train, y_train)

# y_pred = svm_model.predict(X_test)

# metrics(y_test, y_pred)

## KNN

In [295]:
# Best hyperparameters obtained from hyperparameter tuning
best_params = {'n_neighbors': 5, 'weights': 'distance'}  # Example values
# n_neighbors=best_params['n_neighbors'], weights=best_params['weights']
# Instantiate the KNN classifier with the best hyperparameters
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)

y_pred = knn_model.predict(X_test)

metrics(y_test, y_pred)

ROC_AUC  : 0.5114617987683043


## Naive Bayes

In [296]:
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)

metrics(y_test, y_pred)

ROC_AUC  : 0.5413562023823936


### Kaggle Submission


In [297]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission['failure'] = nb_model.predict(test_dat)
submission.to_csv('../submissions/testing_3.csv', index=False)

In [298]:
file = pd.read_csv('../submissions/submission_1.csv')

In [299]:
count_promosi = file['failure'].value_counts().sort_index()
print(count_promosi)

0     9129
1    11646
Name: failure, dtype: int64
