# Importing Libraries

In [49]:
%pip install category_encoders

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [50]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from category_encoders import TargetEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split


from fast_ml.model_development import train_valid_test_split


# Data Extraction

In [51]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [52]:
train_feature_dat = pd.read_csv(TRAIN_FEATURE_PATH)
train_label_dat = pd.read_csv(TRAIN_LABEL_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [53]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

# Data Prep


In [54]:
train_feature_dat = pd.merge(train_feature_dat, train_label_dat, left_index=True, right_index=True)

## Dropping Irrelevant features

In [55]:
#drop tanggal_menjadi_anggota
train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota'})
test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota'})

In [56]:
train_feature_dat = train_feature_dat.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})
test_dat = test_dat.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})

In [57]:
# #drop tanggal_menjadi_anggota & Belanjaan
# train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})
# test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})

## Encoding

### Label Encoder

In [58]:
# label_encoder = LabelEncoder()
# train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
# train_feature_dat =train_feature_dat.drop(columns='pendidikan')
# test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
# test_dat =test_dat.drop(columns='pendidikan')

# train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
# train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
# test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
# test_dat =test_dat.drop(columns='status_pernikahan')


## Binning

In [59]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
test_dat.drop(columns='tahun_kelahiran', inplace=True)

## IQR

In [60]:
def handle_outliers_iqr(data):
    # Calculate quartiles
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    
    # Calculate lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Handle outliers
    # Replace outliers with the upper or lower bound
    data[data < lower_bound] = lower_bound
    data[data > upper_bound] = upper_bound
    
    return data

for column in train_feature_dat.select_dtypes(include=np.number):
    if column != 'jumlah_promosi':
        train_feature_dat[column] = handle_outliers_iqr(train_feature_dat[column])
        test_dat[column] = handle_outliers_iqr(test_dat[column])


## Null Handling

### Simple Imputer

In [61]:
from sklearn.impute import SimpleImputer
import pandas as pd
# Define numerical columns
numerical_columns = train_feature_dat.select_dtypes(include=['number']).columns

# Membuat objek SimpleImputer untuk data pelatihan dengan strategi 'median'
imputer_median = SimpleImputer(strategy='median')

# Mengisi nilai yang hilang dalam data pelatihan dengan strategi 'median' hanya untuk kolom numerik
train_feature_dat[numerical_columns] = imputer_median.fit_transform(train_feature_dat[numerical_columns])

# Mengambil kolom 'ID' dari df_test
test_dat_id = test_dat['ID']

# Menghapus kolom 'ID' dari df_test
test_dat_features = test_dat.drop('ID', axis=1)
numerical_columns = test_dat_features.select_dtypes(include=['number']).columns

# Mengisi nilai yang hilang dalam data pengujian kecuali pada kolom 'ID' dengan strategi 'median' hanya untuk kolom numerik
test_dat_features[numerical_columns] = imputer_median.fit_transform(test_dat_features[numerical_columns])

# Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
test_dat = pd.concat([test_dat_id, test_dat_features], axis=1)

In [62]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   pendidikan              3628 non-null   object  
 1   status_pernikahan       3605 non-null   object  
 2   pendapatan              3817 non-null   float64 
 3   terakhir_belanja        3817 non-null   float64 
 4   belanja_buah            3817 non-null   float64 
 5   belanja_daging          3817 non-null   float64 
 6   belanja_ikan            3817 non-null   float64 
 7   belanja_kue             3817 non-null   float64 
 8   pembelian_diskon        3817 non-null   float64 
 9   pembelian_web           3817 non-null   float64 
 10  pembelian_toko          3817 non-null   float64 
 11  jumlah_promosi          3817 non-null   float64 
 12  tahun_kelahiran_binned  3817 non-null   category
dtypes: category(1), float64(10), object(2)
memory usage: 361.9+ KB


### Mean for Numerical, Mode for Categorical

In [63]:
# # fill all null values with mean and mode
# train_feature_dat.fillna(train_feature_dat.mean(), inplace=True)
# test_dat.fillna(test_dat.mean(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### Median for Numerical, Mode for Categorical

In [64]:
# # fill all null values with median and mode
# train_feature_dat.fillna(train_feature_dat.median(), inplace=True)
# test_dat.fillna(test_dat.median(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### KNN Imputer

In [65]:
# categorical_columns = train_feature_dat.select_dtypes(include=['object']).columns

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# train_feature_dat[categorical_columns] = encoder.fit_transform(train_feature_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# train_feature_dat = pd.DataFrame(imputer.fit_transform(train_feature_dat), columns=train_feature_dat.columns)

# # Decode the imputed numerical values back to categorical values
# train_feature_dat[categorical_columns] = encoder.inverse_transform(train_feature_dat[categorical_columns].astype(int))


In [66]:
# # Mengambil kolom 'ID' dari df_test
# test_dat_id = test_dat['ID']

# # Menghapus kolom 'ID' dari test_dat
# test_dat = test_dat.drop('ID', axis=1)

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# test_dat[categorical_columns] = encoder.fit_transform(test_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# test_dat = pd.DataFrame(imputer.fit_transform(test_dat), columns=test_dat.columns)

# # Decode the imputed numerical values back to categorical values
# test_dat[categorical_columns] = encoder.inverse_transform(test_dat[categorical_columns].astype(int))

# # Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
# test_dat = pd.concat([test_dat_id, test_dat], axis=1)

## Outlier Handling

### Windsorizer

In [67]:
# #windsorizer
# def windsorize_by_percentage(data, lower_percentile, upper_percentile):
#     lower_bound = np.percentile(data, lower_percentile)
#     upper_bound = np.percentile(data, upper_percentile)
#     windsorized_data = []
#     for value in data:
#         if value < lower_bound:
#             windsorized_data.append(lower_bound)
#         elif value > upper_bound:
#             windsorized_data.append(upper_bound)
#         else:
#             windsorized_data.append(value)

#     return windsorized_data

# # Specify lower and upper percentiles
# lower_percentile = 10
# upper_percentile = 90

# for column in train_feature_dat.select_dtypes(include=np.number):
#     train_feature_dat[column] = windsorize_by_percentage(train_feature_dat[column], lower_percentile, upper_percentile)
#     test_dat[column] = windsorize_by_percentage(test_dat[column], lower_percentile, upper_percentile)



### IQR

In [68]:
# def handle_outliers_iqr(data):
#     # Calculate quartiles
#     Q1 = np.percentile(data, 25)
#     Q3 = np.percentile(data, 75)
#     IQR = Q3 - Q1
    
#     # Calculate lower and upper bounds
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     # Handle outliers
#     # Replace outliers with the upper or lower bound
#     data[data < lower_bound] = lower_bound
#     data[data > upper_bound] = upper_bound
    
#     return data

# for column in train_feature_dat.select_dtypes(include=np.number):
#     if column != 'jumlah_promosi':
#         train_feature_dat[column] = handle_outliers_iqr(train_feature_dat[column])
#         test_dat[column] = handle_outliers_iqr(test_dat[column])


## Encoding

### One hot encoding

In [69]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

### Label Encoding

In [70]:
# label_encoder = LabelEncoder()
# train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
# train_feature_dat =train_feature_dat.drop(columns='pendidikan')
# test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
# test_dat =test_dat.drop(columns='pendidikan')

# train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
# train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
# test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
# test_dat =test_dat.drop(columns='status_pernikahan')


In [71]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# Feature Engineering

## Binning Tahun Kelahiran

In [72]:
# # Define bin edges and labels
# bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
# bin_labels = ['0', '1', '2', '3', '4', '5']

# # Perform binning
# train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
# train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
# train_feature_dat['tahun_kelahiran_binned'] = train_feature_dat['tahun_kelahiran_binned'].astype('int')

# test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
# test_dat.drop(columns='tahun_kelahiran', inplace=True)
# test_dat['tahun_kelahiran_binned'] = test_dat['tahun_kelahiran_binned'].astype('int')

## Binning Terakhir Belanja

In [73]:
# # Choose the number of bins
# num_bins = 5

# # Bin the data using equal-width binning
# train_feature_dat['terakhir_belanja_bins'] = pd.cut(train_feature_dat['terakhir_belanja'], bins=num_bins, labels=False)
# # train_feature_dat.drop(columns='terakhir_belanja', inplace=True)

# test_dat['terakhir_belanja_bins'] = pd.cut(test_dat['terakhir_belanja'], bins=num_bins, labels=False)
# # test_dat.drop(columns='terakhir_belanja', inplace=True)

In [74]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 13 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   pendidikan              3628 non-null   object  
 1   status_pernikahan       3605 non-null   object  
 2   pendapatan              3817 non-null   float64 
 3   terakhir_belanja        3817 non-null   float64 
 4   belanja_buah            3817 non-null   float64 
 5   belanja_daging          3817 non-null   float64 
 6   belanja_ikan            3817 non-null   float64 
 7   belanja_kue             3817 non-null   float64 
 8   pembelian_diskon        3817 non-null   float64 
 9   pembelian_web           3817 non-null   float64 
 10  pembelian_toko          3817 non-null   float64 
 11  jumlah_promosi          3817 non-null   float64 
 12  tahun_kelahiran_binned  3817 non-null   category
dtypes: category(1), float64(10), object(2)
memory usage: 361.9+ KB


In [75]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# SPLIT TRAIN AND TEST

In [76]:
# Train test split
X = train_feature_dat.drop(columns='jumlah_promosi')
y = train_feature_dat['jumlah_promosi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Target Encoding

In [77]:
# Step 3: Calculate statistics and encode
encoder = TargetEncoder()
X_train = encoder.fit_transform(X_train, y_train)

# Step 4: Apply encoding to validation and test sets
X_test = encoder.transform(X_test)

# Mengambil kolom 'ID' dari df_test
test_dat_id = test_dat['ID']

# Menghapus kolom 'ID' dari df_test
test_dat_features = test_dat.drop('ID', axis=1)

test_dat_features = encoder.transform(test_dat_features)

test_dat = pd.concat([test_dat_id, test_dat_features], axis=1)

# Imbalance Handling

## Under + Oversampling (GAGAL)

In [423]:
# from imblearn.combine import SMOTEENN

# # Create an instance of SMOTEENN
# smote_enn = SMOTEENN(random_state=42)

# # Fit and transform the dataset
# X_train, y_train = smote_enn.fit_resample(X_train, y_train)


## Oversampling

In [79]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

## Undersampling

In [425]:
# from imblearn.under_sampling import RandomUnderSampler

# # Create an instance of RandomUnderSampler
# undersampler = RandomUnderSampler(random_state=42)

# # Fit and transform the dataset
# X_train, y_train = undersampler.fit_resample(X_train, y_train)


# FEATURE SCALING

In [426]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [427]:
print(X_train.shape)
print(X_test.shape)

(5516, 7)
(764, 7)


#  MODEL

In [80]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    print("F1 Score  :", f1_score(y_true, y_pred, average='macro'))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## RF, Gradient Boost, XGBosst, CatBoost

In [81]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Inisialisasi model XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


catboost = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass')
catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)
y_pred_cat = catboost.predict(X_test)



0:	learn: 1.9107474	test: 1.9254197	best: 1.9254197 (0)	total: 157ms	remaining: 2m 37s
100:	learn: 0.9657071	test: 1.3132441	best: 1.3132441 (100)	total: 1.15s	remaining: 10.3s
200:	learn: 0.6770993	test: 1.1495406	best: 1.1495406 (200)	total: 2.09s	remaining: 8.31s
300:	learn: 0.5058659	test: 1.0564352	best: 1.0564352 (300)	total: 3.04s	remaining: 7.07s
400:	learn: 0.3950518	test: 1.0004282	best: 1.0004282 (400)	total: 4.01s	remaining: 5.99s
500:	learn: 0.3174597	test: 0.9655477	best: 0.9655477 (500)	total: 5.1s	remaining: 5.08s
600:	learn: 0.2638995	test: 0.9379116	best: 0.9379116 (600)	total: 6.22s	remaining: 4.13s
700:	learn: 0.2204811	test: 0.9254334	best: 0.9254334 (700)	total: 7.36s	remaining: 3.14s
800:	learn: 0.1876185	test: 0.9069598	best: 0.9069598 (800)	total: 8.4s	remaining: 2.09s
900:	learn: 0.1616104	test: 0.8962462	best: 0.8962462 (900)	total: 9.42s	remaining: 1.03s
999:	learn: 0.1410761	test: 0.8868368	best: 0.8867157 (997)	total: 10.4s	remaining: 0us

bestTest = 0.886

In [83]:
from sklearn.metrics import f1_score

# Evaluasi kinerja Random Forest Classifier
f1_macro_rf = f1_score(y_test, y_pred_rf, average='macro')
print("F1-score Macro untuk Random Forest Classifier:", f1_macro_rf)

# Evaluasi kinerja Gradient Boosting Classifier
f1_macro_gb = f1_score(y_test, y_pred_gb, average='macro')
print("F1-score Macro untuk Gradient Boosting Classifier:", f1_macro_gb)

# Evaluasi kinerja XGradient Boosting Classifier
f1_macro_xgb = f1_score(y_test, y_pred_xgb, average='macro')
print("F1-score Macro untuk XGBClassifier:", f1_macro_xgb)

# Evaluasi kinerja XGradient Boosting Classifier
f1_macro_cat = f1_score(y_test, y_pred_cat, average='macro')
print("F1-score Macro untuk catboostClassifier:", f1_macro_cat)



F1-score Macro untuk Random Forest Classifier: 0.7351749383402358
F1-score Macro untuk Gradient Boosting Classifier: 0.5629420638058601
F1-score Macro untuk XGBClassifier: 0.6520414352551932
F1-score Macro untuk catboostClassifier: 0.6788416313416278


## CV score for RF

In [84]:
from sklearn.model_selection import cross_val_score

# Cross-validation untuk Random Forest
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-Validation Scores - Random Forest:", cv_scores_rf)


Cross-Validation Scores - Random Forest: [0.80481256 0.80755438 0.86694452 0.88691458 0.89903503]


## Stacking

In [85]:
# Meta-model dan base-models
meta_model = LogisticRegression()
base_models = [
    ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('cb', CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass'))
]

# Membuat Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Latih Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = stacking_clf.predict(X_test)

# Evaluasi dengan F1-score Macro
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1-score Macro untuk Stacking Classifier:", f1_macro)

0:	learn: 1.9107474	total: 10.8ms	remaining: 10.8s
1:	learn: 1.8722544	total: 22.8ms	remaining: 11.4s
2:	learn: 1.8416632	total: 34.2ms	remaining: 11.4s
3:	learn: 1.8116977	total: 44.7ms	remaining: 11.1s
4:	learn: 1.7825179	total: 55.5ms	remaining: 11s
5:	learn: 1.7602951	total: 65.4ms	remaining: 10.8s
6:	learn: 1.7332624	total: 75.9ms	remaining: 10.8s
7:	learn: 1.7105833	total: 86.6ms	remaining: 10.7s
8:	learn: 1.6860420	total: 96.8ms	remaining: 10.7s
9:	learn: 1.6695675	total: 108ms	remaining: 10.6s
10:	learn: 1.6487894	total: 122ms	remaining: 11s
11:	learn: 1.6293460	total: 132ms	remaining: 10.9s
12:	learn: 1.6085061	total: 143ms	remaining: 10.9s
13:	learn: 1.5942466	total: 155ms	remaining: 10.9s
14:	learn: 1.5711211	total: 169ms	remaining: 11.1s
15:	learn: 1.5552235	total: 182ms	remaining: 11.2s
16:	learn: 1.5436982	total: 194ms	remaining: 11.2s
17:	learn: 1.5297517	total: 206ms	remaining: 11.2s
18:	learn: 1.5185212	total: 219ms	remaining: 11.3s
19:	learn: 1.5063978	total: 233ms	re

In [359]:
# from sklearn.model_selection import cross_val_score

# # Cross-validation untuk Stacking Classifier
# cv_scores_stacking = cross_val_score(stacking_clf, X_train, y_train, cv=5, scoring='f1_macro')
# print("Cross-Validation Scores - Stacking Classifier:", cv_scores_stacking)


0:	learn: 1.9100642	total: 13ms	remaining: 13s
1:	learn: 1.8737557	total: 25.3ms	remaining: 12.6s
2:	learn: 1.8413881	total: 39.5ms	remaining: 13.1s
3:	learn: 1.8157819	total: 51.9ms	remaining: 12.9s
4:	learn: 1.7893226	total: 63.4ms	remaining: 12.6s
5:	learn: 1.7637227	total: 75.7ms	remaining: 12.5s
6:	learn: 1.7363847	total: 90.2ms	remaining: 12.8s
7:	learn: 1.7158957	total: 104ms	remaining: 12.9s
8:	learn: 1.6932530	total: 116ms	remaining: 12.8s
9:	learn: 1.6728312	total: 127ms	remaining: 12.6s
10:	learn: 1.6522821	total: 140ms	remaining: 12.5s
11:	learn: 1.6362418	total: 154ms	remaining: 12.6s
12:	learn: 1.6183319	total: 165ms	remaining: 12.5s
13:	learn: 1.6020157	total: 176ms	remaining: 12.4s
14:	learn: 1.5865224	total: 189ms	remaining: 12.4s
15:	learn: 1.5741802	total: 203ms	remaining: 12.5s
16:	learn: 1.5545843	total: 216ms	remaining: 12.5s
17:	learn: 1.5392098	total: 230ms	remaining: 12.5s
18:	learn: 1.5263343	total: 242ms	remaining: 12.5s
19:	learn: 1.5142209	total: 256ms	rema

# Kaggle Submission


In [32]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# test_dat.drop(columns='ID', inplace=True)
submission['jumlah_promosi'] = stacking_clf.predict(test_dat)
submission.to_csv('../submissions/stackingclf.csv', index=False)

In [33]:
SUBMIT_PATH = '../submissions/stackingclf.csv'

csv = pd.read_csv(SUBMIT_PATH)

In [34]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts

0    1112
1     456
2     341
3     471
4     522
5     578
6     338
Name: jumlah_promosi, dtype: int64