# Importing Libraries

In [1]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression


from sklearn.model_selection import train_test_split

%pip install fast-ml
from fast_ml.model_development import train_valid_test_split


Note: you may need to restart the kernel to use updated packages.


# Data Extraction

In [2]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [3]:
train_feature_dat = pd.read_csv(TRAIN_FEATURE_PATH)
train_label_dat = pd.read_csv(TRAIN_LABEL_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [4]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

# Data Prep


In [5]:
train_feature_dat = pd.merge(train_feature_dat, train_label_dat, left_index=True, right_index=True)

## Dropping Irrelevant features

In [6]:
#drop tanggal_menjadi_anggota
train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota'})
test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota'})

In [7]:
train_feature_dat = train_feature_dat.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})
test_dat = test_dat.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})

In [8]:
# #drop tanggal_menjadi_anggota & Belanjaan
# train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})
# test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})

## Encoding

In [9]:
label_encoder = LabelEncoder()
train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
train_feature_dat =train_feature_dat.drop(columns='pendidikan')
test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
test_dat =test_dat.drop(columns='pendidikan')

train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
test_dat =test_dat.drop(columns='status_pernikahan')


## Binning

In [10]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
test_dat.drop(columns='tahun_kelahiran', inplace=True)

## IQR

In [11]:
def handle_outliers_iqr(data):
    # Calculate quartiles
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    
    # Calculate lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Handle outliers
    # Replace outliers with the upper or lower bound
    data[data < lower_bound] = lower_bound
    data[data > upper_bound] = upper_bound
    
    return data

for column in train_feature_dat.select_dtypes(include=np.number):
    if column != 'jumlah_promosi':
        train_feature_dat[column] = handle_outliers_iqr(train_feature_dat[column])
        test_dat[column] = handle_outliers_iqr(test_dat[column])


## Null Handling

### Simple Imputer

In [12]:
# from datetime import datetime

# # Ubah kolom 'tanggal_menjadi_anggota' menjadi format datetime
# train_feature_dat['tanggal_menjadi_anggota'] = pd.to_datetime(train_feature_dat['tanggal_menjadi_anggota'])
# test_dat['tanggal_menjadi_anggota'] = pd.to_datetime(test_dat['tanggal_menjadi_anggota'])

# # Hitung jumlah hari sejak tanggal tertentu (misalnya 2000-01-01)
# train_feature_dat['tanggal_menjadi_anggota'] = (train_feature_dat['tanggal_menjadi_anggota'] - datetime(2000, 1, 1)).dt.days
# test_dat['tanggal_menjadi_anggota'] = (test_dat['tanggal_menjadi_anggota'] - datetime(2000, 1, 1)).dt.days


In [13]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Membuat objek SimpleImputer untuk data pelatihan dengan strategi 'median'
imputer_median = SimpleImputer(strategy='median')

# Mengisi nilai yang hilang dalam data pelatihan dengan strategi 'median'
train_feature_dat = pd.DataFrame(imputer_median.fit_transform(train_feature_dat), columns=train_feature_dat.columns)

# Membuat objek SimpleImputer untuk data pengujian dengan strategi 'most_frequent'
imputer_most_frequent = SimpleImputer(strategy='most_frequent')

# Mengambil kolom 'ID' dari df_test
test_dat_id = test_dat['ID']

# Menghapus kolom 'ID' dari df_test
test_dat_features = test_dat.drop('ID', axis=1)

# Melakukan imputasi nilai yang hilang dalam data pengujian kecuali pada kolom 'ID' dengan strategi 'most_frequent'
test_dat_features = pd.DataFrame(imputer_median.fit_transform(test_dat_features), columns=test_dat_features.columns)

# Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
test_dat = pd.concat([test_dat_id, test_dat_features], axis=1)


### Mean for Numerical, Mode for Categorical

In [14]:
# # fill all null values with mean and mode
# train_feature_dat.fillna(train_feature_dat.mean(), inplace=True)
# test_dat.fillna(test_dat.mean(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### Median for Numerical, Mode for Categorical

In [15]:
# # fill all null values with median and mode
# train_feature_dat.fillna(train_feature_dat.median(), inplace=True)
# test_dat.fillna(test_dat.median(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### KNN Imputer

In [16]:
# categorical_columns = train_feature_dat.select_dtypes(include=['object']).columns

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# train_feature_dat[categorical_columns] = encoder.fit_transform(train_feature_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# train_feature_dat = pd.DataFrame(imputer.fit_transform(train_feature_dat), columns=train_feature_dat.columns)

# # Decode the imputed numerical values back to categorical values
# train_feature_dat[categorical_columns] = encoder.inverse_transform(train_feature_dat[categorical_columns].astype(int))


In [17]:
# # Mengambil kolom 'ID' dari df_test
# test_dat_id = test_dat['ID']

# # Menghapus kolom 'ID' dari test_dat
# test_dat = test_dat.drop('ID', axis=1)

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# test_dat[categorical_columns] = encoder.fit_transform(test_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# test_dat = pd.DataFrame(imputer.fit_transform(test_dat), columns=test_dat.columns)

# # Decode the imputed numerical values back to categorical values
# test_dat[categorical_columns] = encoder.inverse_transform(test_dat[categorical_columns].astype(int))

# # Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
# test_dat = pd.concat([test_dat_id, test_dat], axis=1)

## Outlier Handling

### Windsorizer

In [18]:
# #windsorizer
# def windsorize_by_percentage(data, lower_percentile, upper_percentile):
#     lower_bound = np.percentile(data, lower_percentile)
#     upper_bound = np.percentile(data, upper_percentile)
#     windsorized_data = []
#     for value in data:
#         if value < lower_bound:
#             windsorized_data.append(lower_bound)
#         elif value > upper_bound:
#             windsorized_data.append(upper_bound)
#         else:
#             windsorized_data.append(value)

#     return windsorized_data

# # Specify lower and upper percentiles
# lower_percentile = 10
# upper_percentile = 90

# for column in train_feature_dat.select_dtypes(include=np.number):
#     train_feature_dat[column] = windsorize_by_percentage(train_feature_dat[column], lower_percentile, upper_percentile)
#     test_dat[column] = windsorize_by_percentage(test_dat[column], lower_percentile, upper_percentile)



### IQR

In [19]:
# def handle_outliers_iqr(data):
#     # Calculate quartiles
#     Q1 = np.percentile(data, 25)
#     Q3 = np.percentile(data, 75)
#     IQR = Q3 - Q1
    
#     # Calculate lower and upper bounds
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
    
#     # Handle outliers
#     # Replace outliers with the upper or lower bound
#     data[data < lower_bound] = lower_bound
#     data[data > upper_bound] = upper_bound
    
#     return data

# for column in train_feature_dat.select_dtypes(include=np.number):
#     if column != 'jumlah_promosi':
#         train_feature_dat[column] = handle_outliers_iqr(train_feature_dat[column])
#         test_dat[column] = handle_outliers_iqr(test_dat[column])


## Encoding

### One hot encoding

In [20]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

### Label Encoding

In [21]:
# label_encoder = LabelEncoder()
# train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
# train_feature_dat =train_feature_dat.drop(columns='pendidikan')
# test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
# test_dat =test_dat.drop(columns='pendidikan')

# train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
# train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
# test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
# test_dat =test_dat.drop(columns='status_pernikahan')


In [22]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# Feature Engineering

## Binning Tahun Kelahiran

In [23]:
# # Define bin edges and labels
# bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
# bin_labels = ['0', '1', '2', '3', '4', '5']

# # Perform binning
# train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
# train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
# train_feature_dat['tahun_kelahiran_binned'] = train_feature_dat['tahun_kelahiran_binned'].astype('int')

# test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
# test_dat.drop(columns='tahun_kelahiran', inplace=True)
# test_dat['tahun_kelahiran_binned'] = test_dat['tahun_kelahiran_binned'].astype('int')

## Binning Terakhir Belanja

In [24]:
# # Choose the number of bins
# num_bins = 5

# # Bin the data using equal-width binning
# train_feature_dat['terakhir_belanja_bins'] = pd.cut(train_feature_dat['terakhir_belanja'], bins=num_bins, labels=False)
# # train_feature_dat.drop(columns='terakhir_belanja', inplace=True)

# test_dat['terakhir_belanja_bins'] = pd.cut(test_dat['terakhir_belanja'], bins=num_bins, labels=False)
# # test_dat.drop(columns='terakhir_belanja', inplace=True)

In [25]:
from datetime import datetime

# Feature Engineering
train_feature_dat['total_belanja'] = train_feature_dat['belanja_buah'] + train_feature_dat['belanja_daging'] + train_feature_dat['belanja_ikan'] + train_feature_dat['belanja_kue']
train_feature_dat['usia'] = datetime.now().year - train_feature_dat['tahun_kelahiran_binned']
train_feature_dat['rata2_belanja_per_kategori'] = train_feature_dat[['belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue']].mean(axis=1)


In [26]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   pendapatan                  3817 non-null   float64
 1   terakhir_belanja            3817 non-null   float64
 2   belanja_buah                3817 non-null   float64
 3   belanja_daging              3817 non-null   float64
 4   belanja_ikan                3817 non-null   float64
 5   belanja_kue                 3817 non-null   float64
 6   pembelian_diskon            3817 non-null   float64
 7   pembelian_web               3817 non-null   float64
 8   pembelian_toko              3817 non-null   float64
 9   jumlah_promosi              3817 non-null   float64
 10  pendidikan_encoded          3817 non-null   float64
 11  status_pernikahan_encoded   3817 non-null   float64
 12  tahun_kelahiran_binned      3817 non-null   float64
 13  total_belanja               3817 

In [27]:
train_feature_dat.isna().sum()

pendapatan                    0
terakhir_belanja              0
belanja_buah                  0
belanja_daging                0
belanja_ikan                  0
belanja_kue                   0
pembelian_diskon              0
pembelian_web                 0
pembelian_toko                0
jumlah_promosi                0
pendidikan_encoded            0
status_pernikahan_encoded     0
tahun_kelahiran_binned        0
total_belanja                 0
usia                          0
rata2_belanja_per_kategori    0
dtype: int64

In [28]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# SPLIT TRAIN AND TEST

In [29]:
# Train test split
X = train_feature_dat.drop(columns='jumlah_promosi')
y = train_feature_dat['jumlah_promosi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [30]:
type(y_test)

pandas.core.series.Series

# Imbalance Handling

## Under + Oversampling (GAGAL)

In [31]:
# from imblearn.combine import SMOTEENN

# # Create an instance of SMOTEENN
# smote_enn = SMOTEENN(random_state=42)

# # Fit and transform the dataset
# X_train, y_train = smote_enn.fit_resample(X_train, y_train)


## Oversampling

In [32]:
# from imblearn.over_sampling import RandomOverSampler

# ros = RandomOverSampler(random_state=42)
# X_train_ros, y_train_ros = ros.fit_resample(X_train, y_train)

In [33]:
# from imblearn.over_sampling import ADASYN

# ada = ADASYN(random_state=42)
# X_train_ada, y_train_ada = ada.fit_resample(X_train, y_train)

In [34]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# FEATURE SCALING

In [35]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [36]:
print(X_train.shape)
print(X_test.shape)

(5516, 15)
(764, 15)


#  MODEL

In [37]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    print("F1 Score  :", f1_score(y_true, y_pred, average='macro'))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## RF, Gradient Boost, XGBosst, CatBoost

In [38]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)


from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
gb_model.fit(X_train, y_train)
y_pred_gb = gb_model.predict(X_test)

# Inisialisasi model XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)


catboost = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass')
catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)
y_pred_cat = catboost.predict(X_test)

# Inisialisasi model Logistic Regression
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)

# Inisialisasi model Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
mnb_model = MultinomialNB()
mnb_model.fit(X_train, y_train)
y_pred_mnb = mnb_model.predict(X_test)

0:	learn: 1.9068457	test: 1.9198210	best: 1.9198210 (0)	total: 180ms	remaining: 2m 59s
100:	learn: 0.9730333	test: 1.3131793	best: 1.3131793 (100)	total: 2.5s	remaining: 22.2s
200:	learn: 0.6812832	test: 1.1432456	best: 1.1432456 (200)	total: 5.07s	remaining: 20.1s
300:	learn: 0.5189338	test: 1.0514041	best: 1.0514041 (300)	total: 7.53s	remaining: 17.5s
400:	learn: 0.4027976	test: 0.9990787	best: 0.9990787 (400)	total: 11s	remaining: 16.5s
500:	learn: 0.3228309	test: 0.9628792	best: 0.9628792 (500)	total: 15.5s	remaining: 15.5s
600:	learn: 0.2676558	test: 0.9400231	best: 0.9400231 (600)	total: 19.5s	remaining: 13s
700:	learn: 0.2256784	test: 0.9223783	best: 0.9223783 (700)	total: 23.2s	remaining: 9.91s
800:	learn: 0.1941856	test: 0.9098194	best: 0.9098194 (800)	total: 26.1s	remaining: 6.48s
900:	learn: 0.1666231	test: 0.8999988	best: 0.8998408 (899)	total: 29s	remaining: 3.19s
999:	learn: 0.1442474	test: 0.8913115	best: 0.8913115 (999)	total: 32.2s	remaining: 0us

bestTest = 0.89131146

In [39]:
from sklearn.metrics import f1_score

# Evaluasi kinerja Random Forest Classifier
f1_macro_rf = f1_score(y_test, y_pred_rf, average='macro')
print("F1-score Macro untuk Random Forest Classifier:", f1_macro_rf)

# Evaluasi kinerja Gradient Boosting Classifier
f1_macro_gb = f1_score(y_test, y_pred_gb, average='macro')
print("F1-score Macro untuk Gradient Boosting Classifier:", f1_macro_gb)

# Evaluasi kinerja XGradient Boosting Classifier
f1_macro_xgb = f1_score(y_test, y_pred_xgb, average='macro')
print("F1-score Macro untuk XGBClassifier:", f1_macro_xgb)

# Evaluasi kinerja XGradient Boosting Classifier
f1_macro_cat = f1_score(y_test, y_pred_cat, average='macro')
print("F1-score Macro untuk catboostClassifier:", f1_macro_cat)

# Evaluasi kinerja Logistic Regression
f1_macro_lr = f1_score(y_test, y_pred_lr, average='macro')
print("F1-score Macro untuk Logistic Regression:", f1_macro_lr)

# Evaluasi kinerja Multinomial Naive Bayes
f1_macro_mnb = f1_score(y_test, y_pred_mnb, average='macro')
print("F1-score Macro untuk Multinomial Naive Bayes:", f1_macro_mnb)

F1-score Macro untuk Random Forest Classifier: 0.7053163740667973
F1-score Macro untuk Gradient Boosting Classifier: 0.5649588730521066
F1-score Macro untuk XGBClassifier: 0.6638336134398657
F1-score Macro untuk catboostClassifier: 0.6878152142312227
F1-score Macro untuk Logistic Regression: 0.25091716155238936
F1-score Macro untuk Multinomial Naive Bayes: 0.1690589435057072


## CV score for RF

In [40]:
from sklearn.model_selection import cross_val_score

# Cross-validation untuk Random Forest
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-Validation Scores - Random Forest:", cv_scores_rf)

# # Cross-validation untuk Logistic Regression
# cv_scores_lr = cross_val_score(lr_model, X_train, y_train, cv=5, scoring='f1_macro')
# print("Cross-Validation Scores - Logistic Regression:", cv_scores_lr)

# # Cross-validation untuk Multinomial Naive Bayes
# cv_scores_mnb = cross_val_score(mnb_model, X_train, y_train, cv=5, scoring='f1_macro')
# print("Cross-Validation Scores - Multinomial Naive Bayes:", cv_scores_mnb)

Cross-Validation Scores - Random Forest: [0.80380166 0.79843607 0.86213882 0.89376665 0.90075187]


## Stacking

In [41]:
# Meta-model dan base-models
meta_model = LogisticRegression()
base_models = [
    ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
    ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('cb', CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass'))
]

# Membuat Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Latih Stacking Classifier
stacking_clf.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred = stacking_clf.predict(X_test)

# Evaluasi dengan F1-score Macro
f1_macro = f1_score(y_test, y_pred, average='macro')
print("F1-score Macro untuk Stacking Classifier:", f1_macro)

0:	learn: 1.9068457	total: 28.9ms	remaining: 28.9s
1:	learn: 1.8673084	total: 49.6ms	remaining: 24.7s
2:	learn: 1.8398212	total: 72.7ms	remaining: 24.2s
3:	learn: 1.8124335	total: 94.6ms	remaining: 23.6s
4:	learn: 1.7847675	total: 115ms	remaining: 22.9s
5:	learn: 1.7625946	total: 136ms	remaining: 22.6s
6:	learn: 1.7410559	total: 156ms	remaining: 22.2s
7:	learn: 1.7168749	total: 175ms	remaining: 21.7s
8:	learn: 1.6934626	total: 194ms	remaining: 21.4s
9:	learn: 1.6731140	total: 213ms	remaining: 21.1s
10:	learn: 1.6597552	total: 236ms	remaining: 21.2s
11:	learn: 1.6425481	total: 256ms	remaining: 21.1s
12:	learn: 1.6316069	total: 278ms	remaining: 21.1s
13:	learn: 1.6159261	total: 301ms	remaining: 21.2s
14:	learn: 1.5960086	total: 319ms	remaining: 20.9s
15:	learn: 1.5822578	total: 339ms	remaining: 20.8s
16:	learn: 1.5656618	total: 363ms	remaining: 21s
17:	learn: 1.5536766	total: 386ms	remaining: 21s
18:	learn: 1.5392727	total: 405ms	remaining: 20.9s
19:	learn: 1.5273047	total: 424ms	remaini

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validation untuk Stacking Classifier
cv_scores_stacking = cross_val_score(stacking_clf, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-Validation Scores - Stacking Classifier:", cv_scores_stacking)

## Voting Classifier

In [None]:
from sklearn.ensemble import VotingClassifier

# Voting Classifier
voting_clf = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
        ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('cb', CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass'))
    ], 
    voting='soft'
)

# Latih Voting Classifier
voting_clf.fit(X_train, y_train)

# Prediksi dan evaluasi
y_pred_voting = voting_clf.predict(X_test)

# Evaluasi dengan F1-score Macro
f1_macro_voting = f1_score(y_test, y_pred_voting, average='macro')
print("F1-score Macro untuk Voting Classifier:", f1_macro_voting)

In [None]:
# Cross-validation untuk Voting Classifier
cv_scores_voting = cross_val_score(voting_clf, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-Validation Scores - Voting Classifier:", cv_scores_voting)

# Kaggle Submission


In [None]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# test_dat.drop(columns='ID', inplace=True)
submission['jumlah_promosi'] = stacking_clf.predict(test_dat)
submission.to_csv('../submissions/stackingclf.csv', index=False)

In [None]:
SUBMIT_PATH = '../submissions/stackingclf.csv'

csv = pd.read_csv(SUBMIT_PATH)

In [None]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts