In [212]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV


In [213]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [214]:
df_train_feature = pd.read_csv(TRAIN_FEATURE_PATH)
df_train_label = pd.read_csv(TRAIN_LABEL_PATH)
df_test = pd.read_csv(TEST_PATH)

In [215]:
df_train_feature

Unnamed: 0,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota
0,1979,Sarjana,Rencana Menikah,,0.0,1.0,,50575.0,260967.0,50575.0,20230.0,2.0,2.0,5.0,0.0,2014-05-05
1,1950,Sarjana,Rencana Menikah,84063000.0,,,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,0.0,2013-03-17
2,1966,Sarjana,Menikah,127532564.0,0.0,0.0,45.0,117611.0,265460.0,96341.0,145573.0,1.0,1.0,7.0,0.0,
3,1961,Magister,Rencana Menikah,165579620.0,0.0,0.0,90.0,206346.0,1613901.0,27725.0,125868.0,0.0,7.0,8.0,0.0,
4,1970,Sarjana,Rencana Menikah,117703159.0,1.0,1.0,78.0,90563.0,311757.0,40358.0,33875.0,7.0,6.0,5.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3812,1955,Magister,Menikah,78199470.0,0.0,0.0,33.0,6069.0,25977.0,3856.0,5784.0,5.0,1.0,0.0,0.0,
3813,1947,Doktor,Rencana Menikah,109306000.0,0.0,1.0,44.0,0.0,50575.0,,0.0,3.0,6.0,3.0,0.0,2014-06-09
3814,1974,Magister,Menikah,104621000.0,0.0,2.0,68.0,2023.0,62713.0,8092.0,0.0,7.0,5.0,7.0,0.0,2013-11-07
3815,1957,SMA,Rencana Menikah,110850000.0,1.0,1.0,67.0,18207.0,70805.0,24276.0,,4.0,5.0,4.0,0.0,2013-06-30


In [216]:
df_train_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

In [217]:
df_train_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   jumlah_promosi  3817 non-null   int64
dtypes: int64(1)
memory usage: 29.9 KB


https://datascience.stackexchange.com/questions/26581/should-i-impute-target-values

Cleaning


In [218]:
#drop tanggal_menjadi_anggota
df_train_feature = df_train_feature.drop(columns={'tanggal_menjadi_anggota'})
df_test = df_test.drop(columns={'tanggal_menjadi_anggota'})

In [219]:
label_encoder = LabelEncoder()
df_train_feature['pendidikan_encoded'] = label_encoder.fit_transform(df_train_feature['pendidikan'])
df_train_feature =df_train_feature.drop(columns='pendidikan')
df_test['pendidikan_encoded'] = label_encoder.fit_transform(df_test['pendidikan'])
df_test =df_test.drop(columns='pendidikan')

df_train_feature['status_pernikahan_encoded'] = label_encoder.fit_transform(df_train_feature['status_pernikahan'])
df_train_feature =df_train_feature.drop(columns='status_pernikahan')
df_test['status_pernikahan_encoded'] = label_encoder.fit_transform(df_test['status_pernikahan'])
df_test =df_test.drop(columns='status_pernikahan')


In [220]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
df_train_feature['tahun_kelahiran_binned'] = pd.cut(df_train_feature['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
df_train_feature.drop(columns='tahun_kelahiran', inplace=True)
df_test['tahun_kelahiran_binned'] = pd.cut(df_test['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
df_test.drop(columns='tahun_kelahiran', inplace=True)

In [221]:
from sklearn.ensemble import RandomForestClassifier

# Memisahkan fitur dan label
X = df_train_feature
y = df_train_label['jumlah_promosi']

# Melatih model Random Forest untuk menentukan feature importance
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Mendapatkan feature importance
feature_importances = model.feature_importances_

# Menampilkan feature importance
feature_importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

print(feature_importance_df)


                      Feature  Importance
0                  pendapatan    0.150370
5              belanja_daging    0.116151
3            terakhir_belanja    0.095650
4                belanja_buah    0.093883
7                 belanja_kue    0.093829
6                belanja_ikan    0.093479
9               pembelian_web    0.072439
10             pembelian_toko    0.071748
8            pembelian_diskon    0.059096
13  status_pernikahan_encoded    0.043209
12         pendidikan_encoded    0.042171
14     tahun_kelahiran_binned    0.029991
2          jumlah_anak_remaja    0.019906
1          jumlah_anak_balita    0.017139
11                    keluhan    0.000938


In [222]:
df_train_feature = df_train_feature.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})
df_test = df_test.drop(columns={'keluhan', 'jumlah_anak_balita', 'jumlah_anak_remaja'})

In [223]:
def handle_outliers_iqr(data):
    # Calculate quartiles
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    
    # Calculate lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Handle outliers
    # Replace outliers with the upper or lower bound
    data[data < lower_bound] = lower_bound
    data[data > upper_bound] = upper_bound
    
    return data

for column in df_train_feature.select_dtypes(include=np.number):
    if column != 'jumlah_promosi':
        df_train_feature[column] = handle_outliers_iqr(df_train_feature[column])
        df_test[column] = handle_outliers_iqr(df_test[column])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data < lower_bound] = lower_bound
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data > upper_bound] = upper_bound
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data < lower_bound] = lower_bound
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[data > upper_bound] = upper_bound
A value is t

In [224]:
from sklearn.impute import SimpleImputer
import pandas as pd

# Membuat objek SimpleImputer untuk data pelatihan dengan strategi 'median'
imputer_median = SimpleImputer(strategy='median')

# Mengisi nilai yang hilang dalam data pelatihan dengan strategi 'median'
df_train_features_imputed = pd.DataFrame(imputer_median.fit_transform(df_train_feature), columns=df_train_feature.columns)

# Membuat objek SimpleImputer untuk data pengujian dengan strategi 'most_frequent'
imputer_most_frequent = SimpleImputer(strategy='most_frequent')

# Mengambil kolom 'ID' dari df_test
df_test_id = df_test['ID']

# Menghapus kolom 'ID' dari df_test
df_test_features = df_test.drop('ID', axis=1)

# Melakukan imputasi nilai yang hilang dalam data pengujian kecuali pada kolom 'ID' dengan strategi 'most_frequent'
df_test_features_imputed = pd.DataFrame(imputer_most_frequent.fit_transform(df_test_features), columns=df_test_features.columns)

# Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
df_test_imputed = pd.concat([df_test_id, df_test_features_imputed], axis=1)


In [225]:
df_train_features_imputed

Unnamed: 0,pendapatan,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,pendidikan_encoded,status_pernikahan_encoded,tahun_kelahiran_binned
0,115621394.0,47.0,50575.0,260967.0,50575.0,20230.0,2.0,2.0,5.0,5.0,4.0,3.0
1,84063000.0,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,5.0,4.0,2.0
2,127532564.0,45.0,117611.0,265460.0,96341.0,145573.0,1.0,1.0,7.0,5.0,3.0,3.0
3,165579620.0,90.0,206346.0,1613901.0,27725.0,125868.0,0.0,7.0,8.0,2.0,4.0,3.0
4,117703159.0,78.0,90563.0,311757.0,40358.0,33875.0,7.0,6.0,5.0,5.0,4.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3812,78199470.0,33.0,6069.0,25977.0,3856.0,5784.0,5.0,1.0,0.0,2.0,3.0,2.0
3813,109306000.0,44.0,0.0,50575.0,36054.5,0.0,3.0,6.0,3.0,1.0,4.0,2.0
3814,104621000.0,68.0,2023.0,62713.0,8092.0,0.0,7.0,5.0,7.0,2.0,3.0,3.0
3815,110850000.0,67.0,18207.0,70805.0,24276.0,27795.0,4.0,5.0,4.0,3.0,4.0,2.0


In [226]:
X_train, X_test, y_train, y_test = train_test_split(df_train_features_imputed, df_train_label, test_size=0.2)


In [227]:
# # from sklearn.preprocessing import MinMaxScaler
# from sklearn.preprocessing import StandardScaler

# scaler = StandardScaler()

# # minmax = MinMaxScaler()

# X_train = scaler.fit_transform(X_train)

# X_test = scaler.transform(X_test)

# # X_train = minmax.fit_transform(X_train)

# # X_test = minmax.transform(X_test)



In [228]:
print(X_test.shape)
print(X_train.shape)


(764, 12)
(3053, 12)


In [229]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=300, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)


from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

# Inisialisasi model XGBoost Classifier
xgb_model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)




  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


In [230]:
from sklearn.metrics import f1_score

# Evaluasi kinerja Random Forest Classifier
f1_macro_rf = f1_score(y_test, y_pred_rf, average='macro')
print("F1-score Macro untuk Random Forest Classifier:", f1_macro_rf)

# Evaluasi kinerja Gradient Boosting Classifier
f1_macro_gb = f1_score(y_test, y_pred_gb, average='macro')
print("F1-score Macro untuk Gradient Boosting Classifier:", f1_macro_gb)

# Evaluasi kinerja XGradient Boosting Classifier
f1_macro_xgb = f1_score(y_test, y_pred_xgb, average='macro')
print("F1-score Macro untuk XGBClassifier:", f1_macro_xgb)



F1-score Macro untuk Random Forest Classifier: 0.7172368779529249
F1-score Macro untuk Gradient Boosting Classifier: 0.5674939513220619
F1-score Macro untuk XGBClassifier: 0.6485073094123239


In [231]:
from sklearn.model_selection import cross_val_score

# Cross-validation untuk Random Forest
cv_scores_rf = cross_val_score(rf_model, X_train, y_train, cv=5, scoring='f1_macro')
print("Cross-Validation Scores - Random Forest:", cv_scores_rf)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)


Cross-Validation Scores - Random Forest: [0.70352967 0.71864976 0.73017511 0.66296877 0.70394196]


In [232]:
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier
# # 
# # Inisialisasi model Random Forest Classifier
# rf_model = RandomForestClassifier()

# # Lakukan cross-validation dengan F1-score macro sebagai metrik evaluasi
# f1_scores = cross_val_score(rf_model, X_train, y_train, cv=10, scoring='f1_macro')

# # Cetak hasil cross-validation
# print("F1-scores setiap fold:", f1_scores)
# print("Rata-rata F1-score:", f1_scores.mean())


Tuning RF


In [233]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

#deafult
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='f1_macro')

grid_search.fit(X_train, y_train)

print("Parameter Terbaik:", grid_search.best_params_)

y_pred_grid = grid_search.predict(X_test)
f1_macro_grid = f1_score(y_test, y_pred_grid, average='macro')
print("F1-score Macro setelah GridSearchCV:", f1_macro_grid)


  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **kwargs)
  return fit_method(estimator, *args, **

Parameter Terbaik: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
F1-score Macro setelah GridSearchCV: 0.7167335498473575


Tuning XGB

In [138]:
# from xgboost import XGBClassifier
# from sklearn.model_selection import GridSearchCV

# # Inisialisasi model XGBoost Classifier
# xgb_model = XGBClassifier()

# # Definisikan grid hyperparameter yang ingin Anda telusuri
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 5, 7],
#     'learning_rate': [0.1, 0.01, 0.001]
# }

# # Inisialisasi objek GridSearchCV
# grid_search_xgb = GridSearchCV(xgb_model, param_grid, cv=5, scoring='f1_macro')

# # Lakukan penyetelan hyperparameter pada data pelatihan
# grid_search_xgb.fit(X_train, y_train)

# # Cetak parameter terbaik yang ditemukan
# print("Parameter Terbaik:", grid_search_xgb.best_params_)

# # Evaluasi kinerja model menggunakan parameter terbaik pada data pengujian
# y_pred_grid_xgb = grid_search_xgb.predict(X_test)
# f1_macro_grid_xgb = f1_score(y_test, y_pred_grid_xgb, average='macro')
# print("F1-score Macro setelah GridSearchCV:", f1_macro_grid_xgb)


In [204]:
# from sklearn.ensemble import StackingClassifier

# # Meta-model dan base-models
# meta_model = LogisticRegression()
# base_models = [
#     ('rf', RandomForestClassifier(n_estimators=300, random_state=42)),
#     ('xgb', XGBClassifier(n_estimators=100, random_state=42)),
#     ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42))
# ]

# # Membuat Stacking Classifier
# stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# # Latih Stacking Classifier
# stacking_clf.fit(X_train, y_train)

# # Prediksi dan evaluasi
# y_pred = stacking_clf.predict(X_test)

# # Evaluasi dengan F1-score Macro
# f1_macro = f1_score(y_test, y_pred, average='macro')
# print("F1-score Macro untuk Stacking Classifier:", f1_macro)


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, dtype=self.classes_.dtype, warn=True)


F1-score Macro untuk Stacking Classifier: 0.7568628008423576


In [205]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission_data = df_test_imputed.drop(columns='ID')
submission['jumlah_promosi'] = grid_search.predict(submission_data)
submission.to_csv('../submissions/submission_simple_10.csv', index=False)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:pendapatan: object, terakhir_belanja: object, belanja_buah: object, belanja_daging: object, belanja_ikan: object, belanja_kue: object, pembelian_diskon: object, pembelian_web: object, pembelian_toko: object, pendidikan_encoded: object, status_pernikahan_encoded: object, tahun_kelahiran_binned: object