In [93]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer


In [94]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [95]:
df_train_feature = pd.read_csv(TRAIN_FEATURE_PATH)
df_train_label = pd.read_csv(TRAIN_LABEL_PATH)
df_test = pd.read_csv(TEST_PATH)

In [96]:
df_train_feature

Unnamed: 0,tahun_kelahiran,pendidikan,status_pernikahan,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,tanggal_menjadi_anggota
0,1979,Sarjana,Rencana Menikah,,0.0,1.0,,50575.0,260967.0,50575.0,20230.0,2.0,2.0,5.0,0.0,2014-05-05
1,1950,Sarjana,Rencana Menikah,84063000.0,,,70.0,6069.0,44506.0,80920.0,20230.0,9.0,6.0,4.0,0.0,2013-03-17
2,1966,Sarjana,Menikah,127532564.0,0.0,0.0,45.0,117611.0,265460.0,96341.0,145573.0,1.0,1.0,7.0,0.0,
3,1961,Magister,Rencana Menikah,165579620.0,0.0,0.0,90.0,206346.0,1613901.0,27725.0,125868.0,0.0,7.0,8.0,0.0,
4,1970,Sarjana,Rencana Menikah,117703159.0,1.0,1.0,78.0,90563.0,311757.0,40358.0,33875.0,7.0,6.0,5.0,0.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3812,1955,Magister,Menikah,78199470.0,0.0,0.0,33.0,6069.0,25977.0,3856.0,5784.0,5.0,1.0,0.0,0.0,
3813,1947,Doktor,Rencana Menikah,109306000.0,0.0,1.0,44.0,0.0,50575.0,,0.0,3.0,6.0,3.0,0.0,2014-06-09
3814,1974,Magister,Menikah,104621000.0,0.0,2.0,68.0,2023.0,62713.0,8092.0,0.0,7.0,5.0,7.0,0.0,2013-11-07
3815,1957,SMA,Rencana Menikah,110850000.0,1.0,1.0,67.0,18207.0,70805.0,24276.0,,4.0,5.0,4.0,0.0,2013-06-30


In [97]:
df_train_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

In [98]:
df_train_label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 1 columns):
 #   Column          Non-Null Count  Dtype
---  ------          --------------  -----
 0   jumlah_promosi  3817 non-null   int64
dtypes: int64(1)
memory usage: 29.9 KB


https://datascience.stackexchange.com/questions/26581/should-i-impute-target-values

Cleaning


In [99]:
df_train_feature = df_train_feature.drop(columns='tanggal_menjadi_anggota')
df_test = df_test.drop(columns='tanggal_menjadi_anggota')


In [100]:
label_encoder = LabelEncoder()
df_train_feature['pendidikan_encoded'] = label_encoder.fit_transform(df_train_feature['pendidikan'])
df_train_feature =df_train_feature.drop(columns='pendidikan')
df_test['pendidikan_encoded'] = label_encoder.fit_transform(df_test['pendidikan'])
df_test =df_test.drop(columns='pendidikan')

df_train_feature['status_pernikahan_encoded'] = label_encoder.fit_transform(df_train_feature['status_pernikahan'])
df_train_feature =df_train_feature.drop(columns='status_pernikahan')
df_test['status_pernikahan_encoded'] = label_encoder.fit_transform(df_test['status_pernikahan'])
df_test =df_test.drop(columns='status_pernikahan')


In [101]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   ID                         3818 non-null   int64  
 1   tahun_kelahiran            3818 non-null   int64  
 2   pendapatan                 3615 non-null   float64
 3   jumlah_anak_balita         3609 non-null   float64
 4   jumlah_anak_remaja         3608 non-null   float64
 5   terakhir_belanja           3617 non-null   float64
 6   belanja_buah               3632 non-null   float64
 7   belanja_daging             3623 non-null   float64
 8   belanja_ikan               3622 non-null   float64
 9   belanja_kue                3631 non-null   float64
 10  pembelian_diskon           3639 non-null   float64
 11  pembelian_web              3638 non-null   float64
 12  pembelian_toko             3632 non-null   float64
 13  keluhan                    3625 non-null   float

In [102]:
df_train_feature.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   tahun_kelahiran            3817 non-null   int64  
 1   pendapatan                 3627 non-null   float64
 2   jumlah_anak_balita         3627 non-null   float64
 3   jumlah_anak_remaja         3613 non-null   float64
 4   terakhir_belanja           3645 non-null   float64
 5   belanja_buah               3636 non-null   float64
 6   belanja_daging             3639 non-null   float64
 7   belanja_ikan               3624 non-null   float64
 8   belanja_kue                3603 non-null   float64
 9   pembelian_diskon           3639 non-null   float64
 10  pembelian_web              3652 non-null   float64
 11  pembelian_toko             3648 non-null   float64
 12  keluhan                    3621 non-null   float64
 13  pendidikan_encoded         3817 non-null   int32

In [103]:
# Membuat objek KNNImputer
imputer = KNNImputer(n_neighbors=13)

# Mengisi nilai yang hilang dalam data pelatihan
df_train_features_imputed = pd.DataFrame(imputer.fit_transform(df_train_feature), columns=df_train_feature.columns)



# Mengambil kolom 'ID' dari df_test
df_test_id = df_test['ID']

# Menghapus kolom 'ID' dari df_test
df_test_features = df_test.drop('ID', axis=1)

# Melakukan imputasi nilai yang hilang kecuali pada kolom 'ID'
df_test_features_imputed = pd.DataFrame(imputer.transform(df_test_features), columns=df_test_features.columns)

# Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
df_test_imputed = pd.concat([df_test_id, df_test_features_imputed], axis=1)



In [104]:
df_train_features_imputed

Unnamed: 0,tahun_kelahiran,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,pendidikan_encoded,status_pernikahan_encoded
0,1979.0,1.089153e+08,0.000000,1.000000,40.923077,50575.0,260967.0,50575.000000,20230.000000,2.0,2.0,5.0,0.0,5.0,4.0
1,1950.0,8.406300e+07,0.538462,0.230769,70.000000,6069.0,44506.0,80920.000000,20230.000000,9.0,6.0,4.0,0.0,5.0,4.0
2,1966.0,1.275326e+08,0.000000,0.000000,45.000000,117611.0,265460.0,96341.000000,145573.000000,1.0,1.0,7.0,0.0,5.0,3.0
3,1961.0,1.655796e+08,0.000000,0.000000,90.000000,206346.0,1613901.0,27725.000000,125868.000000,0.0,7.0,8.0,0.0,2.0,4.0
4,1970.0,1.177032e+08,1.000000,1.000000,78.000000,90563.0,311757.0,40358.000000,33875.000000,7.0,6.0,5.0,0.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3812,1955.0,7.819947e+07,0.000000,0.000000,33.000000,6069.0,25977.0,3856.000000,5784.000000,5.0,1.0,0.0,0.0,2.0,3.0
3813,1947.0,1.093060e+08,0.000000,1.000000,44.000000,0.0,50575.0,8796.307692,0.000000,3.0,6.0,3.0,0.0,1.0,4.0
3814,1974.0,1.046210e+08,0.000000,2.000000,68.000000,2023.0,62713.0,8092.000000,0.000000,7.0,5.0,7.0,0.0,2.0,3.0
3815,1957.0,1.108500e+08,1.000000,1.000000,67.000000,18207.0,70805.0,24276.000000,11048.461538,4.0,5.0,4.0,0.0,3.0,4.0


In [105]:
X_train, X_test, y_train, y_test = train_test_split(df_train_features_imputed, df_train_label, test_size=0.2, random_state=42)


In [106]:
# Hitung nilai kuartil pertama (Q1) dan kuartil ketiga (Q3) untuk setiap fitur dalam data pelatihan
Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)

# Hitung rentang interkuartil (IQR) untuk setiap fitur
IQR = Q3 - Q1

# Tentukan batas bawah dan batas atas untuk mendefinisikan outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Temukan indeks baris yang mengandung outlier
outlier_indices = ((X_train < lower_bound) | (X_train > upper_bound)).any(axis=1)

# Hapus baris yang mengandung outlier dari data pelatihan dan label yang sesuai
X_train = X_train[~outlier_indices]
y_train = y_train[~outlier_indices]

# Cetak jumlah outlier yang dihapus
print("Jumlah outlier yang dihapus:", sum(outlier_indices))


Jumlah outlier yang dihapus: 636


In [107]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)

X_test = scaler.transform(X_test)


In [108]:
print(X_test.shape)
print(X_train.shape)


(764, 15)
(2417, 15)


In [109]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)


from sklearn.ensemble import GradientBoostingClassifier

gb_model = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)




  return fit_method(estimator, *args, **kwargs)
  y = column_or_1d(y, warn=True)


In [110]:
from sklearn.metrics import f1_score

# Evaluasi kinerja Random Forest Classifier
f1_macro_rf = f1_score(y_test, y_pred_rf, average='macro')
print("F1-score Macro untuk Random Forest Classifier:", f1_macro_rf)

# Evaluasi kinerja Gradient Boosting Classifier
f1_macro_gb = f1_score(y_test, y_pred_gb, average='macro')
print("F1-score Macro untuk Gradient Boosting Classifier:", f1_macro_gb)


F1-score Macro untuk Random Forest Classifier: 0.6268064105594037
F1-score Macro untuk Gradient Boosting Classifier: 0.5109914221280191


In [111]:
# from sklearn.model_selection import cross_val_score
# from sklearn.ensemble import RandomForestClassifier
# # 
# # Inisialisasi model Random Forest Classifier
# rf_model = RandomForestClassifier()

# # Lakukan cross-validation dengan F1-score macro sebagai metrik evaluasi
# f1_scores = cross_val_score(rf_model, X_train, y_train, cv=10, scoring='f1_macro')

# # Cetak hasil cross-validation
# print("F1-scores setiap fold:", f1_scores)
# print("Rata-rata F1-score:", f1_scores.mean())


Tuning


In [112]:
# from sklearn.model_selection import GridSearchCV
# from sklearn.ensemble import RandomForestClassifier

# rf_model = RandomForestClassifier()

# #deafult
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

# grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='f1_macro')

# grid_search.fit(X_train, y_train)

# print("Parameter Terbaik:", grid_search.best_params_)

# y_pred_grid = grid_search.predict(X_test)
# f1_macro_grid = f1_score(y_test, y_pred_grid, average='macro')
# print("F1-score Macro setelah GridSearchCV:", f1_macro_grid)


In [113]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
submission_data = df_test_imputed.drop(columns='ID')
submission['jumlah_promosi'] = rf_model.predict(submission_data)
submission.to_csv('../submissions/submission_2.csv', index=False)