# Importing Libraries

In [69]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split


from fast_ml.model_development import train_valid_test_split


# Data Extraction

In [70]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [71]:
train_feature_dat = pd.read_csv(TRAIN_FEATURE_PATH)
train_label_dat = pd.read_csv(TRAIN_LABEL_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [72]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

# Data Prep


In [73]:
train_feature_dat = pd.merge(train_feature_dat, train_label_dat, left_index=True, right_index=True)

## Dropping Irrelevant features

In [74]:
train_feature_dat = train_feature_dat[['tahun_kelahiran', 'pendapatan', 'terakhir_belanja', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue', 'pembelian_toko', 'jumlah_promosi']]
test_dat = test_dat[['tahun_kelahiran', 'pendapatan', 'terakhir_belanja', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue', 'pembelian_toko']]

In [75]:
# #drop tanggal_menjadi_anggota & Belanjaan
# train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})
# test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})

## Null Handling

### Mean for Numerical, Mode for Categorical

In [76]:
# # fill all null values with mean and mode
# train_feature_dat.fillna(train_feature_dat.mean(), inplace=True)
# test_dat.fillna(test_dat.mean(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### Median for Numerical, Mode for Categorical

In [77]:
# fill all null values with median and mode
train_feature_dat.fillna(train_feature_dat.median(), inplace=True)
test_dat.fillna(test_dat.median(), inplace=True)

train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   tahun_kelahiran   3817 non-null   int64  
 1   pendapatan        3817 non-null   float64
 2   terakhir_belanja  3817 non-null   float64
 3   belanja_buah      3817 non-null   float64
 4   belanja_daging    3817 non-null   float64
 5   belanja_ikan      3817 non-null   float64
 6   belanja_kue       3817 non-null   float64
 7   pembelian_toko    3817 non-null   float64
 8   jumlah_promosi    3817 non-null   int64  
dtypes: float64(7), int64(2)
memory usage: 268.5 KB


### KNN Imputer

In [78]:
# categorical_columns = train_feature_dat.select_dtypes(include=['object']).columns

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# train_feature_dat[categorical_columns] = encoder.fit_transform(train_feature_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# train_feature_dat = pd.DataFrame(imputer.fit_transform(train_feature_dat), columns=train_feature_dat.columns)

# # Decode the imputed numerical values back to categorical values
# train_feature_dat[categorical_columns] = encoder.inverse_transform(train_feature_dat[categorical_columns].astype(int))


In [79]:
# # Mengambil kolom 'ID' dari df_test
# test_dat_id = test_dat['ID']

# # Menghapus kolom 'ID' dari test_dat
# test_dat = test_dat.drop('ID', axis=1)

# # Encode categorical features into numerical format
# encoder = OrdinalEncoder()
# test_dat[categorical_columns] = encoder.fit_transform(test_dat[categorical_columns])

# # Apply KNN imputer to impute missing values
# imputer = KNNImputer(n_neighbors=5)
# test_dat = pd.DataFrame(imputer.fit_transform(test_dat), columns=test_dat.columns)

# # Decode the imputed numerical values back to categorical values
# test_dat[categorical_columns] = encoder.inverse_transform(test_dat[categorical_columns].astype(int))

# # Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
# test_dat = pd.concat([test_dat_id, test_dat], axis=1)

## Outlier Handling

### Windsorizer

In [80]:
# #windsorizer
# def windsorize_by_percentage(data, lower_percentile, upper_percentile):
#     lower_bound = np.percentile(data, lower_percentile)
#     upper_bound = np.percentile(data, upper_percentile)
#     windsorized_data = []
#     for value in data:
#         if value < lower_bound:
#             windsorized_data.append(lower_bound)
#         elif value > upper_bound:
#             windsorized_data.append(upper_bound)
#         else:
#             windsorized_data.append(value)

#     return windsorized_data

# # Specify lower and upper percentiles
# lower_percentile = 10
# upper_percentile = 90

# for column in train_feature_dat.select_dtypes(include=np.number):
#     train_feature_dat[column] = windsorize_by_percentage(train_feature_dat[column], lower_percentile, upper_percentile)
#     test_dat[column] = windsorize_by_percentage(test_dat[column], lower_percentile, upper_percentile)



### IQR

In [81]:
def handle_outliers_iqr(data):
    # Calculate quartiles
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    
    # Calculate lower and upper bounds
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Handle outliers
    # Replace outliers with the upper or lower bound
    data[data < lower_bound] = lower_bound
    data[data > upper_bound] = upper_bound
    
    return data

for column in train_feature_dat.select_dtypes(include=np.number):
    if column != 'jumlah_promosi':
        train_feature_dat[column] = handle_outliers_iqr(train_feature_dat[column])
        test_dat[column] = handle_outliers_iqr(test_dat[column])


## Encoding

### One hot encoding

In [82]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

### Label Encoding

In [83]:
# label_encoder = LabelEncoder()
# train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
# train_feature_dat =train_feature_dat.drop(columns='pendidikan')
# test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
# test_dat =test_dat.drop(columns='pendidikan')

# train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
# train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
# test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
# test_dat =test_dat.drop(columns='status_pernikahan')


In [84]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# Feature Engineering

## Binning Tahun Kelahiran

In [85]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
train_feature_dat['tahun_kelahiran_binned'] = train_feature_dat['tahun_kelahiran_binned'].astype('int')

test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
test_dat.drop(columns='tahun_kelahiran', inplace=True)
test_dat['tahun_kelahiran_binned'] = test_dat['tahun_kelahiran_binned'].astype('int')

## Binning Terakhir Belanja

In [86]:
# Choose the number of bins
num_bins = 5

# Bin the data using equal-width binning
train_feature_dat['terakhir_belanja_bins'] = pd.cut(train_feature_dat['terakhir_belanja'], bins=num_bins, labels=False)
# train_feature_dat.drop(columns='terakhir_belanja', inplace=True)

test_dat['terakhir_belanja_bins'] = pd.cut(test_dat['terakhir_belanja'], bins=num_bins, labels=False)
# test_dat.drop(columns='terakhir_belanja', inplace=True)

In [87]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 10 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   pendapatan              3817 non-null   float64
 1   terakhir_belanja        3817 non-null   float64
 2   belanja_buah            3817 non-null   float64
 3   belanja_daging          3817 non-null   float64
 4   belanja_ikan            3817 non-null   float64
 5   belanja_kue             3817 non-null   float64
 6   pembelian_toko          3817 non-null   float64
 7   jumlah_promosi          3817 non-null   int64  
 8   tahun_kelahiran_binned  3817 non-null   int32  
 9   terakhir_belanja_bins   3817 non-null   int64  
dtypes: float64(7), int32(1), int64(2)
memory usage: 283.4 KB


In [88]:
# train_feature_dat.to_csv('../../Datasets/cleaned.csv', index=False)

# SPLIT TRAIN AND TEST

In [89]:
# Train test split
X = train_feature_dat.drop(columns='jumlah_promosi')
y = train_feature_dat['jumlah_promosi']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


# Oversampling

In [90]:
from imblearn.over_sampling import SMOTE

# Apply SMOTE for oversampling
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# FEATURE SCALING

In [91]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [92]:
print(X_train.shape)
print(X_test.shape)

(4823, 9)
(1146, 9)


#  MODEL

In [93]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    print("F1 Score  :", f1_score(y_true, y_pred, average='macro'))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## KNN

In [94]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.47735508125164877


## Gradient Boost

In [95]:
gdb = GradientBoostingClassifier()
gdb.fit(X_train, y_train)

y_pred = gdb.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.5497467648338119


## XGBoost

In [96]:
xgboost = xgb.XGBClassifier(objective='binary:logistic',random_state=42 )
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.6581792555621321


## CatBoost

In [97]:
# Define CatBoost classifier
catboost = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass')

# Train the catboost
catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

# Predict on test set
y_pred = catboost.predict(X_test)
metrics(y_test, y_pred)


0:	learn: 1.9062422	test: 1.9182394	best: 1.9182394 (0)	total: 7.95ms	remaining: 7.94s


100:	learn: 1.0489492	test: 1.3834391	best: 1.3834391 (100)	total: 719ms	remaining: 6.4s
200:	learn: 0.7605273	test: 1.2402519	best: 1.2402519 (200)	total: 1.36s	remaining: 5.39s
300:	learn: 0.5933080	test: 1.1709016	best: 1.1709016 (300)	total: 1.99s	remaining: 4.63s
400:	learn: 0.4799615	test: 1.1289557	best: 1.1289557 (400)	total: 2.63s	remaining: 3.93s
500:	learn: 0.3972790	test: 1.1033948	best: 1.1033948 (500)	total: 3.31s	remaining: 3.29s
600:	learn: 0.3372517	test: 1.0842124	best: 1.0841225 (598)	total: 4.02s	remaining: 2.67s
700:	learn: 0.2869673	test: 1.0707682	best: 1.0707682 (700)	total: 4.75s	remaining: 2.03s
800:	learn: 0.2483200	test: 1.0644468	best: 1.0644468 (800)	total: 5.5s	remaining: 1.37s
900:	learn: 0.2185451	test: 1.0573533	best: 1.0573533 (900)	total: 6.23s	remaining: 685ms
999:	learn: 0.1942228	test: 1.0545300	best: 1.0539978 (984)	total: 7s	remaining: 0us

bestTest = 1.053997806
bestIteration = 984

Shrink model to first 985 iterations.
F1 Score  : 0.6441090981

## RF Tuned

In [68]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()

#deafult
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

grid_search = GridSearchCV(rf_model, param_grid, cv=5, scoring='f1_macro')

grid_search.fit(X_train, y_train)

print("Parameter Terbaik:", grid_search.best_params_)

y_pred_grid = grid_search.predict(X_test)
f1_macro_grid = f1_score(y_test, y_pred_grid, average='macro')
print("F1-score Macro setelah GridSearchCV:", f1_macro_grid)


KeyboardInterrupt: 

# Kaggle Submission


In [34]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# test_dat.drop(columns='ID', inplace=True)
submission['jumlah_promosi'] = grid_search.predict(test_dat)
submission.to_csv('../submissions/test_gridRF.csv', index=False)

In [37]:
SUBMIT_PATH = '../submissions/submission_oversampling.csv'

csv = pd.read_csv(SUBMIT_PATH)

In [38]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts

0    951
1    432
2    396
3    519
4    565
5    587
6    368
Name: jumlah_promosi, dtype: int64

In [36]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts

0    886
1    420
2    378
3    535
4    585
5    633
6    381
Name: jumlah_promosi, dtype: int64