# Importing Libraries

In [299]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder
from catboost import CatBoostClassifier
import xgboost as xgb

from sklearn.model_selection import train_test_split


from fast_ml.model_development import train_valid_test_split


# Data Extraction

In [300]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [301]:
train_feature_dat = pd.read_csv(TRAIN_FEATURE_PATH)
train_label_dat = pd.read_csv(TRAIN_LABEL_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [302]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

# Data Prep


In [303]:
#drop tanggal_menjadi_anggota & Belanjaan
train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})
test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})

## Null Handling

In [304]:
# # fill all null values
# train_feature_dat.fillna(train_feature_dat.mean(), inplace=True)
# test_dat.fillna(test_dat.mean(), inplace=True)

# train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
# test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
# train_feature_dat.info()

### KNN Imputer

In [305]:
categorical_columns = train_feature_dat.select_dtypes(include=['object']).columns

# Encode categorical features into numerical format
encoder = OrdinalEncoder()
train_feature_dat[categorical_columns] = encoder.fit_transform(train_feature_dat[categorical_columns])

# Apply KNN imputer to impute missing values
imputer = KNNImputer(n_neighbors=5)
train_feature_dat = pd.DataFrame(imputer.fit_transform(train_feature_dat), columns=train_feature_dat.columns)

# Decode the imputed numerical values back to categorical values
train_feature_dat[categorical_columns] = encoder.inverse_transform(train_feature_dat[categorical_columns].astype(int))


In [306]:
# Mengambil kolom 'ID' dari df_test
test_dat_id = test_dat['ID']

# Menghapus kolom 'ID' dari test_dat
test_dat = test_dat.drop('ID', axis=1)

# Encode categorical features into numerical format
encoder = OrdinalEncoder()
test_dat[categorical_columns] = encoder.fit_transform(test_dat[categorical_columns])

# Apply KNN imputer to impute missing values
imputer = KNNImputer(n_neighbors=5)
test_dat = pd.DataFrame(imputer.fit_transform(test_dat), columns=test_dat.columns)

# Decode the imputed numerical values back to categorical values
test_dat[categorical_columns] = encoder.inverse_transform(test_dat[categorical_columns].astype(int))

# Menggabungkan kembali kolom 'ID' dengan data yang telah diimputasi
test_dat = pd.concat([test_dat_id, test_dat], axis=1)

## Outlier Handling

In [307]:
#windsorizer
def windsorize_by_percentage(data, lower_percentile, upper_percentile):
    lower_bound = np.percentile(data, lower_percentile)
    upper_bound = np.percentile(data, upper_percentile)
    windsorized_data = []
    for value in data:
        if value < lower_bound:
            windsorized_data.append(lower_bound)
        elif value > upper_bound:
            windsorized_data.append(upper_bound)
        else:
            windsorized_data.append(value)

    return windsorized_data

# Specify lower and upper percentiles
lower_percentile = 10
upper_percentile = 90

for column in train_feature_dat.select_dtypes(include=np.number):
    train_feature_dat[column] = windsorize_by_percentage(train_feature_dat[column], lower_percentile, upper_percentile)
    test_dat[column] = windsorize_by_percentage(test_dat[column], lower_percentile, upper_percentile)



## Encoding

### One hot encoding

In [308]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

### Label Encoding

In [309]:
label_encoder = LabelEncoder()
train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
train_feature_dat =train_feature_dat.drop(columns='pendidikan')
test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
test_dat =test_dat.drop(columns='pendidikan')

train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
test_dat =test_dat.drop(columns='status_pernikahan')


# Feature Engineering

## Binning Tahun Kelahiran

In [310]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
train_feature_dat['tahun_kelahiran_binned'] = train_feature_dat['tahun_kelahiran_binned'].astype('int')
test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
test_dat.drop(columns='tahun_kelahiran', inplace=True)
test_dat['tahun_kelahiran_binned'] = test_dat['tahun_kelahiran_binned'].astype('int')

# SPLIT TRAIN AND TEST

In [311]:
# Train test split
# Train test split
X_train, X_test, y_train, y_test = train_test_split(train_feature_dat, train_label_dat, test_size=0.3, random_state=42)


# FEATURE SCALING

In [312]:
# scaler = MinMaxScaler()
# X_train = scaler.fit_transform(X_train)
# X_test = scaler.transform(X_test)

In [313]:
print(X_train.shape)
print(X_test.shape)

(2671, 11)
(1146, 11)


#  MODEL

In [314]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    print("F1 Score  :", f1_score(y_true, y_pred, average='macro'))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## KNN

In [315]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.3605781328247498


## Gradient Boost

In [316]:
gdb = GradientBoostingClassifier()
gdb.fit(X_train, y_train)

y_pred = gdb.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.5273509831776079


## XGBoost

In [317]:
xgboost = xgb.XGBClassifier(objective='binary:logistic',random_state=42 )
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.6190785698787161


## CatBoost

In [318]:
# Define CatBoost classifier
catboost = CatBoostClassifier(iterations=1000, depth=6, learning_rate=0.1, loss_function='MultiClass')

# Train the catboost
catboost.fit(X_train, y_train, eval_set=(X_test, y_test), early_stopping_rounds=50, verbose=100)

# Predict on test set
y_pred = catboost.predict(X_test)
metrics(y_test, y_pred)


0:	learn: 1.9073559	test: 1.9129905	best: 1.9129905 (0)	total: 9.06ms	remaining: 9.06s
100:	learn: 1.0358080	test: 1.3563170	best: 1.3563170 (100)	total: 605ms	remaining: 5.39s
200:	learn: 0.7429585	test: 1.2198814	best: 1.2198814 (200)	total: 1.2s	remaining: 4.78s
300:	learn: 0.5745997	test: 1.1676279	best: 1.1676279 (300)	total: 1.83s	remaining: 4.24s
400:	learn: 0.4619222	test: 1.1355235	best: 1.1354944 (399)	total: 2.41s	remaining: 3.6s
500:	learn: 0.3802125	test: 1.1228300	best: 1.1225758 (499)	total: 2.98s	remaining: 2.97s
600:	learn: 0.3232447	test: 1.1144476	best: 1.1137875 (590)	total: 3.65s	remaining: 2.42s
700:	learn: 0.2750729	test: 1.1090772	best: 1.1090772 (700)	total: 4.27s	remaining: 1.82s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 1.106200615
bestIteration = 739

Shrink model to first 740 iterations.
F1 Score  : 0.5995452003798684


# Kaggle Submission


In [319]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
# test_dat.drop(columns='ID', inplace=True)
submission['jumlah_promosi'] = catboost.predict(test_dat)
submission.to_csv('../submissions/testing_4.csv', index=False)

In [320]:
SUBMIT_PATH = '../submissions/testing_4.csv'

csv = pd.read_csv(SUBMIT_PATH)

In [321]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts

0    1196
1     348
2     297
3     449
4     578
5     688
6     262
Name: jumlah_promosi, dtype: int64