# Importing Libraries

In [449]:
# Pake yang ini
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
import xgboost as xgb

from sklearn.model_selection import train_test_split


from fast_ml.model_development import train_valid_test_split


# Data Extraction

In [450]:
TRAIN_FEATURE_PATH = r'../../Datasets/train_features.csv'
TRAIN_LABEL_PATH = r'../../Datasets/train_labels.csv'
TEST_PATH = r'../../Datasets/test_features.csv'
SAMPLE_SUBMISSION_PATH = r"../../Datasets/submission_format.csv"

In [451]:
train_feature_dat = pd.read_csv(TRAIN_FEATURE_PATH)
train_label_dat = pd.read_csv(TRAIN_LABEL_PATH)
test_dat = pd.read_csv(TEST_PATH)

In [452]:
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

# Data Prep


In [453]:
#drop tanggal_menjadi_anggota & Belanjaan
train_feature_dat = train_feature_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})
test_dat = test_dat.drop(columns={'tanggal_menjadi_anggota', 'belanja_buah', 'belanja_daging', 'belanja_ikan', 'belanja_kue'})

## Null Handling

In [454]:
# fill all null values
train_feature_dat.fillna(train_feature_dat.mean(), inplace=True)
test_dat.fillna(test_dat.mean(), inplace=True)

train_feature_dat.fillna(train_feature_dat.mode().iloc[0], inplace=True)
test_dat.fillna(test_dat.mode().iloc[0], inplace=True)
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tahun_kelahiran     3817 non-null   int64  
 1   pendidikan          3817 non-null   object 
 2   status_pernikahan   3817 non-null   object 
 3   pendapatan          3817 non-null   float64
 4   jumlah_anak_balita  3817 non-null   float64
 5   jumlah_anak_remaja  3817 non-null   float64
 6   terakhir_belanja    3817 non-null   float64
 7   belanja_buah        3817 non-null   float64
 8   belanja_daging      3817 non-null   float64
 9   belanja_ikan        3817 non-null   float64
 10  belanja_kue         3817 non-null   float64
 11  pembelian_diskon    3817 non-null   float64
 12  pembelian_web       3817 non-null   float64
 13  pembelian_toko      3817 non-null   float64
 14  keluhan             3817 non-null   float64
dtypes: float64(12), int64(1), object(2)
memory usage: 447.4

## Outlier Handling

In [455]:
#windsorizer
def windsorize_by_percentage(data, lower_percentile, upper_percentile):
    lower_bound = np.percentile(data, lower_percentile)
    upper_bound = np.percentile(data, upper_percentile)
    windsorized_data = []
    for value in data:
        if value < lower_bound:
            windsorized_data.append(lower_bound)
        elif value > upper_bound:
            windsorized_data.append(upper_bound)
        else:
            windsorized_data.append(value)

    return windsorized_data

# Specify lower and upper percentiles
lower_percentile = 10
upper_percentile = 90

for column in train_feature_dat.select_dtypes(include=np.number):
    train_feature_dat[column] = windsorize_by_percentage(train_feature_dat[column], lower_percentile, upper_percentile)
    test_dat[column] = windsorize_by_percentage(test_dat[column], lower_percentile, upper_percentile)



## Encoding

### One hot encoding

In [456]:
# # Perform one-hot encoding
# train_dat = pd.get_dummies(train_dat, columns=['attribute_0', 'attribute_1'])
# test_dat = pd.get_dummies(test_dat, columns=['attribute_0', 'attribute_1'])

### Label Encoding

In [457]:
label_encoder = LabelEncoder()
train_feature_dat['pendidikan_encoded'] = label_encoder.fit_transform(train_feature_dat['pendidikan'])
train_feature_dat =train_feature_dat.drop(columns='pendidikan')
test_dat['pendidikan_encoded'] = label_encoder.fit_transform(test_dat['pendidikan'])
test_dat =test_dat.drop(columns='pendidikan')

train_feature_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(train_feature_dat['status_pernikahan'])
train_feature_dat =train_feature_dat.drop(columns='status_pernikahan')
test_dat['status_pernikahan_encoded'] = label_encoder.fit_transform(test_dat['status_pernikahan'])
test_dat =test_dat.drop(columns='status_pernikahan')


# Feature Engineering

## Binning Tahun Kelahiran

In [458]:
# Define bin edges and labels
bin_edges = [1890, 1920, 1940, 1960, 1980, 2000, 2010]
bin_labels = ['0', '1', '2', '3', '4', '5']

# Perform binning
train_feature_dat['tahun_kelahiran_binned'] = pd.cut(train_feature_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
train_feature_dat.drop(columns='tahun_kelahiran', inplace=True)
train_feature_dat['tahun_kelahiran_binned'] = train_feature_dat['tahun_kelahiran_binned'].astype('int')
test_dat['tahun_kelahiran_binned'] = pd.cut(test_dat['tahun_kelahiran'], bins=bin_edges, labels=bin_labels)
test_dat.drop(columns='tahun_kelahiran', inplace=True)
test_dat['tahun_kelahiran_binned'] = test_dat['tahun_kelahiran_binned'].astype('int')

# SPLIT TRAIN AND TEST

In [459]:
# Train test split
# Train test split
X_train, X_test, y_train, y_test = train_test_split(train_feature_dat, train_label_dat, test_size=0.3, random_state=42)


# FEATURE SCALING

In [460]:
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [461]:
print(X_train.shape)
print(X_test.shape)

(2671, 15)
(1146, 15)


#  MODEL

In [462]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    print("F1 Score  :", f1_score(y_true, y_pred, average='macro'))

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    for model in models:
        m = model
        m.fit(X_train, y_train)
        y_pred = m.predict(X_test)
        print(model.__class__.__name__, models[model])
        metrics(y_test, y_pred)

## KNN

In [463]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred = knn.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.4989660235833643


## Gradient Boost

In [464]:
gdb = GradientBoostingClassifier()
gdb.fit(X_train, y_train)

y_pred = gdb.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.5375259635363211


## XGBoost

In [465]:
xgboost = xgb.XGBClassifier(objective='binary:logistic',random_state=42, )
xgboost.fit(X_train, y_train)

y_pred = xgboost.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.6513704231099195


# Kaggle Submission


In [466]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)
test_dat.drop(columns='ID', inplace=True)
submission['jumlah_promosi'] = xgboost.predict(test_dat)
submission.to_csv('../submissions/testing_4.csv', index=False)

In [470]:
SUBMIT_PATH = '../submissions/submission_2.csv'

csv = pd.read_csv(SUBMIT_PATH)

In [471]:
category_counts = csv['jumlah_promosi'].value_counts().sort_index()
category_counts

0    1120
1     490
2     310
3     488
4     531
5     617
6     262
Name: jumlah_promosi, dtype: int64