# Importing Libraries

In [62]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from fast_ml.model_development import train_valid_test_split


# Data Extraction

In [63]:
# Locations of the competition files, relative to this notebook.
TRAIN_FEATURE_PATH = "../../Datasets/train_features.csv"
TRAIN_LABEL_PATH = "../../Datasets/train_labels.csv"
TEST_PATH = "../../Datasets/test_features.csv"
SAMPLE_SUBMISSION_PATH = "../../Datasets/submission_format.csv"

In [64]:
# Read the training features, training labels and test features in one pass;
# the unpacking order mirrors the order of the paths below.
train_feature_dat, train_label_dat, test_dat = (
    pd.read_csv(csv_path)
    for csv_path in (TRAIN_FEATURE_PATH, TRAIN_LABEL_PATH, TEST_PATH)
)

In [65]:
# Summarize the dtypes and non-null counts of the raw training features.
train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 16 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   tahun_kelahiran          3817 non-null   int64  
 1   pendidikan               3628 non-null   object 
 2   status_pernikahan        3605 non-null   object 
 3   pendapatan               3627 non-null   float64
 4   jumlah_anak_balita       3627 non-null   float64
 5   jumlah_anak_remaja       3613 non-null   float64
 6   terakhir_belanja         3645 non-null   float64
 7   belanja_buah             3636 non-null   float64
 8   belanja_daging           3639 non-null   float64
 9   belanja_ikan             3624 non-null   float64
 10  belanja_kue              3603 non-null   float64
 11  pembelian_diskon         3639 non-null   float64
 12  pembelian_web            3652 non-null   float64
 13  pembelian_toko           3648 non-null   float64
 14  keluhan                 

## Data Prep


In [66]:
# Remove the tanggal_menjadi_anggota column from both frames so it is not
# passed to the model as a feature.
train_feature_dat = train_feature_dat.drop('tanggal_menjadi_anggota', axis=1)
test_dat = test_dat.drop('tanggal_menjadi_anggota', axis=1)


## Null Handling

In [67]:
# Impute (not drop) missing numeric values with the column mean. Object
# columns are left untouched here and keep their nulls.
#
# numeric_only=True is required because the frames still contain object
# columns; without it pandas >= 2.0 raises a TypeError in DataFrame.mean().
#
# The test set is filled with the *training* means so that both frames go
# through the same preprocessing statistics.
numeric_means = train_feature_dat.mean(numeric_only=True)
train_feature_dat = train_feature_dat.fillna(numeric_means)
test_dat = test_dat.fillna(numeric_means)

train_feature_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3817 entries, 0 to 3816
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tahun_kelahiran     3817 non-null   int64  
 1   pendidikan          3628 non-null   object 
 2   status_pernikahan   3605 non-null   object 
 3   pendapatan          3817 non-null   float64
 4   jumlah_anak_balita  3817 non-null   float64
 5   jumlah_anak_remaja  3817 non-null   float64
 6   terakhir_belanja    3817 non-null   float64
 7   belanja_buah        3817 non-null   float64
 8   belanja_daging      3817 non-null   float64
 9   belanja_ikan        3817 non-null   float64
 10  belanja_kue         3817 non-null   float64
 11  pembelian_diskon    3817 non-null   float64
 12  pembelian_web       3817 non-null   float64
 13  pembelian_toko      3817 non-null   float64
 14  keluhan             3817 non-null   float64
dtypes: float64(12), int64(1), object(2)
memory usage: 447.4

## Encoding

### One hot encoding

In [68]:
# One-hot encoding (disabled: the label-encoding cell below is used instead).
# train_feature_dat = pd.get_dummies(train_feature_dat, columns=['pendidikan', 'status_pernikahan'])
# test_dat = pd.get_dummies(test_dat, columns=['pendidikan', 'status_pernikahan'])

In [69]:
# Check the dtypes and remaining null counts of the test features before encoding.
test_dat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3818 entries, 0 to 3817
Data columns (total 15 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   tahun_kelahiran     3818 non-null   int64  
 1   pendidikan          3614 non-null   object 
 2   status_pernikahan   3636 non-null   object 
 3   pendapatan          3818 non-null   float64
 4   jumlah_anak_balita  3818 non-null   float64
 5   jumlah_anak_remaja  3818 non-null   float64
 6   terakhir_belanja    3818 non-null   float64
 7   belanja_buah        3818 non-null   float64
 8   belanja_daging      3818 non-null   float64
 9   belanja_ikan        3818 non-null   float64
 10  belanja_kue         3818 non-null   float64
 11  pembelian_diskon    3818 non-null   float64
 12  pembelian_web       3818 non-null   float64
 13  pembelian_toko      3818 non-null   float64
 14  keluhan             3818 non-null   float64
dtypes: float64(12), int64(1), object(2)
memory usage: 447.5

### Label Encoding

In [70]:
# Label-encode the categorical columns with ONE encoder per column shared by
# the training and test frames. Re-fitting the encoder on each frame separately
# can assign different integer codes to the same category, which silently
# corrupts predictions on the test set.
CATEGORICAL_COLUMNS = ['pendidikan', 'status_pernikahan']

# Nulls become an explicit category instead of being passed to LabelEncoder as
# NaN (mixed str/float input is not supported by every scikit-learn version).
# NOTE(review): assumes no real category is literally named 'missing'.
MISSING_CATEGORY = 'missing'

label_encoders = {}
for column in CATEGORICAL_COLUMNS:
    train_values = train_feature_dat[column].fillna(MISSING_CATEGORY)
    test_values = test_dat[column].fillna(MISSING_CATEGORY)

    # Fit on the combined vocabulary so transform() never meets an unseen label.
    label_encoder = LabelEncoder()
    label_encoder.fit(pd.concat([train_values, test_values]))

    train_feature_dat[f'{column}_encoded'] = label_encoder.transform(train_values)
    test_dat[f'{column}_encoded'] = label_encoder.transform(test_values)

    # Keep the fitted encoder so the integer codes can be mapped back later.
    label_encoders[column] = label_encoder

# Drop the raw text columns now that their encoded counterparts exist.
train_feature_dat = train_feature_dat.drop(columns=CATEGORICAL_COLUMNS)
test_dat = test_dat.drop(columns=CATEGORICAL_COLUMNS)


# EDA (Exploratory Data Analysis)

# SPLIT TRAIN AND TEST

In [71]:
# Train test split
# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_feature_dat,
    train_label_dat,
    test_size=0.3,
    random_state=42,
)


In [72]:
# Display the preprocessed training features (pandas truncates the long frame).
train_feature_dat

Unnamed: 0,tahun_kelahiran,pendapatan,jumlah_anak_balita,jumlah_anak_remaja,terakhir_belanja,belanja_buah,belanja_daging,belanja_ikan,belanja_kue,pembelian_diskon,pembelian_web,pembelian_toko,keluhan,pendidikan_encoded,status_pernikahan_encoded
0,1979,1.144832e+08,0.00000,1.000000,47.23155,50575.0,260967.0,50575.000000,20230.00000,2.0,2.0,5.0,0.0,5,4
1,1950,8.406300e+07,0.29308,0.353723,70.00000,6069.0,44506.0,80920.000000,20230.00000,9.0,6.0,4.0,0.0,5,4
2,1966,1.275326e+08,0.00000,0.000000,45.00000,117611.0,265460.0,96341.000000,145573.00000,1.0,1.0,7.0,0.0,5,3
3,1961,1.655796e+08,0.00000,0.000000,90.00000,206346.0,1613901.0,27725.000000,125868.00000,0.0,7.0,8.0,0.0,2,4
4,1970,1.177032e+08,1.00000,1.000000,78.00000,90563.0,311757.0,40358.000000,33875.00000,7.0,6.0,5.0,0.0,5,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3812,1955,7.819947e+07,0.00000,0.000000,33.00000,6069.0,25977.0,3856.000000,5784.00000,5.0,1.0,0.0,0.0,2,3
3813,1947,1.093060e+08,0.00000,1.000000,44.00000,0.0,50575.0,81428.997792,0.00000,3.0,6.0,3.0,0.0,1,4
3814,1974,1.046210e+08,0.00000,2.000000,68.00000,2023.0,62713.0,8092.000000,0.00000,7.0,5.0,7.0,0.0,2,3
3815,1957,1.108500e+08,1.00000,1.000000,67.00000,18207.0,70805.0,24276.000000,63377.97058,4.0,5.0,4.0,0.0,3,4


# FEATURE SCALING

In [73]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-feature min/max from the training split only, then apply that
# same fitted scaler to both splits.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [74]:
# Sanity-check the row/column counts of the scaled splits, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(2671, 15)
(1146, 15)


#  MODEL

In [75]:
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import statsmodels.api as sm

def metrics(y_true, y_pred):
    """Print the macro-averaged F1 score of ``y_pred`` against ``y_true``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    Returns:
        None. The score is only written to stdout.
    """
    macro_f1 = f1_score(y_true, y_pred, average='macro')
    print("F1 Score  :", macro_f1)

def train_eval_models(models: dict, X_train, X_test, y_train, y_test):
    """Fit each estimator in ``models`` and print its macro F1 on the test split.

    Args:
        models: Mapping whose keys are estimator objects exposing ``fit`` and
            ``predict``; the value stored under each key is printed next to
            the estimator's class name.
        X_train: Features used to fit every estimator.
        X_test: Features used to generate predictions.
        y_train: Labels used to fit every estimator.
        y_test: Labels the predictions are scored against.

    Returns:
        None. Results are only written to stdout.
    """
    for estimator, label in models.items():
        estimator.fit(X_train, y_train)
        predictions = estimator.predict(X_test)
        print(estimator.__class__.__name__, label)
        metrics(y_test, predictions)

### Gradient Boost

In [76]:
# A fixed seed makes the fitted ensemble (and the reported score) reproducible,
# matching the seeded train/test split.
gdb = GradientBoostingClassifier(random_state=42)

# y_train comes from splitting the label DataFrame, so it is 2-D; flatten it to
# the 1-D shape scikit-learn classifiers expect (avoids a DataConversionWarning).
# NOTE(review): assumes the label file holds a single target column -- confirm.
gdb.fit(X_train, np.ravel(y_train))

y_pred = gdb.predict(X_test)
metrics(y_test, y_pred)

F1 Score  : 0.5622729758556305


### Kaggle Submission


In [79]:
submission = pd.read_csv(SAMPLE_SUBMISSION_PATH)

# Select exactly the feature columns, in the same order, that the training
# frame has. This also discards any non-feature column (e.g. an identifier)
# and fails loudly if a feature is missing from the test frame.
submission_data = test_dat[train_feature_dat.columns]

# The classifier was fitted on MinMax-scaled features, so the test features
# must go through the same fitted scaler before prediction; predicting on the
# raw values would compare unscaled inputs against thresholds learned in [0, 1].
submission['jumlah_promosi'] = gdb.predict(scaler.transform(submission_data))
submission.to_csv('../submissions/submission_1.csv', index=False)