**Link: https://www.kaggle.com/competitions/playground-series-s3e4**

In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


**Import Data**

In [3]:
train_data = pd.read_csv('train.csv', index_col = 'id')
test_data = pd.read_csv('test.csv', index_col = 'id')
train_data['Source'] = 'train'; test_data['Source'] = 'test'
train_x, test_x = train_data.drop('Class', axis = 1), test_data
all_x = pd.concat([train_x, test_x])

**EDA**

In [4]:
def plt_data(data: pd.DataFrame, ncol, nrow):
    fig, axs = plt.subplots(nrow, ncol, figsize = (20, 50))
    for feature, ax in zip(data.columns.drop('Source'), axs.flatten()):
        #print(feature)
        train_feature, test_feature = data.loc[data['Source'] == 'train', feature], data.loc[data['Source'] == 'test', feature]
        sns.kdeplot(data = train_feature, fill = True, ax = ax, common_grid = True)
        sns.kdeplot(data = test_feature, fill = True, ax = ax, common_grid = True)
        ax.set_title(f'Feature-{feature}'); ax.grid(visible = True, which = 'major')
    plt.subplots_adjust(wspace = 0.2, hspace = 0.3)
    plt.show()

**Compare Target Feature**

In [5]:
train_data['Class'].describe()

count    219129.000000
mean          0.002140
std           0.046214
min           0.000000
25%           0.000000
50%           0.000000
75%           0.000000
max           1.000000
Name: Class, dtype: float64

In [15]:
positive_account = len(train_data['Class'][train_data.Class == 1])
negative_account = len(train_data['Class'][train_data.Class == 0])
positive_rate = positive_account/(positive_account + negative_account)
print(f"Positive Data samples: {positive_account}", '\n', f"Negative Data samples: {negative_account}", '\n', "Positive Rate: {:.2%}".format(positive_rate), sep = '')

Positive Data samples: 469
Negative Data samples: 218660
Positive Rate: 0.21%


Transform Time Feature

In [16]:
all_x.Time = all_x.Time % (24 * 3600)
train_x = all_x.loc[all_x['Source'] == 'train', 'Time':'Amount']

**Model**

In [28]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, RocCurveDisplay


from sklearn.model_selection import StratifiedKFold # defend with the unbalance positive data!!!(re-sampling)
from xgboost import XGBClassifier


# because target class rate is very low, so use (strtified)Kfold algorithm to get efficient model,
# and there has another way to deal with the unbalanced dataset -> anomaly detection(semi-supervised learning method)


def score(name, model, tra_x, tra_y, cro_x, cro_y):
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    kfold = StratifiedKFold(n_splits = 6, shuffle = True, random_state = 5)
    for tra, cro in kfold.split(tra_x, tra_y):
        pipe.fit(tra_x.iloc[tra], tra_y.iloc[tra])
    pre_y = pipe.predict_proba(cro_x)[:, 1]
    print(f'model {name} score:', roc_auc_score(cro_y, pre_y))
    return pipe

tra_x, cro_x, tra_y, cro_y = train_test_split(train_x, train_data['Class'], test_size = 0.2)

In [36]:
# Logistic Regression
log_model = LogisticRegression(random_state = 5, max_iter = 500)
ran_model = RandomForestClassifier(n_estimators = 200, max_depth = 8, random_state = 5, n_jobs = -1, max_features = 'log2')
xgb_model = XGBClassifier(max_leaves = 2, max_depth = 12, n_jobs = -1, random_state = 5)
#score("Logistic Regression", log_model, tra_x, tra_y, cro_x, cro_y)
#score('Random Forest Classifier', ran_model, tra_x, tra_y, cro_x, cro_y)
score('XGB Classifier', xgb_model, tra_x, tra_y, cro_x, cro_y)

model XGB Classifier score: 0.7989221199373127


In [59]:
model = score("Logistic Regression", log_model, tra_x, tra_y, cro_x, cro_y)
y_pre = model.predict_proba(test_x.drop('Source', axis = 1))[:,1]
final_db = pd.DataFrame({'id':test_x.index,'Class':y_pre})
final_db.to_csv('final.csv', index_label = 'id', index = False)

model Logistic Regression score: 0.8449650085841728


Final Score -> 0.817