In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from imblearn.over_sampling import SMOTENC
from imblearn.over_sampling import SMOTENC
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

# 1. Load Data

#### one-hot编码后的特征数为49
x_train: (1572164, 49)
   - fraud_bool--- 0:1=786082:786082
    
x_test: (357908, 49)
   - fraud_bool--- 0:1=196530:2192

#### label encoding后的特征数为28
x_train: (1572164, 28)
   - fraud_bool--- 0:1=786082:786082

x_test: (357908, 28)
    - fraud_bool--- 0:1=196530:2192

In [14]:
train_set_resampled= pd.read_csv('./datasets/base_train_resampled.csv')
x_train_resampled= train_set_resampled.drop(columns=['fraud_bool'])
y_train_resampled= train_set_resampled['fraud_bool']

test_set= pd.read_csv('./datasets/base_test_scaled.csv')
x_test= test_set.drop(columns=['fraud_bool'])
y_test= test_set['fraud_bool']

train_set = pd.read_csv('./datasets/base_train_scaled.csv')
x_train_scaled = train_set.drop(columns=['fraud_bool'])
y_train = train_set['fraud_bool']

# 2. Modeling
使用未经过重采样的数据集进行建模，作为baseline model，并与之后重采样的模型进行对比。

准备使用的模型：Logistic Regression，Random Forest，XGBoost

## 2.1 Logistic Regression

In [17]:
# 逻辑回归分类器
log_reg = LogisticRegression()
log_reg.fit(x_train_scaled, y_train)
y_pred = log_reg.predict(x_test)
print('test set:Scaled Classification Report: \n', classification_report(y_test, y_pred,digits=4))

log_reg2 = LogisticRegression()
log_reg2.fit(x_train_resampled, y_train_resampled)
y_pred2 = log_reg2.predict(x_test)
print('test set:Resampled Classification Report: \n', classification_report(y_test, y_pred2,digits=4))

y_train_pred=log_reg2.predict(x_train_resampled)
print('train set:Resampled Classification Report: \n', classification_report(y_train_resampled, y_train_pred,digits=4))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


test set:Scaled Classification Report: 
               precision    recall  f1-score   support

           0     0.9890    0.9999    0.9945    196530
           1     0.5000    0.0055    0.0108      2192

    accuracy                         0.9890    198722
   macro avg     0.7445    0.5027    0.5026    198722
weighted avg     0.9836    0.9890    0.9836    198722



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


test set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.9949    0.8674    0.9268    196530
           1     0.0484    0.6049    0.0897      2192

    accuracy                         0.8645    198722
   macro avg     0.5217    0.7362    0.5082    198722
weighted avg     0.9845    0.8645    0.9176    198722

train set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.8887    0.8671    0.8778    786082
           1     0.8703    0.8914    0.8807    786082

    accuracy                         0.8793   1572164
   macro avg     0.8795    0.8793    0.8792   1572164
weighted avg     0.8795    0.8793    0.8792   1572164



## 2.2 Random Forest

In [18]:
# 未采样
rf = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1, class_weight='balanced', max_depth=10)
rf.fit(x_train_scaled, y_train)
y_pred = rf.predict(x_test)
print('test set:Scaled Classification Report: \n', classification_report(y_test, y_pred,digits=4))

y_train_pred=rf.predict(x_train_scaled)
print('train set:Scaled Classification Report: \n', classification_report(y_train, y_train_pred,digits=4))

test set:Scaled Classification Report: 
               precision    recall  f1-score   support

           0     0.9959    0.8747    0.9314    196530
           1     0.0569    0.6779    0.1050      2192

    accuracy                         0.8725    198722
   macro avg     0.5264    0.7763    0.5182    198722
weighted avg     0.9856    0.8725    0.9223    198722

train set:Scaled Classification Report: 
               precision    recall  f1-score   support

           0     0.9975    0.8756    0.9326    786082
           1     0.0673    0.8017    0.1242      8803

    accuracy                         0.8748    794885
   macro avg     0.5324    0.8386    0.5284    794885
weighted avg     0.9872    0.8748    0.9236    794885



In [26]:
rf2 = RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1,max_depth=10)
rf2.fit(x_train_resampled, y_train_resampled)
y_pred2 = rf2.predict(x_test)
print('test set:Resampled Classification Report: \n', classification_report(y_test, y_pred2,digits=4))

y_train_pred=rf2.predict(x_train_resampled)
print('train set:Resampled Classification Report: \n', classification_report(y_train_resampled, y_train_pred,digits=4))

test set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.9946    0.9147    0.9530    196530
           1     0.0677    0.5552    0.1206      2192

    accuracy                         0.9107    198722
   macro avg     0.5311    0.7349    0.5368    198722
weighted avg     0.9844    0.9107    0.9438    198722

train set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.9236    0.9148    0.9192    786082
           1     0.9156    0.9244    0.9200    786082

    accuracy                         0.9196   1572164
   macro avg     0.9196    0.9196    0.9196   1572164
weighted avg     0.9196    0.9196    0.9196   1572164



## 2.3 XGBoost


In [24]:
import xgboost as xgb

# 重采样前的数据集
# objective="binary:logistic" 二分类问题
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(x_train_scaled, y_train)
y_pred = xgb_model.predict(x_test)
print('test set:Scaled Classification Report: \n', classification_report(y_test, y_pred,digits=4))

y_train_pred=xgb_model.predict(x_train_scaled)
print('train set:Scaled Classification Report: \n', classification_report(y_train, y_train_pred,digits=4))

test set:Scaled Classification Report: 
               precision    recall  f1-score   support

           0     0.9894    0.9995    0.9944    196530
           1     0.4681    0.0401    0.0739      2192

    accuracy                         0.9889    198722
   macro avg     0.7287    0.5198    0.5342    198722
weighted avg     0.9837    0.9889    0.9843    198722

train set:Scaled Classification Report: 
               precision    recall  f1-score   support

           0     0.9900    0.9998    0.9948    786082
           1     0.8160    0.0937    0.1681      8803

    accuracy                         0.9897    794885
   macro avg     0.9030    0.5467    0.5815    794885
weighted avg     0.9880    0.9897    0.9857    794885



In [23]:
# 重采样后的数据集
xgb_model2 = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model2.fit(x_train_resampled, y_train_resampled)
y_pred2 = xgb_model2.predict(x_test)
print('test set:Resampled Classification Report: \n', classification_report(y_test, y_pred2,digits=4))

y_train_pred=xgb_model2.predict(x_train_resampled)
print('train set:Resampled Classification Report: \n', classification_report(y_train_resampled, y_train_pred,digits=4))

test set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.9909    0.9923    0.9916    196530
           1     0.2085    0.1811    0.1938      2192

    accuracy                         0.9834    198722
   macro avg     0.5997    0.5867    0.5927    198722
weighted avg     0.9822    0.9834    0.9828    198722

train set:Resampled Classification Report: 
               precision    recall  f1-score   support

           0     0.9859    0.9928    0.9894    786082
           1     0.9928    0.9858    0.9893    786082

    accuracy                         0.9893   1572164
   macro avg     0.9893    0.9893    0.9893   1572164
weighted avg     0.9893    0.9893    0.9893   1572164

