In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [3]:
df = pd.read_csv('datasets/Base.csv')
df

Unnamed: 0,fraud_bool,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,payment_type,zip_count_4w,...,has_other_cards,proposed_credit_limit,foreign_request,source,session_length_in_minutes,device_os,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0,0.3,0.986506,-1,25,40,0.006735,102.453711,AA,1059,...,0,1500.0,0,INTERNET,16.224843,linux,1,1,0,0
1,0,0.8,0.617426,-1,89,20,0.010095,-0.849551,AD,1658,...,0,1500.0,0,INTERNET,3.363854,other,1,1,0,0
2,0,0.8,0.996707,9,14,40,0.012316,-1.490386,AB,1095,...,0,200.0,0,INTERNET,22.730559,windows,0,1,0,0
3,0,0.6,0.475100,11,14,30,0.006991,-1.863101,AB,3483,...,0,200.0,0,INTERNET,15.215816,linux,1,1,0,0
4,0,0.9,0.842307,-1,29,40,5.742626,47.152498,AA,2339,...,0,200.0,0,INTERNET,3.743048,other,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0,0.8,0.124690,-1,143,30,0.051348,-0.826239,AB,530,...,0,1500.0,0,INTERNET,16.967770,other,0,1,0,7
999996,0,0.9,0.824544,-1,193,30,0.009591,0.008307,AC,408,...,1,1000.0,0,INTERNET,1.504109,macintosh,0,1,0,7
999997,0,0.8,0.140891,-1,202,10,0.059287,50.609995,AA,749,...,0,200.0,0,INTERNET,16.068595,other,0,1,0,7
999998,0,0.9,0.002480,52,3,30,0.023357,-1.313387,AB,707,...,0,200.0,0,INTERNET,1.378683,linux,1,1,0,7


In [4]:
# 查看fraud bool具体情况
df['fraud_bool'].value_counts()

fraud_bool
0    988971
1     11029
Name: count, dtype: int64

In [7]:
# 查找缺失值
df.isnull().sum()

fraud_bool                          0
income                              0
name_email_similarity               0
prev_address_months_count           0
current_address_months_count        0
customer_age                        0
days_since_request                  0
intended_balcon_amount              0
payment_type                        0
zip_count_4w                        0
velocity_6h                         0
velocity_24h                        0
velocity_4w                         0
bank_branch_count_8w                0
date_of_birth_distinct_emails_4w    0
employment_status                   0
credit_risk_score                   0
email_is_free                       0
housing_status                      0
phone_home_valid                    0
phone_mobile_valid                  0
bank_months_count                   0
has_other_cards                     0
proposed_credit_limit               0
foreign_request                     0
source                              0
session_leng

In [8]:
# 筛选出数值型特征作为输入特征
X = df.select_dtypes(include=[np.number])
# 删去fraud_bool列
X = X.drop('fraud_bool', axis=1)
X

Unnamed: 0,income,name_email_similarity,prev_address_months_count,current_address_months_count,customer_age,days_since_request,intended_balcon_amount,zip_count_4w,velocity_6h,velocity_24h,...,phone_mobile_valid,bank_months_count,has_other_cards,proposed_credit_limit,foreign_request,session_length_in_minutes,keep_alive_session,device_distinct_emails_8w,device_fraud_count,month
0,0.3,0.986506,-1,25,40,0.006735,102.453711,1059,13096.035018,7850.955007,...,1,9,0,1500.0,0,16.224843,1,1,0,0
1,0.8,0.617426,-1,89,20,0.010095,-0.849551,1658,9223.283431,5745.251481,...,1,2,0,1500.0,0,3.363854,1,1,0,0
2,0.8,0.996707,9,14,40,0.012316,-1.490386,1095,4471.472149,5471.988958,...,1,30,0,200.0,0,22.730559,0,1,0,0
3,0.6,0.475100,11,14,30,0.006991,-1.863101,3483,14431.993621,6755.344479,...,1,1,0,200.0,0,15.215816,1,1,0,0
4,0.9,0.842307,-1,29,40,5.742626,47.152498,2339,7601.511579,5124.046930,...,1,26,0,200.0,0,3.743048,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,0.8,0.124690,-1,143,30,0.051348,-0.826239,530,6732.602414,3010.048099,...,1,31,0,1500.0,0,16.967770,0,1,0,7
999996,0.9,0.824544,-1,193,30,0.009591,0.008307,408,1574.293294,2716.495767,...,1,-1,1,1000.0,0,1.504109,0,1,0,7
999997,0.8,0.140891,-1,202,10,0.059287,50.609995,749,1258.864938,3601.322892,...,1,31,0,200.0,0,16.068595,0,1,0,7
999998,0.9,0.002480,52,3,30,0.023357,-1.313387,707,7048.137128,6521.395012,...,1,1,0,200.0,0,1.378683,1,1,0,7


In [9]:
# 划分训练集和测试集
x_train, x_test, y_train, y_test = train_test_split(X, df['fraud_bool'], test_size=0.2, random_state=0)

# 标准化
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)


In [10]:
# 逻辑回归
lr = LogisticRegression()
lr.fit(x_train_scaled, y_train)
y_test_pred = lr.predict(x_test_scaled)
y_train_pred = lr.predict(x_train_scaled)

# classification_report
print('classification_report on train set\n', classification_report(y_train, y_train_pred))
print('classification_report on test set\n', classification_report(y_test, y_test_pred))

classification_report on train set
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    791170
           1       0.63      0.00      0.01      8830

    accuracy                           0.99    800000
   macro avg       0.81      0.50      0.50    800000
weighted avg       0.99      0.99      0.98    800000

classification_report on test set
               precision    recall  f1-score   support

           0       0.99      1.00      0.99    197801
           1       0.55      0.00      0.01      2199

    accuracy                           0.99    200000
   macro avg       0.77      0.50      0.50    200000
weighted avg       0.98      0.99      0.98    200000



In [None]:
# svm
from sklearn.svm import SVC
svc = SVC(random_state=0, kernel='rbf')
svc.fit(x_train_scaled, y_train)
y_test_pred = svc.predict(x_test_scaled)
y_train_pred = svc.predict(x_train_scaled)

# classification_report
print('classification_report on train set\n', classification_report(y_train, y_train_pred))
print('classification_report on test set\n', classification_report(y_test, y_test_pred))