# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

# Load the transactions from the CSV file. The code below relies on a
# "Class" column in which 1 marks a fraudulent (anomalous) record and 0 a
# normal one.
data = pd.read_csv("creditcard.csv")

# Under-sample the majority class so that both classes end up equally small.

# Index labels of the minority (fraud) class and how many there are.
fraud_indices = np.array(data[data.Class == 1].index)
number_records_fraud = len(fraud_indices)

# Index labels of the normal class.
normal_indices = data[data.Class == 0].index

# Randomly pick, without replacement, as many normal records as there are
# fraud records. np.random.choice already returns an ndarray, so no extra
# conversion is needed.
# NOTE(review): no seed is set in this section, so the sampled subset (and
# every score computed from it) varies from run to run.
random_normal_indices = np.random.choice(
    normal_indices, number_records_fraud, replace=False
)

# Combine the Class == 1 labels with the sampled Class == 0 labels.
under_sample_indices = np.concatenate([fraud_indices, random_normal_indices])

# Build the under-sampled dataset. The values above come from data.index,
# i.e. they are labels, so select with .loc; .iloc would only give the same
# rows while the index happens to be the default 0..n-1 range.
under_sample_data = data.loc[under_sample_indices, :]

# Features chosen for removal after inspecting their visualizations:
# V8, V13, V15, the contiguous run V20..V28, and Time.
droplist = ["V8", "V13", "V15", *[f"V{i}" for i in range(20, 29)], "Time"]

# Remove those columns from the under-sampled dataset.
data_new = under_sample_data.drop(columns=droplist)

# Split into training and test sets.
# X holds every column except "Class"; y holds only the "Class" column and
# stays a one-column DataFrame because it is selected with a column mask.
X = data_new.loc[:, data_new.columns != "Class"]
y = data_new.loc[:, data_new.columns == "Class"]

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split itself repeatable for a given input.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

# Build the models.
# Each classifier is fitted on the training split and then predicts labels
# for X_test; the resulting *_yhat arrays are what gets scored afterwards.

# scikit-learn expects a 1-D label vector and emits DataConversionWarning
# when handed a column vector, so flatten the labels once up front.
# np.ravel works whether y_train is a one-column DataFrame or a Series.
y_train_1d = np.ravel(y_train)

# 1. Decision tree (entropy criterion, depth limited to 4)
tree_model = DecisionTreeClassifier(max_depth=4, criterion="entropy")
tree_model.fit(X_train, y_train_1d)
tree_yhat = tree_model.predict(X_test)


# 2. k-nearest neighbours (k = 5)
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train_1d)
knn_yhat = knn.predict(X_test)


# 3. Logistic regression (library defaults)
lr = LogisticRegression()
lr.fit(X_train, y_train_1d)
lr_yhat = lr.predict(X_test)


# 4. Support vector machine (library defaults)
svm = SVC()
svm.fit(X_train, y_train_1d)
svm_yhat = svm.predict(X_test)


# 5. Random forest (100 trees)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train_1d)
rf_yhat = rf.predict(X_test)


# 6. XGBoost (library defaults)
xgb = XGBClassifier()
xgb.fit(X_train, y_train_1d)
xgb_yhat = xgb.predict(X_test)

# Report test-set accuracy for every model, one line per model, in the same
# order the models were fitted.
for model_label, model_yhat in (
    ("tree", tree_yhat),
    ("knn", knn_yhat),
    ("lr", lr_yhat),
    ("svm", svm_yhat),
    ("rf", rf_yhat),
    ("xgb", xgb_yhat),
):
    print(f"{model_label} accuracy: {accuracy_score(y_test, model_yhat)}")