# 数据预处理

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the raw insurance-claims table from the project's Data directory.
data = pd.read_csv('./Data/insurance_claims.csv')
# Trailing expression: shows the (rows, columns) tuple as the cell output.
data.shape

(1000, 40)

## 删除特征

In [3]:
def missing_values_table(df):
    """Summarise which columns of ``df`` contain missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` whose missing-value percentage is
        non-zero, indexed by column name, with the missing count and the
        missing percentage (rounded to one decimal), sorted by percentage
        in descending order.

    Side effects
    ------------
    Prints a two-line summary. The reported feature count is
    ``df.shape[1] - 1``, i.e. one column is excluded from the count
    (presumably the target column -- confirm with the caller).
    """
    count_label = "缺失值数量"
    share_label = "缺失值占比"

    # Missing cells per column, and the same figure as a percentage of all rows.
    null_counts = df.isnull().sum()
    null_share = 100 * null_counts / len(df)

    # Put both series side by side under their final column names.
    summary = pd.DataFrame({count_label: null_counts, share_label: null_share})

    # Keep only columns that have missing values, largest share first.
    has_missing = summary.iloc[:, 1] != 0
    summary = summary[has_missing]
    summary = summary.sort_values(share_label, ascending=False)
    summary = summary.round(1)

    # Short textual report.
    feature_total = df.shape[1] - 1
    affected_total = summary.shape[0]
    print(f"输入的对象一共有{feature_total}个特征.\n其中一共有{affected_total}个特征具有缺失值.")

    return summary

In [4]:
# Build the missing-value summary for the raw table (the call also prints a
# short report) and display it as the cell output.
mis_val_table = missing_values_table(data)
mis_val_table

输入的对象一共有39个特征.
其中一共有1个特征具有缺失值.


Unnamed: 0,缺失值数量,缺失值占比
_c39,1000,100.0


In [5]:
# Every value in the _c39 column is NaN, so the column carries no information
# and is removed.
data = data.drop(columns=['_c39'])

In [6]:
# Drop columns that are not used as model features: the policy number
# (policy_number), the insured's zip code (insured_zip), the exact incident
# location (incident_location), the two date columns (policy_bind_date,
# incident_date), and insured_occupation.
data.drop(columns=['incident_location','policy_bind_date','incident_date',
                   'insured_occupation','policy_number','insured_zip'],
          inplace=True)

## 分类变量编码

In [7]:
data.shape

(1000, 33)

In [8]:
# Encode the target column as integers: 'Y' -> 1, 'N' -> 0.
# With a nested dict, DataFrame.replace applies the inner mapping only to the
# column named by the outer key.
label_dict = {'fraud_reported':{'Y':1,'N':0}}
data = data.replace(label_dict)

In [9]:
# Names of all columns that still have object dtype; these are the categorical
# features that get integer-encoded in the following cell.
cal_columns = list(data.select_dtypes(include='object').columns)
len(cal_columns)

16

In [10]:
# Integer-encode each categorical column with its own freshly fitted LabelEncoder.
# The fitted encoders are discarded, so the code -> category mapping is not kept.
encoded_columns = {name: LabelEncoder().fit_transform(data[name]) for name in cal_columns}
data = data.assign(**encoded_columns)

In [12]:
# The 9 continuous variables; they should be standardised only after oversampling.
continous_columns = ['months_as_customer','age','policy_annual_premium','capital-gains',
                     'capital-loss','total_claim_amount','injury_claim','property_claim','vehicle_claim']
len(continous_columns)

9

## 离散变量处理

In [13]:
# The 7 remaining numeric columns, listed under the notebook's
# "discrete variables" section.
rest_columns = ['bodily_injuries','policy_deductable','witnesses','number_of_vehicles_involved',
                'umbrella_limit','auto_year','incident_hour_of_the_day']
len(rest_columns)

7

**umbrella_limit 异常值处理及编码**

In [14]:
Counter(data['umbrella_limit'])  # the counts show -1000000 is an outlier; it is changed to 1000000

Counter({0: 798,
         5000000: 46,
         6000000: 57,
         4000000: 39,
         3000000: 12,
         8000000: 8,
         7000000: 29,
         9000000: 5,
         10000000: 2,
         -1000000: 1,
         2000000: 3})

In [16]:
# Repair the outlier seen in the value counts: the single negative umbrella
# limit (-1000000) is replaced by its absolute value (1000000).
# The rows are selected by value rather than by the hard-coded row label 290,
# so the cell stays correct if the row order or index of the CSV changes.
negative_limit = data['umbrella_limit'] < 0
data.loc[negative_limit, 'umbrella_limit'] = data.loc[negative_limit, 'umbrella_limit'].abs()

In [17]:
# umbrella_limit is an ordinal variable: express it in units of one million and
# store the result as integers.
data['umbrella_limit'] = data['umbrella_limit'].div(1000000).astype(int)

**policy_deductable编码**

In [18]:
Counter(data['policy_deductable'])

Counter({1000: 351, 2000: 307, 500: 342})

In [20]:
# Map the three deductible amounts (500 / 1000 / 2000) onto the ordinal codes
# 0 / 1 / 2; the nested dict restricts the replacement to policy_deductable.
mapping_dict = {'policy_deductable':{500:0,1000:1,2000:2}}
data = data.replace(mapping_dict)

In [21]:
data.columns

Index(['months_as_customer', 'age', 'policy_state', 'policy_csl',
       'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
       'insured_sex', 'insured_education_level', 'insured_hobbies',
       'insured_relationship', 'capital-gains', 'capital-loss',
       'incident_type', 'collision_type', 'incident_severity',
       'authorities_contacted', 'incident_state', 'incident_city',
       'incident_hour_of_the_day', 'number_of_vehicles_involved',
       'property_damage', 'bodily_injuries', 'witnesses',
       'police_report_available', 'total_claim_amount', 'injury_claim',
       'property_claim', 'vehicle_claim', 'auto_make', 'auto_model',
       'auto_year', 'fraud_reported'],
      dtype='object')

In [22]:
# From here on the target column is called 'label'.
data = data.rename(columns={'fraud_reported': 'label'})

In [20]:
# Persist the preprocessed table (without the index column) for the later
# splitting / modelling sections.
data.to_csv('./Data/insurance_claims_predeal.csv', index=False, encoding='utf-8')

# 数据划分

In [13]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.preprocessing import LabelEncoder,StandardScaler
import random

In [14]:
# Reload the preprocessed table written by the preprocessing section.
data = pd.read_csv('./Data/insurance_claims_predeal.csv')
# Trailing expression: shows the (rows, columns) tuple as the cell output.
data.shape

(1000, 33)

In [15]:
Counter(data.label)

Counter({1: 247, 0: 753})

**数据集划分2**

测试集：正样本 47 + 负样本 148，正样本占比 0.241  
剩余数据：正样本 200 + 负样本 605，用于五折交叉验证，每一折的划分如下：   
* 验证集：正样本 40 + 负样本 121，正样本占比 0.248
* 训练集：正样本 160 + 负样本 484，正样本占比 0.248

In [16]:
import numpy as np
import random
import os
def set_seed(seed=42):
    """Seed the global Python and NumPy random number generators.

    Parameters
    ----------
    seed : int or int-convertible, default 42
        Seed value; it is passed through ``int()`` before use.

    Notes
    -----
    ``PYTHONHASHSEED`` is also exported. CPython reads that variable when the
    interpreter starts, so setting it here does not change string hashing in
    the running kernel; only processes started afterwards see the value.
    """
    seed = int(seed)
    random.seed(seed)
    # Seeding NumPy once is enough (the previous version called this twice).
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [17]:
# Reproducible, class-stratified hold-out split.
set_seed(seed=42)

# Shuffle the row labels of each class separately. The lists hold index labels
# and are later used positionally with .iloc; the two coincide here because the
# freshly loaded frame has the default 0..n-1 index.
fraud_index = data.index[data.label == 1].tolist()
random.shuffle(fraud_index)
nofraud_index = data.index[data.label == 0].tolist()
random.shuffle(nofraud_index)

# Test set: the first 47 fraud rows and the first 148 non-fraud rows, re-shuffled.
testmask = fraud_index[:47] + nofraud_index[:148]
random.shuffle(testmask)
data_test = data.iloc[testmask].copy(deep=True).reset_index(drop=True)

# Everything else is kept for cross-validation / training, re-shuffled as well.
rest_mask = fraud_index[47:] + nofraud_index[148:]
random.shuffle(rest_mask)
data_rest = data.iloc[rest_mask].copy(deep=True).reset_index(drop=True)

In [18]:
def evaluate(clf,X_train,Y_train,X_test,Y_test):
    """Fit ``clf`` on the training data and score it on the evaluation data.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``. It is
        fitted in place.
    X_train, Y_train : array-like
        Training features and binary labels (1 = positive class).
    X_test, Y_test : array-like
        Evaluation features and binary labels.

    Returns
    -------
    dict
        Metrics in this insertion order: ``'AUC'``, ``'Recall'``, ``'F1'``,
        ``'GM'``.
    """
    import numpy as np
    from sklearn.metrics import f1_score, recall_score, roc_auc_score

    def GM(y_true, y_pred):
        """Return sqrt(recall * true-negative rate).

        The result is NaN when ``y_true`` lacks one of the two classes,
        because the corresponding rate is a mean over an empty selection.
        """
        # Coerce to arrays so boolean masking also works for plain lists.
        y_true = np.asarray(y_true)
        y_pred = np.asarray(y_pred)
        recall = (y_pred[y_true == 1] == 1).mean()
        tnr = (y_pred[y_true == 0] == 0).mean()
        return np.sqrt(recall * tnr)

    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_test)
    # Column 1 of predict_proba is taken as the positive-class score.
    y_proba = clf.predict_proba(X_test)[:, 1]

    evaluate_dict = {}
    evaluate_dict['AUC'] = roc_auc_score(Y_test, y_proba)
    evaluate_dict['Recall'] = recall_score(Y_test, y_pred, average='binary', pos_label=1)
    evaluate_dict['F1'] = f1_score(Y_test, y_pred, average='binary', pos_label=1)
    evaluate_dict['GM'] = GM(Y_test, y_pred)

    return evaluate_dict

In [19]:
data_rest[:3]

Unnamed: 0,months_as_customer,age,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_sex,insured_education_level,insured_hobbies,...,witnesses,police_report_available,total_claim_amount,injury_claim,property_claim,vehicle_claim,auto_make,auto_model,auto_year,label
0,254,45,0,1,0,1083.64,0,1,3,13,...,0,0,79680,13280,13280,53120,2,0,2004,0
1,289,45,2,1,2,1221.41,0,0,5,12,...,1,1,2700,300,300,2100,6,6,2006,0
2,172,35,0,0,2,1219.04,0,1,4,13,...,0,1,79750,14500,14500,50750,9,29,1999,0


# 网格调参

In [20]:
from imblearn.over_sampling import ADASYN,SMOTE
from xgboost.sklearn import XGBClassifier
from sdv.tabular import CTGAN
from sdv.sampling import Condition

In [21]:
def train_valid(data_rest,clf,osmethod,Normlization=True,n_splits=5):
    """Cross-validate ``clf`` with minority-class oversampling on each training fold.

    Parameters
    ----------
    data_rest : pandas.DataFrame
        Data to split into folds. It must have a binary ``label`` column as
        its LAST column and a default 0..n-1 index (row labels are used
        positionally with ``.iloc``).
    clf : estimator
        Classifier handed to ``evaluate``; it is refitted on every fold.
    osmethod : {'ADASYN', 'SMOTE', 'CTGAN'}
        Oversampling method applied to the training part of each fold only.
    Normlization : bool, default True
        If true, standardise features with a ``StandardScaler`` fitted on the
        oversampled training data and apply the same scaler to the
        validation fold.
    n_splits : int, default 5
        Number of folds. Each validation fold takes ``len(class) // n_splits``
        rows per class (40 fraud + 121 non-fraud for the 200/605 split used in
        this notebook); leftover rows always stay in the training part.

    Returns
    -------
    list of list
        One entry per fold: the values of the dict returned by ``evaluate``,
        in that dict's order.

    Raises
    ------
    ValueError
        If ``osmethod`` is not one of the supported names.
    """
    if osmethod not in ('ADASYN', 'SMOTE', 'CTGAN'):
        raise ValueError("osmethod must be 'ADASYN', 'SMOTE' or 'CTGAN', got %r" % (osmethod,))

    metrics = []
    rest_nofraud_mask = data_rest[data_rest.label == 0].index.tolist()
    rest_fraud_mask = data_rest[data_rest.label == 1].index.tolist()
    # Per-class validation fold sizes, derived from the data instead of hard-coded.
    fraud_fold = len(rest_fraud_mask) // n_splits
    nofraud_fold = len(rest_nofraud_mask) // n_splits

    for k in range(n_splits):
        validmask = (rest_fraud_mask[k*fraud_fold:(k+1)*fraud_fold]
                     + rest_nofraud_mask[k*nofraud_fold:(k+1)*nofraud_fold])
        trainmask = list(set(rest_fraud_mask+rest_nofraud_mask)-set(validmask))
        data_train = data_rest.iloc[trainmask].copy(deep=True)
        data_valid = data_rest.iloc[validmask].copy(deep=True)
        data_train.index = range(data_train.shape[0])
        data_valid.index = range(data_valid.shape[0])

        # Oversample the training fold only; the validation fold stays untouched.
        if osmethod == 'ADASYN':
            X_train = data_train.values[:,:-1]
            Y_train = data_train.values[:,-1]
            adasyn = ADASYN(n_jobs=-1,random_state = 42,sampling_strategy=1.0)
            new_Xtrain, new_Ytrain = adasyn.fit_resample(X_train, Y_train)
        elif osmethod == 'SMOTE':
            X_train = data_train.values[:,:-1]
            Y_train = data_train.values[:,-1]
            smote = SMOTE(n_jobs=-1,random_state = 42,sampling_strategy=1.0)
            new_Xtrain, new_Ytrain = smote.fit_resample(X_train, Y_train)
        else:  # 'CTGAN'
            label_dict = Counter(data_train['label'])
            # Number of synthetic positives needed to balance the two classes.
            num_rows = label_dict[0]-label_dict[1]
            # Explicit copy: the model receives its own frame rather than a
            # slice of data_train (avoids pandas' SettingWithCopyWarning when
            # the library assigns columns on its input).
            learn_df = data_train[data_train['label']==1].copy()
            ctgan = CTGAN()
            ctgan.fit(learn_df)
            condition = Condition({ 'label': 1}, num_rows)
            df = ctgan.sample_conditions(conditions=[condition])
            new_data_train = pd.concat([data_train,df],axis=0)
            new_Xtrain, new_Ytrain = new_data_train.values[:,:-1], new_data_train.values[:,-1]

        # Shuffle the oversampled training set with a fixed seed.
        # NOTE: this re-seeds the global `random` generator on every fold.
        index = np.arange(new_Xtrain.shape[0])
        random.seed(42)
        random.shuffle(index)
        new_Xtrain = new_Xtrain[index]
        new_Ytrain = new_Ytrain[index]

        X_valid = data_valid.values[:,:-1]
        Y_valid = data_valid.values[:,-1]

        if Normlization:
            # Fit the scaler on training data only and reuse it for validation;
            # fitting a second scaler on the validation fold would put the two
            # sets on different scales.
            # NOTE(review): this standardises every feature column, although the
            # notebook lists only 9 continuous variables for standardisation -- confirm.
            scaler = StandardScaler().fit(new_Xtrain)
            new_Xtrain = scaler.transform(new_Xtrain)
            X_valid = scaler.transform(X_valid)

        evaluate_dict = evaluate(clf,new_Xtrain,new_Ytrain,X_valid,Y_valid)
        metrics.append(list(evaluate_dict.values()))
    return metrics

In [None]:
# Grid search over XGBoost hyper-parameters, scored by mean cross-validated AUC.
set_seed(seed=42)
best_score = 0
# Defined up front so a later look-up of `best_parameters` cannot raise a
# NameError when no candidate beats the initial score (e.g. the mean AUC is NaN).
best_parameters = None
for n_estimators in [200]:
    for max_depth in [7]:
        for learning_rate in [0.04]:
            # random_state matches the classifier used in the final test cell,
            # so tuning and final evaluation use the same model configuration.
            clf=XGBClassifier(n_estimators=n_estimators,
                              max_depth = max_depth,
                              learning_rate = learning_rate,
                              random_state = 42
                              )
            metrics = train_valid(data_rest, clf, osmethod ='CTGAN', Normlization=True)
            # Column order follows the metric order produced by evaluate().
            data_metrics = pd.DataFrame(metrics,columns=['AUC','Recall','F1','GM'])
            score = data_metrics.AUC.mean()
            if score > best_score:
                best_score = score
                best_parameters={'n_estimators':n_estimators,
                                 'max_depth':max_depth,
                                 'learning_rate':learning_rate}

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 



  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

In [None]:
best_parameters

# 测试

In [20]:
from imblearn.over_sampling import ADASYN,SMOTE
from sdv.tabular import CTGAN
from sdv.sampling import Condition
from sklearn.preprocessing import StandardScaler
from xgboost.sklearn import XGBClassifier

In [21]:
import torch
import torch.nn as nn
import os
import random
import numpy as np

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    Parameters
    ----------
    seed : int or int-convertible, default 42
        Seed value; it is passed through ``int()`` before use.

    Notes
    -----
    * cuDNN is switched to deterministic mode with benchmarking disabled.
    * ``PYTHONHASHSEED`` is also exported. CPython reads that variable when
      the interpreter starts, so setting it here does not change string
      hashing in the running kernel; only processes started afterwards see
      the value.
    """
    seed = int(seed)
    random.seed(seed)
    # Seeding NumPy once is enough (the previous version called this twice).
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

    # PyTorch: CPU generator, then the current and all CUDA devices.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

    # cuDNN stays enabled but must pick deterministic algorithms without autotuning.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.enabled = True

In [22]:
def test(data_rest,data_test,clf,osmethod='CTGAN',Normlization=True):
    """Train ``clf`` on oversampled ``data_rest`` and score it on ``data_test``.

    Parameters
    ----------
    data_rest : pandas.DataFrame
        Training data with a binary ``label`` column as its LAST column.
    data_test : pandas.DataFrame
        Held-out data with the same column layout; it is never oversampled.
    clf : estimator
        Classifier handed to ``evaluate``; it is fitted in place.
    osmethod : {'ADASYN', 'SMOTE', 'CTGAN'}, default 'CTGAN'
        Oversampling method applied to ``data_rest``.
    Normlization : bool, default True
        If true, standardise features with a ``StandardScaler`` fitted on the
        oversampled training data and apply the same scaler to the test data.

    Returns
    -------
    dict
        The metric dict returned by ``evaluate``.

    Raises
    ------
    ValueError
        If ``osmethod`` is not one of the supported names.
    """
    if osmethod not in ('ADASYN', 'SMOTE', 'CTGAN'):
        raise ValueError("osmethod must be 'ADASYN', 'SMOTE' or 'CTGAN', got %r" % (osmethod,))

    if osmethod == 'ADASYN':
        X_train = data_rest.values[:,:-1]
        Y_train = data_rest.values[:,-1]
        adasyn = ADASYN(n_jobs=-1,random_state = 42,sampling_strategy=1.0)
        new_Xtrain, new_Ytrain = adasyn.fit_resample(X_train, Y_train)
    elif osmethod == 'SMOTE':
        X_train = data_rest.values[:,:-1]
        Y_train = data_rest.values[:,-1]
        smote = SMOTE(n_jobs=-1,random_state = 42,sampling_strategy=1.0)
        new_Xtrain, new_Ytrain = smote.fit_resample(X_train, Y_train)
    else:  # 'CTGAN'
        label_dict = Counter(data_rest['label'])
        # Number of synthetic positives needed to balance the two classes.
        num_rows = label_dict[0]-label_dict[1]
        # Explicit copy: the model receives its own frame rather than a slice
        # of data_rest (avoids pandas' SettingWithCopyWarning when the library
        # assigns columns on its input).
        learn_df = data_rest[data_rest['label']==1].copy()
        ctgan = CTGAN()
        ctgan.fit(learn_df)
        condition = Condition({ 'label': 1}, num_rows)
        df = ctgan.sample_conditions(conditions=[condition])
        new_data_rest = pd.concat([data_rest,df],axis=0)
        new_Xtrain, new_Ytrain = new_data_rest.values[:,:-1], new_data_rest.values[:,-1]

    # Shuffle the oversampled training set with a fixed seed.
    # NOTE: this re-seeds the global `random` generator.
    index = np.arange(new_Xtrain.shape[0])
    random.seed(42)
    random.shuffle(index)
    new_Xtrain = new_Xtrain[index]
    new_Ytrain = new_Ytrain[index]

    X_test = data_test.values[:,:-1]
    Y_test = data_test.values[:,-1]

    if Normlization:
        # Fit the scaler on training data only and reuse it for the test set;
        # fitting a second scaler on the test set would put the two sets on
        # different scales and use test-set statistics.
        scaler = StandardScaler().fit(new_Xtrain)
        new_Xtrain = scaler.transform(new_Xtrain)
        X_test = scaler.transform(X_test)

    evaluate_dict = evaluate(clf,new_Xtrain,new_Ytrain,X_test,Y_test)
    return evaluate_dict

In [23]:
# Final evaluation: train on all of data_rest (CTGAN-oversampled) with the
# tuned hyper-parameters and score on the held-out test set.
set_seed(seed=42)
clf = XGBClassifier(n_estimators=200,max_depth = 7, learning_rate=0.04, random_state=42)

# NOTE(review): Normlization=False here, whereas the grid-search cell calls
# train_valid with Normlization=True -- confirm which setting is intended.
evaluate_dict = test(data_rest,data_test,clf,osmethod='CTGAN',Normlization=False)

  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(n_clusters=self.n_components, n_init=1,
  label = cluster.KMeans(

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[column_name] = data[column_name].to_numpy().flatten()
A value is trying to be set 

Sampling conditions: 100%|████████████████████████████████████████████| 405/405 [00:00<00:00, 1593.58it/s]




In [24]:
evaluate_dict

{'AUC': 0.8786658999424958,
 'Recall': 0.7446808510638298,
 'F1': 0.7368421052631579,
 'GM': 0.8241781938556382}