In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, auc

import pandas as pd
from einops import rearrange, reduce, repeat
from einops.layers.torch import Rearrange
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np

import torch

# Load the raw dataset; later cells read its 'status', 'google_index' and 'page_rank' columns.
df = pd.read_csv('dataset_B_05_2020.csv')

In [None]:
BITS = 4
def decimal_to_bits(x, bits = BITS):
    """Encode non-negative integers as MSB-first bit features in {-1, +1}.

    Args:
        x: integer tensor of shape (batch, columns). Floating-point tensors
            are rejected by the bitwise AND below.
        bits: number of bits emitted per value. Bits above this width are
            dropped, so values >= 2 ** bits cannot be represented.

    Returns:
        Float tensor of shape (batch, columns * bits); a set bit becomes
        +1.0 and a cleared bit becomes -1.0.

    Raises:
        ValueError: if x is not 2-dimensional.
    """
    if x.dim() != 2:
        raise ValueError(f"expected a 2-D (batch, columns) tensor, got shape {tuple(x.shape)}")

    # Powers of two, most significant first, created on x's device so the
    # bitwise AND also works for tensors that do not live on the CPU.
    mask = 2 ** torch.arange(bits - 1, -1, -1, device = x.device)

    # (batch, columns, 1) & (bits,) broadcasts to (batch, columns, bits).
    is_set = (x.unsqueeze(-1) & mask) != 0

    # Flatten the per-column bit groups, then map {0, 1} -> {-1, +1}.
    return is_set.float().flatten(start_dim = 1) * 2 - 1

def bits_to_decimal(x, bits = BITS, max_value = 10):
    """Decode MSB-first bit features back into integers.

    Args:
        x: tensor of shape (batch, columns * bits). Entries > 0 are read as a
            set bit, everything else as a cleared bit.
        bits: number of bits that make up one decoded value.
        max_value: upper bound applied to the decoded values. The default of
            10 keeps the historical behaviour; pass None to disable the
            upper clamp, or another bound to match the range of the encoded
            column.

    Returns:
        Integer tensor of shape (batch, columns).

    Raises:
        ValueError: if x is not 2-dimensional or its width is not a multiple
            of bits.
    """
    if x.dim() != 2 or x.shape[1] % bits != 0:
        raise ValueError(
            f"expected a 2-D tensor whose width is a multiple of {bits}, got shape {tuple(x.shape)}"
        )

    hard_bits = (x > 0).int()

    # Place values, most significant first, on the same device as the input.
    weights = 2 ** torch.arange(bits - 1, -1, -1, dtype = torch.int32, device = x.device)

    # (batch, columns * bits) -> (batch, columns, bits), then weight and sum each group.
    grouped = hard_bits.reshape(x.shape[0], x.shape[1] // bits, bits)
    dec = (grouped * weights).sum(dim = -1)

    return dec.clamp(min = 0, max = max_value)


def q_x(x_0,t):
    """Draw a noised sample x_t from x_0 at diffusion step t.

    x_0 is first bit-encoded with decimal_to_bits. The encoding is scaled by
    alphas_bar_sqrt[t] and mixed with standard-normal noise scaled by
    one_minus_alphas_bar_sqrt[t]; both schedule tensors are read from module
    scope.
    """
    encoded = decimal_to_bits(x_0)
    eps = torch.randn_like(encoded)
    signal_scale = alphas_bar_sqrt[t]
    noise_scale = one_minus_alphas_bar_sqrt[t]
    noised = signal_scale * encoded + noise_scale * eps
    return noised

In [None]:
# Map the string labels to 0/1. The dtype guard makes this cell safe to re-run:
# mapping an already-numeric column a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['status']):
    df['status']=df['status'].map({'legitimate':0,'phishing':1})
# keep only the rows labelled 1 (phishing)
df_1 = df[df['status']==1]
# extract features by position, then keep the two columns the diffusion model is trained on
# NOTE(review): positions 21/86/87 are assumed to include 'google_index' and 'page_rank' — confirm against the CSV header
df_2 = df_1.iloc[:,[21,86,87]]
df_diffusion = df_2[['google_index','page_rank']]

In [None]:
experi_number = 5

if experi_number == 1:
    # control group: raw feature values
    dataset = torch.Tensor(df_diffusion.values)
elif experi_number == 2:
    # normalize: zero mean / unit variance per column
    scaler = StandardScaler()
    np_scaled = scaler.fit_transform(df_diffusion)
    dataset = torch.Tensor(np_scaled)
elif experi_number == 3:
    # binary-encode every column
    dataset = torch.Tensor(df_diffusion.values).int()
    dataset = decimal_to_bits(dataset)
elif experi_number in (4, 5):
    # Partial binary encoding. Experiments 4 and 5 share the same data; they
    # differ only in the sampler (see if_segment in the parameter cell).
    raw_values = torch.Tensor(df_diffusion.values).int()
    # First column ('google_index'): v -> 2*v - 1, i.e. 0/1 becomes -1/+1.
    # NOTE(review): assumes the column only holds 0 and 1 — confirm.
    first_col = (raw_values[:,0]*2-1).unsqueeze(1).float()
    # Second column ('page_rank'): expanded into its bit representation.
    second_col_bits = decimal_to_bits(raw_values[:,1].unsqueeze(1))
    dataset = torch.cat([first_col, second_col_bits], dim=1)
else:
    # Fail loudly instead of leaving `dataset` undefined or stale from an earlier run.
    raise ValueError(f"unknown experi_number {experi_number!r}; expected an integer from 1 to 5")

In [None]:
# training hyper-parameters
shape = dataset.shape          # samples are generated with the same shape as the training data
batch_size, num_epoch = 128, 3000
if_segment = True              # read by p_sample: when true, the t == 0 step returns its estimate without added noise

# beta schedule
num_steps = 1000
beta_min, beta_max = 1e-4, 0.02

In [None]:
# beta: num_steps evenly spaced points on [-6, 6], squashed by a sigmoid into (beta_min, beta_max)
betas = beta_min + (beta_max - beta_min) * torch.sigmoid(torch.linspace(-6,6,num_steps))

# quantities derived from beta: alpha, cumulative product of alpha, that product
# shifted by one step (starting at 1), and the square-root / log terms
alphas = 1 - betas
alphas_prod = alphas.cumprod(dim=0)
alphas_prod_p = torch.cat([torch.ones(1), alphas_prod[:-1]], dim=0)
alphas_bar_sqrt = alphas_prod.sqrt()
one_minus_alphas_bar_log = (1 - alphas_prod).log()
one_minus_alphas_bar_sqrt = (1 - alphas_prod).sqrt()

# every schedule tensor must have one entry per diffusion step
schedule_shapes = {
    tensor.shape
    for tensor in (
        alphas,
        alphas_prod,
        alphas_prod_p,
        alphas_bar_sqrt,
        one_minus_alphas_bar_log,
        one_minus_alphas_bar_sqrt,
    )
}
assert len(schedule_shapes) == 1
print("all the same shape",betas.shape)

In [None]:
import torch
import torch.nn as nn

class MLPDiffusion(nn.Module):
    """MLP denoiser conditioned on the diffusion step.

    Five Linear + ReLU stages, each of which adds a learned per-step embedding
    to the linear output before the activation, followed by a final Linear
    layer that projects back to ``inout_units`` features.

    Args:
        n_steps: number of diffusion steps (size of each embedding table).
        inout_units: width of the input and of the output.
        num_units: width of the hidden layers and of the step embeddings.
    """

    def __init__(self,n_steps,inout_units=5,num_units=128):
        super().__init__()

        n_stages = 5

        # Layout: [Linear, ReLU] * n_stages + [Linear]; even indices hold the
        # Linear layers, odd indices the activations.
        layers = []
        in_features = inout_units
        for _ in range(n_stages):
            layers.append(nn.Linear(in_features, num_units))
            layers.append(nn.ReLU())
            in_features = num_units
        layers.append(nn.Linear(num_units, inout_units))
        self.linears = nn.ModuleList(layers)

        # One embedding table per stage.
        self.step_embeddings = nn.ModuleList(
            [nn.Embedding(n_steps, num_units) for _ in range(n_stages)]
        )

    def forward(self,x,t):
        """Map input x and step indices t (one index per row of x) to an output shaped like x."""
        for stage, embed in enumerate(self.step_embeddings):
            linear = self.linears[2 * stage]
            activation = self.linears[2 * stage + 1]
            x = activation(linear(x) + embed(t))

        return self.linears[-1](x)

In [None]:
def diffusion_loss_fn(model,x_0,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,n_steps):
    """Noise-prediction loss at randomly drawn diffusion steps.

    Each row of x_0 is noised at its own step t as
    ``x_t = alphas_bar_sqrt[t] * x_0 + one_minus_alphas_bar_sqrt[t] * eps``
    and the model is asked to predict eps; the mean squared error between
    eps and the prediction is returned.
    """
    batch_size = x_0.shape[0]

    # Antithetic step sampling: draw ceil(batch / 2) steps, pair each with its
    # mirror n_steps - 1 - t, and drop the surplus entry for odd batch sizes.
    n_draws = (batch_size + 1) // 2
    drawn = torch.randint(0, n_steps, size=(n_draws,))
    t = torch.cat([drawn, n_steps - 1 - drawn], dim=0)[:batch_size].unsqueeze(-1)

    # Per-row coefficients of x_0 and of the noise, shape (batch, 1).
    signal_scale = alphas_bar_sqrt[t]
    noise_scale = one_minus_alphas_bar_sqrt[t]

    noise = torch.randn_like(x_0)
    x_t = x_0 * signal_scale + noise * noise_scale

    predicted_noise = model(x_t, t.squeeze(-1))

    return (noise - predicted_noise).square().mean()

In [None]:
def p_sample_loop(model,shape,n_steps,betas,one_minus_alphas_bar_sqrt):
    """Run the reverse chain from pure noise: x[T] -> x[T-1] -> ... -> x[0].

    Args:
        model: network passed through to p_sample.
        shape: shape of the tensor to generate.
        n_steps: number of reverse steps; step indices run n_steps-1 .. 0.
        betas: beta schedule, forwarded to p_sample.
        one_minus_alphas_bar_sqrt: schedule tensor, forwarded to p_sample.

    Returns:
        List of n_steps + 1 tensors: the initial noise followed by the output
        of every reverse step, so the last entry is the final sample.
    """
    cur_x = torch.randn(shape)
    x_seq = [cur_x]

    # Sampling needs no gradients. Without no_grad every stored state would
    # keep the autograd graph of all preceding model calls alive.
    with torch.no_grad():
        for i in reversed(range(n_steps)):
            cur_x = p_sample(model,cur_x,i,betas,one_minus_alphas_bar_sqrt)
            x_seq.append(cur_x)

    print('Exp 1 to 5')
    return x_seq

def p_sample(model,x,t,betas,one_minus_alphas_bar_sqrta):
    """One reverse-diffusion step: sample x[t-1] given x[t].

    Args:
        model: callable ``model(x, t)`` returning the predicted noise.
        x: current state x[t].
        t: integer step index.
        betas: beta schedule, indexed by step.
        one_minus_alphas_bar_sqrta: sqrt(1 - alpha_bar) per step. The
            parameter name is kept as-is for compatibility with callers.

    Returns:
        The sampled previous state, shaped like x.

    Note:
        Reads the module-level flag ``if_segment``: when it is true, the
        t == 0 step returns its estimate without adding noise.
    """
    t = torch.tensor([t])
    beta_t = betas[t]

    # Use the schedule handed in by the caller rather than a module-level
    # tensor with a near-identical name.
    coeff = beta_t / one_minus_alphas_bar_sqrta[t]

    eps_theta = model(x,t)

    # Segment inverse diffusion: the final step is returned noise-free.
    if t==0 and if_segment:
        return (x - beta_t.sqrt()*eps_theta)/(1-beta_t).sqrt()

    mean = (1/(1-beta_t).sqrt())*(x-(coeff*eps_theta))

    z = torch.randn_like(x)
    sigma_t = beta_t.sqrt()

    return mean + sigma_t * z


# 1 Control Group

In [None]:
seed = 1234
# Apply the seed so DataLoader shuffling, weight initialisation, noise draws
# and sampling are repeatable from run to run.
torch.manual_seed(seed)

dataloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

plt.rc('text',color='blue')

model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

# p_sample_loop returns num_steps + 1 states; ten evenly spaced ones are plotted.
plot_stride = num_steps // 10

for epoch in range(num_epoch):
    for batch_x in dataloader:
        loss = diffusion_loss_fn(model,batch_x,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

    if epoch % 100 == 0:
        # loss of the last mini-batch of this epoch
        print(loss)
        x_seq = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)

        fig,axs = plt.subplots(1,10,figsize=(28,3))
        for i in range(1,11):
            cur_x = x_seq[i*plot_stride].detach()
            # scatter the last two feature columns of the intermediate state
            axs[i-1].scatter(cur_x[:,-2],cur_x[:,-1],color='red',edgecolor='white')
            axs[i-1].set_axis_off()
            # NOTE(review): the title index is i*10 while the plotted state is x_seq[i*plot_stride] — confirm the intended label
            axs[i-1].set_title(r'$q(\mathbf{x}_{'+str(i*10)+'})$')

In [None]:
# sample x0: the last entry of the reverse chain is the final sample
generate = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)
x_0 = generate[-1].detach()

# add labels for generative data: append a column of ones, sized from the
# sample itself so the cell keeps working when the dataset size changes
n_1 = np.ones((x_0.shape[0],1))
X_gen = np.concatenate([x_0,n_1],axis=1)

# 2 Normalize

In [None]:
seed = 1234
# Apply the seed so DataLoader shuffling, weight initialisation, noise draws
# and sampling are repeatable from run to run.
torch.manual_seed(seed)

print('Training model...')

dataloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

plt.rc('text',color='blue')

model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

# p_sample_loop returns num_steps + 1 states; ten evenly spaced ones are plotted.
plot_stride = num_steps // 10

for epoch in range(num_epoch):
    for batch_x in dataloader:
        loss = diffusion_loss_fn(model,batch_x,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

    if epoch % 100 == 0:
        # loss of the last mini-batch of this epoch
        print(loss)
        x_seq = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)

        fig,axs = plt.subplots(1,10,figsize=(28,3))
        for i in range(1,11):
            cur_x = x_seq[i*plot_stride].detach()
            # scatter the last two feature columns of the intermediate state
            axs[i-1].scatter(cur_x[:,-2],cur_x[:,-1],color='red',edgecolor='white')
            axs[i-1].set_axis_off()
            # NOTE(review): the title index is i*10 while the plotted state is x_seq[i*plot_stride] — confirm the intended label
            axs[i-1].set_title(r'$q(\mathbf{x}_{'+str(i*10)+'})$')

# 3 BE all

In [None]:
seed = 1234
# Apply the seed so DataLoader shuffling, weight initialisation, noise draws
# and sampling are repeatable from run to run.
torch.manual_seed(seed)

print('Training model...')

dataloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

plt.rc('text',color='blue')

model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

# p_sample_loop returns num_steps + 1 states; ten evenly spaced ones are plotted.
plot_stride = num_steps // 10

for epoch in range(num_epoch):
    for batch_x in dataloader:
        loss = diffusion_loss_fn(model,batch_x,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

    if epoch % 100 == 0:
        # loss of the last mini-batch of this epoch
        print(loss)
        x_seq = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)

        fig,axs = plt.subplots(1,10,figsize=(28,3))
        for i in range(1,11):
            cur_x = x_seq[i*plot_stride].detach()
            # scatter the last two feature columns of the intermediate state
            axs[i-1].scatter(cur_x[:,-2],cur_x[:,-1],color='red',edgecolor='white')
            axs[i-1].set_axis_off()
            # NOTE(review): the title index is i*10 while the plotted state is x_seq[i*plot_stride] — confirm the intended label
            axs[i-1].set_title(r'$q(\mathbf{x}_{'+str(i*10)+'})$')

# 4 BE partly

In [None]:
seed = 1234
# Apply the seed so DataLoader shuffling, weight initialisation, noise draws
# and sampling are repeatable from run to run.
torch.manual_seed(seed)

print('Training model...')

dataloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

plt.rc('text',color='blue')

model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

# p_sample_loop returns num_steps + 1 states; ten evenly spaced ones are plotted.
plot_stride = num_steps // 10

for epoch in range(num_epoch):
    for batch_x in dataloader:
        loss = diffusion_loss_fn(model,batch_x,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

    if epoch % 500 == 0:
        # loss of the last mini-batch of this epoch
        print(loss)
        x_seq = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)

        fig,axs = plt.subplots(1,10,figsize=(28,3))
        for i in range(1,11):
            cur_x = x_seq[i*plot_stride].detach()
            # scatter the last two feature columns of the intermediate state
            axs[i-1].scatter(cur_x[:,-2],cur_x[:,-1],color='red',edgecolor='white')
            axs[i-1].set_axis_off()
            # NOTE(review): the title index is i*10 while the plotted state is x_seq[i*plot_stride] — confirm the intended label
            axs[i-1].set_title(r'$q(\mathbf{x}_{'+str(i*10)+'})$')

# 5 BE partly + Segment

In [None]:
seed = 1234
# Apply the seed so DataLoader shuffling, weight initialisation, noise draws
# and sampling are repeatable from run to run.
torch.manual_seed(seed)

print('Training model...')

dataloader = torch.utils.data.DataLoader(dataset,batch_size=batch_size,shuffle=True)

plt.rc('text',color='blue')

model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

# p_sample_loop returns num_steps + 1 states; ten evenly spaced ones are plotted.
plot_stride = num_steps // 10

for epoch in range(num_epoch):
    for batch_x in dataloader:
        loss = diffusion_loss_fn(model,batch_x,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

    if epoch % 500 == 0:
        # loss of the last mini-batch of this epoch
        print(loss)
        x_seq = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)

        fig,axs = plt.subplots(1,10,figsize=(28,3))
        for i in range(1,11):
            cur_x = x_seq[i*plot_stride].detach()
            # scatter the last two feature columns of the intermediate state
            axs[i-1].scatter(cur_x[:,-2],cur_x[:,-1],color='red',edgecolor='white')
            axs[i-1].set_axis_off()
            # NOTE(review): the title index is i*10 while the plotted state is x_seq[i*plot_stride] — confirm the intended label
            axs[i-1].set_title(r'$q(\mathbf{x}_{'+str(i*10)+'})$')

# 6 Limited data training

In [None]:
# The training subset grows by this many rows per round (20, 40, ..., 400).
num_samples = 20

for j in range(1,21):

    # randomly select the rows for this round
    indices = torch.randperm(dataset.size(0))[:num_samples*j]
    sampled_dataset = dataset[indices]

    model = MLPDiffusion(n_steps=num_steps,inout_units=shape[1])  # input/output width = number of dataset columns
    optimizer = torch.optim.Adam(model.parameters(),lr=1e-3)

    for t in range(num_epoch):
        # full-batch update: the whole subset is used in every step
        loss = diffusion_loss_fn(model,sampled_dataset,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,num_steps)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(),1.)
        optimizer.step()

        if(t%500==0):
            print(loss)

    print(j)

    # the last entry of the reverse chain is the final sample
    generate = p_sample_loop(model,shape,num_steps,betas,one_minus_alphas_bar_sqrt)
    x_0 = generate[-1].detach()

    # Decode back to integers: the sign of column 0 gives a 0/1 value, the
    # remaining columns are bit groups.
    # NOTE(review): this assumes the partial binary encoding (experiments 4/5) — confirm before running with another experiment.
    decimal_x = np.concatenate([(x_0[:,[0]]>0).int(),bits_to_decimal(x_0[:,1:])],axis=1)

    # label column of ones, sized from the decoded sample instead of a fixed row count
    n_1 = np.ones((decimal_x.shape[0],1))
    # NOTE(review): X_gen is overwritten every round, so only the j == 20 result survives the loop — confirm this is intended.
    X_gen = np.concatenate([decimal_x,n_1],axis=1)

# 7 predict x0 instead of e

In [None]:
def diffusion_loss_fn(model,x_0,alphas_bar_sqrt,one_minus_alphas_bar_sqrt,n_steps):
    """x0-prediction loss at randomly drawn diffusion steps (experiment 7).

    Each row of x_0 is noised at its own step t as
    ``x_t = alphas_bar_sqrt[t] * x_0 + one_minus_alphas_bar_sqrt[t] * eps``
    and the model is asked to reconstruct x_0 from x_t; the mean squared
    error between x_0 and the model output is returned.

    Nothing is printed here: this function runs once per optimisation step,
    so a per-call message would flood the notebook output.
    """
    batch_size = x_0.shape[0]

    # Antithetic step sampling: draw ceil(batch / 2) steps, pair each with its
    # mirror n_steps - 1 - t, and drop the surplus entry for odd batch sizes.
    n_draws = (batch_size + 1) // 2
    drawn = torch.randint(0, n_steps, size=(n_draws,))
    t = torch.cat([drawn, n_steps - 1 - drawn], dim=0)[:batch_size].unsqueeze(-1)

    # Per-row coefficients of x_0 and of the noise, shape (batch, 1).
    a = alphas_bar_sqrt[t]
    aml = one_minus_alphas_bar_sqrt[t]
    e = torch.randn_like(x_0)

    x = x_0*a+e*aml
    output = model(x,t.squeeze(-1))

    return (x_0 - output).square().mean()

In [None]:
def p_sample_loop(model,shape,n_steps,betas,one_minus_alphas_bar_sqrt):
    """Run the reverse chain from pure noise down to step 0 (experiment 7).

    Args:
        model: network passed through to p_sample.
        shape: shape of the tensor to generate.
        n_steps: number of reverse steps; step indices run n_steps-1 .. 0.
        betas: beta schedule, forwarded to p_sample.
        one_minus_alphas_bar_sqrt: schedule tensor, forwarded to p_sample.

    Returns:
        List of n_steps + 1 tensors: the initial noise followed by the output
        of every reverse step, so the last entry is the final sample.
    """
    cur_x = torch.randn(shape)
    x_seq = [cur_x]

    # Sampling needs no gradients. Without no_grad every stored state would
    # keep the autograd graph of all preceding model calls alive.
    with torch.no_grad():
        for i in reversed(range(n_steps)):
            cur_x = p_sample(model,cur_x,i,betas,one_minus_alphas_bar_sqrt)
            x_seq.append(cur_x)

    print('exp 7')
    return x_seq

def p_sample(model,x,t,betas,one_minus_alphas_bar_sqrta):
    """Deterministic (DDIM-style) reverse step for a model that predicts x_0.

    The model's estimate of x_0 is turned into the implied noise, and both are
    recombined with the signal / noise scales of the previous step:

        pred_noise = (x - sqrt(alpha_bar_t) * x_0) / sqrt(1 - alpha_bar_t)
        x[t-1]     = sqrt(alpha_bar_{t-1}) * x_0 + sqrt(1 - alpha_bar_{t-1}) * pred_noise

    At t == 0 there is no earlier step, so the x_0 estimate itself is returned.

    Args:
        model: callable ``model(x, t)`` returning the predicted x_0.
        x: current state x[t].
        t: integer step index.
        betas: beta schedule; sqrt(alpha_bar) is derived from it as
            sqrt(cumprod(1 - betas)).
        one_minus_alphas_bar_sqrta: sqrt(1 - alpha_bar) per step. The
            parameter name is kept as-is for compatibility with callers.

    Returns:
        The previous state x[t-1], shaped like x.

    NOTE(review): the earlier body referenced names that were never defined;
    the deterministic update above is what its surviving fragments
    correspond to — confirm this is the intended sampler.
    """
    step = int(t)
    t = torch.tensor([step])

    # sqrt(alpha_bar) for every step, rebuilt from the schedule argument so
    # the function does not depend on module-level tensors.
    alphas_bar_sqrt_all = torch.cumprod(1 - betas, 0).sqrt()

    x_start = model(x,t)

    if step == 0:
        return x_start

    alpha_t = alphas_bar_sqrt_all[t]
    sigma_t = one_minus_alphas_bar_sqrta[t]
    # clamp guards against division by zero when the noise scale vanishes
    pred_noise = (x - alpha_t * x_start) / sigma_t.clamp(min = 1e-8)

    # scales of the step we are moving to
    alpha_next = alphas_bar_sqrt_all[t - 1]
    sigma_next = one_minus_alphas_bar_sqrta[t - 1]

    return x_start * alpha_next + pred_noise * sigma_next


In [None]:
num_X_gen = 4000
num_X_pos = 0

# Seed NumPy so the shuffles below (and therefore every metric curve) are repeatable.
EVAL_SEED = 1234
np.random.seed(EVAL_SEED)

# Round i trains with 20*i real positive rows; the ROC figure is drawn for this round.
N_ROUNDS = 201
ROC_ROUND = 5

# experimental group: real positives topped up with generated positives
acc_lst = []
pre_lst = []
rec_lst = []
f1_lst = []

# control group: only the real positives available in that round
acc_duizhao_lst = []
pre_duizhao_lst = []
rec_duizhao_lst = []
f1_duizhao_lst = []

# cost-sensitive group; index 0 is a placeholder because round 0 is skipped
acc_cossen_lst =[0]
pre_cossen_lst = [0]
rec_cossen_lst = [0]
f1_cossen_lst = [0]

# random over-sampling group; index 0 is a placeholder because round 0 is skipped
acc_upsam_lst = [0]
pre_upsam_lst = [0]
rec_upsam_lst = [0]
f1_upsam_lst = [0]

# SMOTE group; index 0 is a placeholder because round 0 is skipped
acc_smote_lst = [0]
pre_smote_lst = [0]
rec_smote_lst = [0]
f1_smote_lst = [0]


# rows are [google_index, page_rank, status]; the label is the last column
X_neg = np.array(df[['google_index','page_rank','status']][df['status']==0])
X_pos = np.array(df[['google_index','page_rank','status']][df['status']==1])


def _split_xy(rows):
    """Split an array whose last column is the label into (features, labels)."""
    return rows[:,:-1], rows[:,-1]


def _fit_and_score(clf, X_train, y_train, X_test, y_test, metric_lists):
    """Fit clf, append accuracy/precision/recall/F1 on the test set to metric_lists, return clf."""
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    metric_fns = (accuracy_score, precision_score, recall_score, f1_score)
    for metric_fn, history in zip(metric_fns, metric_lists):
        history.append(metric_fn(y_test, y_pred))
    return clf


def _add_roc_curve(ax, clf, X_test, y_test, label_template):
    """Draw the ROC curve of a fitted clf on ax; label_template receives the AUC."""
    y_pred_prob = clf.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_prob)
    ax.plot(fpr, tpr, lw=2, label=label_template % auc(fpr, tpr))


def _new_classifier(**extra):
    """XGBoost classifier with the settings shared by every group."""
    return xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', **extra)


roc_ax = None

# --- experimental group ---------------------------------------------------
for i in range(N_ROUNDS):
    np.random.shuffle(X_neg)
    np.random.shuffle(X_pos)
    np.random.shuffle(X_gen)

    # 4000 positives in total: 20*i real ones, the remainder generated; plus 4000 negatives
    X_y_train = np.concatenate([X_gen[:num_X_gen-20*i,:],X_pos[:20*i,:],X_neg[:4000,:]],axis=0)
    np.random.shuffle(X_y_train)
    # held-out real rows, disjoint from the training slices of this round
    X_y_test = np.concatenate([X_pos[4000:5000,:],X_neg[4000:5000,:]],axis=0)
    np.random.shuffle(X_y_test)

    X_train, y_train = _split_xy(X_y_train)
    X_test, y_test = _split_xy(X_y_test)

    clf = _fit_and_score(_new_classifier(), X_train, y_train, X_test, y_test,
                         (acc_lst, pre_lst, rec_lst, f1_lst))

    if i == ROC_ROUND:
        # The figure stays open; the baseline curves are added in the next loop.
        roc_fig, roc_ax = plt.subplots()
        _add_roc_curve(roc_ax, clf, X_test, y_test, 'SRDM (area = %0.2f)')
        # chance diagonal, drawn once
        roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        roc_ax.set_xlim([0.0, 1.0])
        roc_ax.set_ylim([0.0, 1.05])
        roc_ax.set_xlabel('False Positive Rate')
        roc_ax.set_ylabel('True Positive Rate')
        roc_ax.set_title('Classifier ROC')

# --- control group and imbalance baselines --------------------------------
for i in range(N_ROUNDS):
    np.random.shuffle(X_neg)
    np.random.shuffle(X_pos)

    # only the 20*i real positives, no generated rows
    X_y_train = np.concatenate([X_pos[:20*i,:],X_neg[:4000,:]],axis=0)
    np.random.shuffle(X_y_train)
    X_y_test = np.concatenate([X_pos[4000:5000,:],X_neg[4000:5000,:]],axis=0)
    np.random.shuffle(X_y_test)

    X_train, y_train = _split_xy(X_y_train)
    X_test, y_test = _split_xy(X_y_test)

    # control: plain classifier on the imbalanced data
    # NOTE(review): in round 0 the training set holds negatives only — confirm the classifier handles a single class.
    clf = _fit_and_score(_new_classifier(), X_train, y_train, X_test, y_test,
                         (acc_duizhao_lst, pre_duizhao_lst, rec_duizhao_lst, f1_duizhao_lst))
    if i == ROC_ROUND:
        _add_roc_curve(roc_ax, clf, X_test, y_test, 'control group(area = %0.2f)')

    if i == 0:
        # No positives to re-weight or resample; the lists already hold a placeholder for this round.
        continue

    # cost-sensitive: re-weight the positive class
    # NOTE(review): the training set holds 4000 negatives and 20*i positives, yet the weight uses 40*i — confirm the intended ratio.
    scale_pos_weight = 4000 / (40*i)
    clf = _fit_and_score(_new_classifier(scale_pos_weight=scale_pos_weight),
                         X_train, y_train, X_test, y_test,
                         (acc_cossen_lst, pre_cossen_lst, rec_cossen_lst, f1_cossen_lst))
    if i == ROC_ROUND:
        _add_roc_curve(roc_ax, clf, X_test, y_test, 'cost sensitive(area = %0.2f)')

    # random over-sampling of the minority class
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
    clf = _fit_and_score(_new_classifier(), X_resampled, y_resampled, X_test, y_test,
                         (acc_upsam_lst, pre_upsam_lst, rec_upsam_lst, f1_upsam_lst))
    if i == ROC_ROUND:
        _add_roc_curve(roc_ax, clf, X_test, y_test, 'upper sampling (area = %0.2f)')

    # SMOTE
    sm = SMOTE(random_state=42)
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
    clf = _fit_and_score(_new_classifier(), X_resampled, y_resampled, X_test, y_test,
                         (acc_smote_lst, pre_smote_lst, rec_smote_lst, f1_smote_lst))
    if i == ROC_ROUND:
        _add_roc_curve(roc_ax, clf, X_test, y_test, 'smote (area = %0.2f)')
        # all five curves are on the axes now
        roc_ax.legend(loc="lower right")
        plt.show()