In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer #dictionary vectorization
from imblearn.over_sampling import SMOTE #upsampleing SMOTE
from sklearn.model_selection import StratifiedKFold, cross_val_score #cross validation
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, BaggingClassifier

## basic summary of data

In [2]:
def set_summary(df):
    '''
    Print a three-part overview of a dataframe.

    The sections are, in order: the last two rows, the dtype of every
    column, and the transposed ``describe()`` statistics rounded to two
    decimals. Each section is preceded by a 60-character dash banner.

    :param df: dataframe to summarise
    :return: None (output goes to stdout)
    '''
    # Each section is built lazily so that a banner is always printed
    # before its content is computed.
    sections = (
        ('Data overview:', lambda: df.tail(2)),
        ('Data dtypes:', lambda: df.dtypes),
        ('Data DESC:', lambda: df.describe().round(2).T),
    )
    for title, build_content in sections:
        print('{:-^60}'.format(title))
        print(build_content())

In [3]:
def na_summary(df):
    '''
    Report where a dataframe has missing values.

    Prints a boolean flag per column (True when the column contains at
    least one null) and the number of rows that contain at least one null.

    :param df: dataframe
    :return: None (output goes to stdout)
    '''
    null_mask = df.isnull()  # computed once, reduced along both axes below
    print('{:-^60}'.format('NA cols:'))
    print(null_mask.any(axis=0))
    rows_with_na = null_mask.any(axis=1).sum()
    print('Total number of NA lines is: {0}'.format(rows_with_na))
    
def label_samples_summary(df):
    '''
    Print how many rows carry each distinct value of the last column.

    The last column of ``df`` is treated as the label column.

    :param df: dataframe
    :return: None (output goes to stdout)
    '''
    banner = format('Labels samples count:', '-^60')
    print(banner)
    label_column = df.iloc[:, -1]
    print(label_column.value_counts())

### Function to encode string categories as integer codes (row dictionaries fed to DictVectorizer)

In [4]:
# Convert string-valued categorical columns to integer category codes so the
# data can be used in matrix computations.
def _encode_column(values, uniques, col_name):
    '''
    Replace every value by the position of its first occurrence in ``uniques``.

    :param values: iterable of raw (hashable) category values
    :param uniques: list of known category values for this column
    :param col_name: column name, used only in the error message
    :return: list of integer codes, one per input value
    :raises ValueError: if a value is not present in ``uniques``
    '''
    lookup = {}
    for position, value in enumerate(uniques):
        # First occurrence wins, which matches list.index() semantics.
        lookup.setdefault(value, position)
    try:
        return [lookup[value] for value in values]
    except KeyError as exc:
        raise ValueError(
            'column {!r} contains value {!r} that is not in the known '
            'unique values'.format(col_name, exc.args[0])
        ) from exc


def str2int(set, convert_object, unique_object, training=True):
    '''
    Convert the string categories of the categorical columns into integer
    codes (the index of each value in that column's unique-value list) and
    pass the resulting per-row dictionaries through a DictVectorizer.

    The input dataframe is not modified.

    :param set: dataset (dataframe) containing all columns in ``convert_cols``.
        The name shadows the builtin ``set``; it is kept for compatibility.
    :param convert_object: fitted DictVectorizer from the training stage;
        ignored (pass None) when ``training`` is true
    :param unique_object: list of per-column unique-value lists from the
        training stage, in ``convert_cols`` order; ignored (pass None) when
        ``training`` is true
    :param training: whether this is the training stage
    :return: training stage returns (model_dvtransform, unique_list,
        train_part_data); prediction stage returns predict_part_data
    :raises ValueError: if a value is missing from its column's unique-value
        list, or if ``unique_object`` does not have one list per column
    '''
    convert_cols = ['cat','attribution','pro_id','pro_brand','order_source','pay_type','use_id','city']

    if training:
        # One list of distinct values per column, in order of first appearance.
        unique_list = [set[col_name].unique().tolist() for col_name in convert_cols]
    else:
        unique_list = unique_object
        if len(unique_list) != len(convert_cols):
            raise ValueError(
                'unique_object must hold {0} lists (one per column), got {1}'.format(
                    len(convert_cols), len(unique_list)))

    # Encode column by column with a dict lookup instead of a per-cell
    # list.index() scan over rows pulled out with .iloc.
    encoded_cols = [
        _encode_column(set[col_name], uniques, col_name)
        for col_name, uniques in zip(convert_cols, unique_list)
    ]
    # One {column name: integer code} dictionary per row.
    dict_list = [dict(zip(convert_cols, row)) for row in zip(*encoded_cols)]

    if training:
        model_dvtransform = DictVectorizer(sparse=False, dtype=np.int64)
        model_dvtransform.fit(dict_list)
        # The result has one row per input row; the string values are
        # replaced by their index in the matching unique-value list.
        train_part_data = model_dvtransform.transform(dict_list)
        return model_dvtransform, unique_list, train_part_data

    predict_part_data = convert_object.transform(dict_list)
    return predict_part_data

### Split the order date and time into separate numeric features

In [5]:
# time attribute expanding
def datetime2int(set):
    '''
    Expand the order date and order time strings into numeric attributes.

    :param set: dataset with an 'order_date' column formatted as
        '%Y-%m-%d' and an 'order_time' column formatted as '%H:%M:%S'
    :return: array with one row per record and six columns, in order:
        weekday (Monday == 0), day of month, month, second, minute, hour
    :raises ValueError: if a date or time string does not match its format
    '''
    # pd.datetime was only an alias of the stdlib class and was removed in
    # pandas 2.0, so the stdlib class is used directly.
    from datetime import datetime

    parsed_dates = [datetime.strptime(value, '%Y-%m-%d') for value in set['order_date']]
    parsed_times = [datetime.strptime(value, '%H:%M:%S') for value in set['order_time']]

    # Built feature-by-feature, then transposed so each record is one row.
    feature_columns = [
        [moment.weekday() for moment in parsed_dates],
        [moment.day for moment in parsed_dates],
        [moment.month for moment in parsed_dates],
        [moment.second for moment in parsed_times],
        [moment.minute for moment in parsed_times],
        [moment.hour for moment in parsed_times],
    ]
    final_matrix = np.array(feature_columns).T
    return final_matrix

### Upsample the minority class (abnormal orders) with SMOTE

In [6]:
def sample_balance(X, y, random_state=0):
    '''
    Balance the classes by oversampling with SMOTE.

    :param X: input feature matrix
    :param y: labels
    :param random_state: seed passed to SMOTE so the resampled set is the
        same on every run; pass None for unseeded resampling
    :return: balanced X, y
    '''
    # Seeded by default so the resampled data, like the classifiers in this
    # notebook (random_state=0), is reproducible between runs.
    model_smote = SMOTE(random_state=random_state)
    x_smote_resampled, y_smote_resampled = model_smote.fit_resample(X, y)
    return x_smote_resampled, y_smote_resampled

In [7]:
# Read the identifier-like columns as generic Python objects (strings) so
# pandas does not infer a numeric dtype for them. The builtin `object` is
# used because the `np.object` alias was removed in NumPy 1.24.
dtypes = {'order_id': object,
          'pro_id': object,
          'use_id': object}
# The file is comma-separated, so read_csv's default separator applies.
raw_data = pd.read_csv('data/abnormal_orders.txt', dtype=dtypes)

In [8]:
# Print the last rows, column dtypes and describe() statistics of the raw table.
set_summary(raw_data)

-----------------------Data overview:-----------------------
          order_id  order_date order_time       cat attribution      pro_id  \
134188  4285770012  2013-09-19   23:55:06      家居日用          GO  1000335947   
134189  4285770056  2013-05-20   23:58:59  生活电器厨卫电器          GO  1000009280   

       pro_brand  total_money  total_quantity order_source pay_type  \
134188       炊大师         79.0               1           抢购     合并支付   
134189        海尔        799.0               1           抢购     合并支付   

            use_id city  abnormal_label  
134188      shukun  东莞市               0  
134189  544975322_  海口市               0  
------------------------Data dtypes:------------------------
order_id           object
order_date         object
order_time         object
cat                object
attribution        object
pro_id             object
pro_brand          object
total_money       float64
total_quantity      int64
order_source       object
pay_type           object
use_id        

In [9]:
# Report which columns contain missing values and how many rows have at least one.
na_summary(raw_data)

--------------------------NA cols:--------------------------
order_id          False
order_date        False
order_time        False
cat                True
attribution       False
pro_id            False
pro_brand          True
total_money        True
total_quantity    False
order_source      False
pay_type          False
use_id            False
city               True
abnormal_label    False
dtype: bool
Total number of NA lines is: 1429


In [10]:
# Count the rows per value of the last column (abnormal_label) to see the class balance.
label_samples_summary(raw_data)

-------------------Labels samples count:--------------------
0    105733
1     28457
Name: abnormal_label, dtype: int64


## data preprocessing

In [10]:
drop_na_set = raw_data.dropna() # drop every row that has at least one missing value

In [11]:
# Build the training matrix from the NA-free rows. Column blocks are stacked in
# this order: encoded categoricals, date/time parts, then the two numeric columns.
X_raw = drop_na_set.iloc[:, 1:-1] # drop the first column (order_id) and the last column (label)
y_raw = drop_na_set.iloc[:, -1]   # the last column (abnormal_label) is the label
model_dvtransform, unique_object, str2int_data = str2int(X_raw, None, None, training=True) # fit the encoder and encode the string columns
datetime2int_data = datetime2int(X_raw) # expand order_date / order_time into numeric parts
combine_set = np.hstack((str2int_data, datetime2int_data))
constant_set = X_raw[['total_money','total_quantity']] # numeric columns used as they are
X_combine = np.hstack((combine_set, constant_set))
# NOTE(review): the whole dataset is oversampled here, before any split. The
# cross-validation cells score on X, y, so synthetic minority samples also land
# in the validation folds and the scores are likely optimistic - consider
# resampling inside each training fold instead (TODO confirm).
X, y = sample_balance(X_combine, y_raw)

### Check that both labels now have the same number of samples

In [12]:
# Count the samples per label in the resampled y to check that the classes are balanced.
from collections import Counter
labelCnt = Counter(y)
print(labelCnt)

Counter({1: 104477, 0: 104477})


In [16]:
# Random forest: 8-fold stratified cross-validation on the SMOTE-resampled X, y.
model_rf = RandomForestClassifier(n_estimators=20, random_state=0)
cv = StratifiedKFold(8)
# NOTE(review): X, y were resampled before splitting, so validation folds
# contain synthetic samples; treat these scores as optimistic (TODO confirm).
cv_score = cross_val_score(model_rf, X, y, cv=cv)
print('{:-^60}'.format('Cross val score:'))
print(cv_score)
print('Mean scores is: %.2f' % cv_score.mean())

----------------------Cross val score:----------------------
[0.71803216 0.89218989 0.96550536 0.97695253 0.93514548 0.91856191
 0.92128034 0.92522398]
Mean scores is: 0.91


In [17]:
# Logistic regression: 8-fold stratified cross-validation on the SMOTE-resampled X, y.
# NOTE(review): the features are passed unscaled and the categorical columns are
# plain integer codes - presumably a poor fit for a linear model; verify.
model_lr = LogisticRegression(random_state=0)
cv = StratifiedKFold(8)
cv_score = cross_val_score(model_lr, X, y, cv=cv)
print('{:-^60}'.format('Cross val score:'))
print(cv_score)
print('Mean scores is: %.2f' % cv_score.mean())



----------------------Cross val score:----------------------
[0.68158499 0.87672282 0.91347626 0.88223583 0.81619449 0.83620492
 0.82743702 0.84416877]
Mean scores is: 0.83


In [18]:
# Bagging classifier (default base estimator): 8-fold stratified
# cross-validation on the SMOTE-resampled X, y.
model_BagC = BaggingClassifier(n_estimators=20, random_state=0)
cv = StratifiedKFold(8)
cv_score = cross_val_score(model_BagC, X, y, cv=cv)
print('{:-^60}'.format('Cross val score:'))
print(cv_score)
print('Mean scores is: %.2f' % cv_score.mean())

----------------------Cross val score:----------------------
[0.68139357 0.83212098 0.9611026  0.97745023 0.93491577 0.91990198
 0.92503254 0.92660234]
Mean scores is: 0.89


In [19]:
# Soft-voting ensemble of random forest, logistic regression and bagging.
# The three base models are re-created here so the ensemble gets unfitted estimators.
model_rf = RandomForestClassifier(n_estimators=20, random_state=0)
model_lr = LogisticRegression(random_state=0)
model_BagC = BaggingClassifier(n_estimators=20, random_state=0)
estimators = [('randomforest', model_rf),('Logistic', model_lr),('bagging', model_BagC)]
# voting='soft' combines the predicted class probabilities, weighted per estimator
# in the same order as `estimators`.
model_vot = VotingClassifier(estimators=estimators, voting='soft', weights=[0.9, 1.2, 1.1], n_jobs=-1)
cv = StratifiedKFold(8)
cv_score = cross_val_score(model_vot, X, y, cv=cv)
print('{:-^60}'.format('Cross val score:'))
print(cv_score)
print('Mean scores is: %.2f' % cv_score.mean())
# Final fit on the full resampled set; this fitted model is used for prediction below.
model_vot.fit(X, y)

----------------------Cross val score:----------------------
[0.77411945 0.92557427 0.97293262 0.97117152 0.9257657  0.90906654
 0.90929627 0.91526916]
Mean scores is: 0.91


VotingClassifier(estimators=[('randomforest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
        ...estimators=20, n_jobs=None, oob_score=False, random_state=0,
         verbose=0, warm_start=False))],
         flatten_transform=None, n_jobs=-1, voting='soft',
         weights=[0.9, 1.2, 1.1])

In [20]:
# Score new orders with the fitted voting ensemble, using the encoder and
# unique-value lists produced by the training cell.
X_raw_data = pd.read_csv('data/new_abnormal_orders.csv', dtype=dtypes)
X_raw_new = X_raw_data.iloc[:, 1:] # drop the first column
# X_raw_data (with its first column) and X_raw_new are used interchangeably below;
# both work because the helpers select their columns by name.
# NOTE(review): no dropna() is applied to the new data, unlike the training data - confirm.
str2int_data_new = str2int(X_raw_data, model_dvtransform, unique_object, training=False)
datetime2int_data_new = datetime2int(X_raw_new)
combine_set_new = np.hstack((str2int_data_new, datetime2int_data_new))
constant_set_new = X_raw_data[['total_money','total_quantity']]
X_combine_new = np.hstack((combine_set_new, constant_set_new)) # same column order as the training matrix
y_predict = model_vot.predict(X_combine_new)
print('{:-^60}'.format('Predicted Labels:'))
print(y_predict)

---------------------Predicted Labels:----------------------
[1 0 0 0 0 0 0]


In [13]:
# One-class SVM: build separate feature matrices for normal and abnormal orders.
def _build_feature_matrix(frame):
    '''
    Turn raw order rows into the numeric matrix layout used for training.

    Uses the encoder (model_dvtransform) and unique-value lists
    (unique_object) fitted in the training cell, so categories get the same
    integer codes as during training.

    :param frame: dataframe holding the categorical, date/time and numeric columns
    :return: array of encoded categoricals, date/time parts, then
        total_money and total_quantity
    '''
    encoded = str2int(frame, model_dvtransform, unique_object, training=False)
    time_parts = datetime2int(frame)
    amounts = frame[['total_money', 'total_quantity']]
    return np.hstack((encoded, time_parts, amounts))


X_white = drop_na_set.loc[drop_na_set['abnormal_label'] == 0]     # all normal orders
X_abnormal = drop_na_set.loc[drop_na_set['abnormal_label'] == 1]  # all abnormal orders

# Drop the first column (order_id) and split off the last column (label).
X_white_raw = X_white.iloc[:, 1:-1]
y_white_raw = X_white.iloc[:, -1]

X_abnormal_raw = X_abnormal.iloc[:, 1:-1]
y_abnormal_raw = X_abnormal.iloc[:, -1]

X_white_changed = _build_feature_matrix(X_white_raw)
X_abnormal_changed = _build_feature_matrix(X_abnormal_raw)

In [14]:
# Fit a one-class SVM on normal orders only and measure how well it separates
# normal from abnormal orders. "Positive" below means "abnormal".
from sklearn import svm
import time

start = time.time()
# NOTE(review): the features are passed unscaled; RBF kernels are sensitive to
# feature scale, which may explain why most normal orders are flagged - verify.
clf = svm.OneClassSVM(nu=0.1, kernel="rbf", gamma='auto')
clf.fit(X_white_changed[:50000]) # fit on the first 50000 normal rows only
y_pred_white = clf.predict(X_white_changed) # scored on ALL normal rows, including the rows used for fitting
y_pred_abnormal = clf.predict(X_abnormal_changed)
# predict() returns 1 for inliers and -1 for outliers.
n_error_train = y_pred_white[y_pred_white == -1].size # normal orders flagged as outliers
n_error_test = y_pred_abnormal[y_pred_abnormal == 1].size # abnormal orders accepted as inliers
print('time:',time.time() - start)
print('n_normal_acc:',1 - (n_error_train / len(y_pred_white)))
print('n_abnormal_acc:',1 - (n_error_test / len(y_pred_abnormal)))
TP = len(y_pred_abnormal) - n_error_test # abnormal orders correctly flagged
FN = n_error_test # abnormal orders missed
FP = n_error_train # normal orders wrongly flagged
precision = TP / (TP + FP)
recall = TP / (TP + FN)
print('precision:',precision)
print('recall:',recall)

time: 568.980211019516
n_normal_acc: 0.28391894866812795
n_abnormal_acc: 1.0
precision: 0.2743409183495315
recall: 1.0


In [15]:
# Score new orders with the one-class SVM. The loading and feature-building
# steps repeat the ones used for the voting-classifier prediction cell.
X_raw_data = pd.read_csv('data/new_abnormal_orders.csv', dtype=dtypes)
X_raw_new = X_raw_data.iloc[:, 1:] # drop the first column
str2int_data_new = str2int(X_raw_data, model_dvtransform, unique_object, training=False)
datetime2int_data_new = datetime2int(X_raw_new)
combine_set_new = np.hstack((str2int_data_new, datetime2int_data_new))
constant_set_new = X_raw_data[['total_money','total_quantity']]
X_combine_new = np.hstack((combine_set_new, constant_set_new))
# Labels follow the one-class SVM convention: -1 = outlier, 1 = inlier. This
# differs from the 0/1 labels predicted by the voting classifier.
y_predict = clf.predict(X_combine_new)
print('{:-^60}'.format('Predicted Labels:'))
print(y_predict)

---------------------Predicted Labels:----------------------
[-1  1  1  1  1  1  1]
