In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
# 导入评估指标模块
from sklearn.metrics import accuracy_score, auc, confusion_matrix, f1_score, precision_score, recall_score, roc_curve
# 导入表格库
import prettytable
# 导入dot插件库
import pydotplus
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


In [2]:
labelled_data = pd.read_csv('labelled_whitelist_shop2.csv')
labelled_data.shape
# labelled_data['kmeans_label']

(536, 17)

In [3]:
labelled_data.info()
# labelled_data.tail(20)
print(labelled_data['kmeans.labels'].value_counts())
print(labelled_data['agglomerative.average.labels'].value_counts())
print(labelled_data['agglomerative.complete.labels'].value_counts())
print(labelled_data['meanshift.labels'].value_counts())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536 entries, 0 to 535
Data columns (total 17 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   manage_shop_indicator          536 non-null    float64
 1   officialstore_indicator        536 non-null    float64
 2   preferred_shop_indicator       536 non-null    float64
 3   crossborder_indicator          536 non-null    float64
 4   shop_category                  536 non-null    float64
 5   new_seller_flag                536 non-null    float64
 6   seller_centre_login_L30D       536 non-null    float64
 7   shop_sku_number                536 non-null    float64
 8   shop_follower_number           536 non-null    float64
 9   shop_L180D_order               536 non-null    float64
 10  weighted_shop_rating           536 non-null    float64
 11  kmeans.labels                  536 non-null    int64  
 12  agglomerative.average.labels   536 non-null    int

In [4]:
#process non-whitelist data
non_whitelist = pd.read_csv('non_whitelist_filled.csv')
nw_data = non_whitelist.groupby("shop_index").mean()
nw_data = nw_data.drop(columns=['decorated_indicator','Unnamed: 0','performance_date','masked_item_impression','masked_order','masked_shop_page_view','masked_shop_click_from_search','masked_campaign_tab_click','masked_other_tab_click'])
nw_data.head(20)
nw_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26018 entries, 119700 to 1486953315
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   manage_shop_indicator     26018 non-null  float64
 1   officialstore_indicator   26018 non-null  float64
 2   preferred_shop_indicator  26018 non-null  float64
 3   crossborder_indicator     26018 non-null  float64
 4   shop_category             26018 non-null  float64
 5   new_seller_flag           26018 non-null  float64
 6   seller_centre_login_L30D  26018 non-null  float64
 7   shop_sku_number           26018 non-null  float64
 8   shop_follower_number      26018 non-null  float64
 9   shop_L180D_order          26018 non-null  float64
 10  weighted_shop_rating      26018 non-null  float64
dtypes: float64(11)
memory usage: 2.4 MB


In [29]:
# agglomerative.average.labels 
#define x,y
x = labelled_data.iloc[:,:11]
y = labelled_data.iloc[:,-5]
#get train/test data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=2018)


In [30]:
#Decision Tree Classifier
#model training
dt_model = DecisionTreeClassifier(random_state=2018)
dt_model.fit(x_train, y_train)
#test the model with test dataset
pre_y = dt_model.predict(x_test)
accuracy_s = accuracy_score(y_test, pre_y)
accuracy_s
y_score = dt_model.predict_proba(x_test)
y_score

#prediction on non-whitelist data
prediction1 = dt_model.predict(nw_data)
prediction1
prob1 = dt_model.predict_proba(nw_data)
prob1[0:20]

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.]])

In [31]:
#random forest(currently good)
from sklearn.ensemble import RandomForestClassifier
# Create the model with 100 trees
rf_model = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
rf_model.fit(x_train, y_train)

# Actual class predictions
rf_predictions = rf_model.predict(x_test)

# Probabilities for each class
rf_probs =rf_model.predict_proba(x_test)[:, 1]
# rf_probs
rf_predictions

#prediction on non_whitelist data
predictions = rf_model.predict(nw_data)
print(predictions[20:40])
proba = rf_model.predict_proba(nw_data)
print(proba[20:40])
proba = list(proba)
count = 0
for i in proba:
    if max(i)<0.7:
        count += 1
print("unmatched num: ",count)

[0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0]
[[0.89 0.09 0.01 0.   0.01]
 [0.98 0.02 0.   0.   0.  ]
 [0.95 0.   0.05 0.   0.  ]
 [0.34 0.51 0.05 0.08 0.02]
 [0.97 0.02 0.01 0.   0.  ]
 [0.96 0.03 0.01 0.   0.  ]
 [0.93 0.04 0.03 0.   0.  ]
 [0.77 0.15 0.07 0.   0.01]
 [0.96 0.02 0.02 0.   0.  ]
 [0.1  0.86 0.01 0.03 0.  ]
 [0.75 0.03 0.18 0.   0.04]
 [0.94 0.03 0.03 0.   0.  ]
 [0.95 0.   0.05 0.   0.  ]
 [0.97 0.02 0.01 0.   0.  ]
 [0.92 0.03 0.05 0.   0.  ]
 [0.96 0.   0.04 0.   0.  ]
 [0.86 0.06 0.08 0.   0.  ]
 [0.75 0.19 0.04 0.01 0.01]
 [0.96 0.   0.03 0.   0.01]
 [0.96 0.01 0.02 0.   0.01]]
unmatched num:  7682


In [89]:
#GradientBoostingClassifier
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_curve,auc
from sklearn import metrics
from sklearn.model_selection import GridSearchCV 
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

#data
x = labelled_data.iloc[:,:11]
y = labelled_data.iloc[:,-5]
x_train, x_test, y_train, y_test = train_test_split(x, y,random_state=0,train_size=0.7)
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

#model training
model_GBDT = GradientBoostingClassifier(random_state=10)
model_GBDT.fit(x_train,y_train)
y_pred = model_GBDT.predict(x_train)
y_predprob = model_GBDT.predict_proba(x_train)[:,1]

#predict on non whitelist data
model_GBDT.predict(nw_data.head(20))
prob_GBDT = model_GBDT.predict_proba(nw_data)
prob_GBDT
count_GBDT = 0
for i in range(len(prob_GBDT)):
    if np.max(prob_GBDT[i])<0.8:
        count_GBDT += 1
print(count_GBDT)
#     print(np.max(prob_GBDT[i]))

0


0.8298457409063978

In [11]:
#XGBoost (----------------to be edited, 还是报错)
import numpy as np
import pandas as pd 
import xgboost as xgb
import matplotlib.pyplot as plt
plt.style.use("ggplot")
%matplotlib inline

In [12]:
x = labelled_data.iloc[:,:-4]
y = labelled_data.iloc[:,-4]
X_train, X_test, y_train, y_test = train_test_split(x, y,random_state=0,train_size=0.7)

In [13]:
params = {
    'booster': 'gbtree',
    'objective': 'multi:softmax',
    'num_class': 3,
    'gamma': 0.1,
    'max_depth': 6,
    'lambda': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.75,
    'min_child_weight': 3,
    'eta': 0.1,
    'seed': 1,
    'nthread': 4,
}

plst = list(params.items())

dtrain = xgb.DMatrix(X_train, y_train) # 生成数据集格式
num_rounds = 500
model = xgb.train(plst, dtrain, num_rounds) # xgboost模型训练

# 对测试集进行预测
dtest = xgb.DMatrix(X_test)
y_pred = model.predict(dtest)

# 计算准确率
accuracy = accuracy_score(y_test,y_pred)
print("accuarcy: %.2f%%" % (accuracy*100.0))

# 显示重要特征
plot_importance(model)
plt.show()

XGBoostError: [20:53:38] C:/Users/Administrator/workspace/xgboost-win64_release_1.4.0/src/objective/multiclass_obj.cu:120: SoftmaxMultiClassObj: label must be in [0, num_class).

In [22]:
# agglomerative.complete.labels 
#define x,y
x1 = labelled_data.iloc[:,:11]
y1 = labelled_data.iloc[:,-4]
#get train/test data
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size=0.3, random_state=2018)





In [24]:
#Decision Tree Classifier
#model training
dt_model1 = DecisionTreeClassifier(random_state=2018)
dt_model1.fit(x1_train, y1_train)
#test the model with test dataset
pre_y1 = dt_model1.predict(x1_test)
accuracy_s1 = accuracy_score(y1_test, pre_y1)
accuracy_s1
y1_score = dt_model1.predict_proba(x1_test)
y1_score

#prediction on non-whitelist data
prediction1 = dt_model1.predict(nw_data)
prediction1
prob1 = dt_model1.predict_proba(nw_data)
prob1[0:20]

array([[1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.]])

In [27]:
#random forest(currently good)
from sklearn.ensemble import RandomForestClassifier
# Create the model with 100 trees
rf_model1 = RandomForestClassifier(n_estimators=100, 
                               bootstrap = True,
                               max_features = 'sqrt')
# Fit on training data
rf_model1.fit(x1_train, y1_train)

# Actual class predictions
rf_predictions1 = rf_model1.predict(x1_test)

# Probabilities for each class
rf_probs1 =rf_model1.predict_proba(x1_test)[:, 1]
# rf_probs
rf_predictions

#prediction on non_whitelist data
predictions1 = rf_model1.predict(nw_data)
print(predictions1[20:40])
proba1 = rf_model1.predict_proba(nw_data)
print(proba1[20:40])
proba1 = list(proba1)
count1 = 0
for i in proba1:
    if max(i)<0.7:
        count1 += 1
print("unmatched num: ",count1)

[0 0 0 2 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0]
[[0.88 0.04 0.07 0.   0.01]
 [0.87 0.07 0.05 0.   0.01]
 [0.88 0.09 0.02 0.   0.01]
 [0.19 0.23 0.42 0.11 0.05]
 [0.89 0.08 0.02 0.   0.01]
 [0.27 0.69 0.01 0.   0.03]
 [0.78 0.15 0.05 0.   0.02]
 [0.78 0.04 0.12 0.02 0.04]
 [0.82 0.13 0.05 0.   0.  ]
 [0.28 0.01 0.66 0.04 0.01]
 [0.18 0.76 0.02 0.   0.04]
 [0.88 0.1  0.01 0.   0.01]
 [0.9  0.07 0.02 0.   0.01]
 [0.91 0.06 0.03 0.   0.  ]
 [0.85 0.08 0.06 0.   0.01]
 [0.89 0.08 0.02 0.   0.01]
 [0.74 0.21 0.02 0.01 0.02]
 [0.89 0.01 0.06 0.04 0.  ]
 [0.89 0.07 0.03 0.   0.01]
 [0.88 0.08 0.04 0.   0.  ]]
unmatched num:  9266


In [None]:
x1 = labelled_data.iloc[:,:11]
y1 = labelled_data.iloc[:,-4]
x_train, x_test, y_train, y_test = train_test_split(x, y,random_state=0,train_size=0.7)
ss_x = StandardScaler()
ss_y = StandardScaler()
x_train = ss_x.fit_transform(x_train)
x_test = ss_x.transform(x_test)

#model training
model_GBDT = GradientBoostingClassifier(random_state=10)
model_GBDT.fit(x_train,y_train)
y_pred = model_GBDT.predict(x_train)
y_predprob = model_GBDT.predict_proba(x_train)[:,1]

#predict on non whitelist data
model_GBDT.predict(nw_data.head(20))
prob_GBDT = model_GBDT.predict_proba(nw_data)
prob_GBDT
count_GBDT = 0
for i in range(len(prob_GBDT)):
    if np.max(prob_GBDT[i])>0.9:
        count_GBDT += 1
print(count_GBDT)
#     print(np.max(prob_GBDT[i]))