In [1]:
# -*-coding:utf-8-*-
!pip install seaborn
import numpy as np
import pandas as pd
import matplotlib as mpl
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
import warnings
from sklearn.metrics import f1_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import chi2, SelectPercentile
import gc
import time
from datetime import timedelta, datetime
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore")

Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/


In [2]:
print('reading data......')
trainFilepath = '/home/aistudio/data/data10020/round1_iflyad_anticheat_traindata.txt'
testFilepath = '/home/aistudio/data/data10020/round1_iflyad_anticheat_testdata_feature.txt'
df_train = pd.read_csv(trainFilepath, sep='\t')
df_test = pd.read_csv(testFilepath, sep='\t')
dataset = pd.concat([df_train, df_test], ignore_index=True)
dataset['label'] = dataset['label'].fillna(-1).astype(int)
del df_train, df_test
gc.collect()
print('Finished!')

reading data......
Finished!


In [3]:
# NA in dataset
col_na=dict()
for col in dataset.keys():
    col_na.setdefault(col,0)
    col_na[col] = dataset[col].isna().sum()
print('各个变量缺失值情况：\n',col_na)

各个变量缺失值情况：
 {'macmd5': 0, 'ip': 0, 'openudidmd5': 0, 'carrier': 0, 'reqrealip': 0, 'ver': 351016, 'osv': 13929, 'h': 0, 'make': 135343, 'dvctype': 0, 'ppi': 0, 'ntt': 0, 'os': 0, 'apptype': 0, 'sid': 0, 'mediashowid': 0, 'imeimd5': 0, 'province': 0, 'pkgname': 0, 'orientation': 0, 'label': 0, 'model': 4562, 'adidmd5': 0, 'w': 0, 'lan': 400862, 'city': 16660, 'idfamd5': 0, 'nginxtime': 0, 'adunitshowid': 0}


In [4]:
# 填充object类型特征的NA
object_col_name = dataset.dtypes[dataset.dtypes.values=='object'].keys().tolist()
for object_col in object_col_name:
    dataset[object_col] = dataset[object_col].fillna('Unknown')

In [5]:
# 数据清洗与预处理
print('data preprocessing......')
# 'orientation'中有异常值 90 和 2 ，将之都归为0
dataset.orientation[(dataset.orientation==2) | (dataset.orientation==90)] = 0

# 'carrier'中异常值 -1 ，归为0
dataset.carrier[dataset.carrier == -1] = 0

# 'ntt' 处理，0、7均为未知 -> 0； 1、2宽带 -> 1； 3未知移动网 -> 2；4、5、6移动网 -> 3
dataset.ntt[(dataset.ntt == 0) | (dataset.ntt == 7)] = 0
dataset.ntt[(dataset.ntt == 1) | (dataset.ntt == 2)] = 1
dataset.ntt[dataset.ntt == 3] = 2
dataset.ntt[(dataset.ntt >= 4) & (dataset.ntt <= 6)] = 3
# dataset.ntt[(dataset.carrier <= 0) | (dataset.carrier > 46003)]

# 处理时间特征
print('gaining time features......')
start = time.time()
dataset['datetime'] = pd.to_datetime(dataset['nginxtime']/1000,unit='s') + timedelta(hours=8)
dataset['hour'] = dataset.datetime.dt.hour
dataset['minute'] = dataset.datetime.dt.minute
dataset['day'] = dataset.datetime.dt.day - dataset.datetime.dt.day.min()
dataset['respond_time'] = abs(dataset['sid'].apply(lambda x : x.split('-')[-1]).astype(float)-dataset['nginxtime'])
dataset = dataset.drop(columns = ['datetime','nginxtime'])
print('Finished in %d seconds!' %(time.time()-start))
del start
# 处理 'make'
def making(x):
    x = x.lower()
    if 'iphone' in x or 'apple' in x or '苹果' in x:
        return 'iphone'
    elif 'huawei' in x or 'honor' in x or '华为' in x or '荣耀' in x:
        return 'huawei'
    elif 'xiaomi' in x or '小米' in x or 'redmi' in x:
        return 'xiaomi'
    elif '魅族' in x:
        return 'meizu'
    elif '金立' in x:
        return 'gionee'
    elif '三星' in x or 'samsung' in x:
        return 'samsung'
    elif 'vivo' in x:
        return 'vivo'
    elif 'oppo' in x:
        return 'oppo'
    elif 'lenovo' in x or '联想' in x:
        return 'lenovo'
    elif 'nubia' in x:
        return 'nubia'
    elif 'oneplus' in x or '一加' in x:
        return 'oneplus'
    elif 'smartisan' in x or '锤子' in x:
        return 'smartisan'
    elif '360' in x or '360手机' in x:
        return '360'
    elif 'zte' in x or '中兴' in x:
        return 'zte'
    else:
        return 'others'
dataset['make'] = dataset['make'].astype(str).apply(lambda x : x.lower())
dataset['make'] = dataset['make'].apply(making)
dataset.os[dataset.make == 'iphone'] = 'ios'
dataset.os[dataset.make != 'iphone'] = 'android'

# 处理'lan’
def lan(x):
    x = x.lower()
    if x in ['zh-cn','zh','cn','zh_cn','zh_cn_#hans','zh-']:
        return 'zh-cn'
    elif x in ['tw','zh-tw','zh_tw']:
        return 'zh-tw'
    elif 'en' in x:
        return 'en'
    elif 'hk' in x:
        return 'zh-hk'
    else:
        return x
dataset['lan'] = dataset['lan'].astype(str).apply(lambda x : x.lower())
dataset['lan'] = dataset['lan'].apply(lan)

# 粗略处理 'ver'
def ver_trans(x):
    x = str(x)
    for i in range(0,30):
        for j in range(0,10):
            if '3.'+str(j)+'.'+str(i) in x or '30'+str(j)+str(i) in x:
                return '3.'+str(j)+'.'+str(i)
    if '521000' in x or '5.2.1' in x:
        return '5.2.1'
    else:
        return x
dataset['ver']=dataset.ver.apply(ver_trans).fillna('Unknown')


data preprocessing......
gaining time features......
Finished in 8 seconds!


In [6]:
# 统计同一设备下的ip数
start = time.time()
print('counting ip number in different macmd5......')
temp1 = dataset.groupby(['macmd5','ip'])
temp2 = temp1.size().reset_index(name='mac_ip').drop('ip',axis=1)
temp3 = temp2.groupby(by='macmd5').count().reset_index()
dataset = pd.merge(dataset,temp3,how='left',on='macmd5').fillna(0)
print('Finished in %d sec' %(time.time()-start))
del start,temp1,temp2,temp3
dataset.mac_ip[dataset.macmd5=='empty'] = 0
start = time.time()
print('counting ip number in different imeimd5......')
temp1 = dataset.groupby(['imeimd5','ip'])
temp2 = temp1.size().reset_index(name='imei_ip').drop('ip',axis=1)
temp3 = temp2.groupby(by='imeimd5').count().reset_index()
dataset = pd.merge(dataset,temp3,how='left',on='imeimd5').fillna(0)
print('Finished in %d sec' %(time.time()-start))
del start,temp1,temp2,temp3
dataset.imei_ip[dataset.imeimd5=='empty'] = 0

counting ip number in different macmd5......
Finished in 9 sec
counting ip number in different imeimd5......
Finished in 15 sec


In [7]:
# One-Hot 编码特征
onehot_col_name = ['mediashowid','osv','province','dvctype','apptype','os','make','lan','ntt','carrier','ppi','city']
ohe = OneHotEncoder()
print('One-Hot Encoding......')
ohe_result = ohe.fit_transform(dataset[onehot_col_name])
# 高维稀疏特征，通过TruncatedSVD方法降维
tsvd = TruncatedSVD(n_components=150)
decomposition_feature = tsvd.fit_transform(ohe_result)
mm = MinMaxScaler()
decomposition_feature = mm.fit_transform(decomposition_feature)
print('降维后数据方差解释率为：',tsvd.explained_variance_ratio_.sum())

One-Hot Encoding......
降维后数据方差解释率为： 0.9115884842818075


In [8]:
decom = dict()
for i in range(decomposition_feature.shape[1]):
    col_name = 'TSVD'+str(i)
    decom.setdefault(col_name,[])
    decom[col_name] = decom[col_name]+decomposition_feature[:,i].tolist()
decom = pd.DataFrame(decom)
del decomposition_feature
print(decom.head())

      TSVD0     TSVD1    TSVD10   TSVD100   TSVD101   TSVD102   TSVD103  \
0  0.730870  0.158168  0.579168  0.418900  0.218006  0.456457  0.402786   
1  0.535318  0.677373  0.636917  0.391467  0.190527  0.456830  0.409331   
2  0.912436  0.206114  0.552902  0.376713  0.235333  0.466022  0.399146   
3  0.530898  0.744761  0.336806  0.393727  0.198086  0.466891  0.401612   
4  0.855237  0.347230  0.287094  0.407842  0.210157  0.487930  0.395314   

    TSVD104   TSVD105   TSVD106  ...    TSVD90    TSVD91    TSVD92    TSVD93  \
0  0.236495  0.412124  0.365799  ...  0.406208  0.483100  0.305829  0.440837   
1  0.215957  0.407825  0.354410  ...  0.355379  0.476560  0.342578  0.459193   
2  0.236696  0.414698  0.368558  ...  0.483426  0.483188  0.212300  0.342869   
3  0.218846  0.411580  0.354502  ...  0.369387  0.478911  0.354852  0.439324   
4  0.237139  0.412057  0.365827  ...  0.383559  0.486163  0.334982  0.468335   

     TSVD94    TSVD95    TSVD96    TSVD97    TSVD98    TSVD99  
0  0

In [9]:
# 统计特征值出现次数
def get_counts_feature(data,col_name):
    print(col_name,'value counting......')
    temp = pd.DataFrame(data[col_name].value_counts().reset_index())
    temp.columns = [col_name,'counts']
    result = pd.merge(data,temp,how='left',on=col_name)['counts']
    return result.fillna(0)
counts_col = ['pkgname', 'adunitshowid', 'ip', 'reqrealip',
                    'adidmd5', 'imeimd5', 'idfamd5', 'macmd5','reqrealip','ver','model']
for col in counts_col:
    dataset[col+'_counts'] = get_counts_feature(dataset,col_name=col)
del col
counts_col_name = list(map(lambda x : x+'_counts',counts_col))

pkgname value counting......
adunitshowid value counting......
ip value counting......
reqrealip value counting......
adidmd5 value counting......
imeimd5 value counting......
idfamd5 value counting......
macmd5 value counting......
reqrealip value counting......
ver value counting......
model value counting......


In [10]:
# LabelEncode
def labelcoder(col,data):
    lbe = LabelEncoder()
    result = lbe.fit_transform(data[col])
    result_name = col+'_LabelCode'
    data[result_name] = result
    return data
label_code_col = ['ip','reqrealip','w','h']
for i in label_code_col:
    dataset = labelcoder(i, dataset)
labelcode_col = list(map(lambda x : x+'_LabelCode',label_code_col))

In [23]:
numerical_col_name = ['hour','minute','orientation','day','respond_time','mac_ip','imei_ip']+counts_col_name+labelcode_col
surplus_col_name= set(dataset.keys().tolist())-set(onehot_col_name)-set(numerical_col_name)-set(counts_col)
print(surplus_col_name)
x = pd.concat([dataset[numerical_col_name],decom],axis=1)
y = dataset.label
del decom
data = pd.concat([pd.concat([x,y],axis=1),dataset.sid],axis=1)

{'label', 'openudidmd5', 'h', 'w', 'sid'}


In [24]:
# 特征选择后划分数据集
x = data.drop(['sid','label'],axis=1)
trainx = x[:1000000]
predict_x = x[1000000:]
def featureSelect(x_train,y_train,x_val,func=chi2,percentile=80):
    model = SelectPercentile(func,percentile=percentile)
    model.fit(x_train,y_train)
    x_train = model.transform(x_train)
    x_val = model.transform(x_val)
    return x_train,x_val
trainx,predict_x = featureSelect(trainx,data.label[data.label != -1].values,predict_x)
x_train,x_val,y_train,y_val = train_test_split(trainx,data.label[data.label != -1].values,
train_size=0.8,random_state=6)

In [25]:
!pip install lightgbm
from lightgbm import LGBMClassifier
!pip install xgboost
from xgboost import XGBClassifier

In [26]:
def f1_metric(labels, preds):
    score = f1_score(labels, np.round(preds))
    return 'f1', score, True

def f1_xgb(preds,dtrain):
    labels = dtrain.get_label()
    score = f1_score(labels, np.round(preds))
    return 'f1', -score

In [27]:
# 基于hyperopt进行参数调优
!pip install hyperopt
from hyperopt import fmin, tpe, hp, partial

In [31]:
# LGB 参数调优
# 定义参数空间
space_lgb = {"max_depth": hp.randint("max_depth", 15),
         "n_estimators": hp.randint("n_estimators", 5000),
         "learning_rate": hp.uniform("learning_rate", 0.001, 0.5),
         "subsample": hp.uniform("subsample", 0.3, 1),
         "colsample_bytree":hp.uniform("colsample_bytree", 0.3, 1),
         "num_leaves": hp.randint("num_leaves", 10),
         "subsample_freq": hp.randint("subsample_freq",5),
         "reg_alpha": hp.uniform("reg_alpha", 0.1, 3),
         "reg_lambda": hp.uniform("reg_lambda", 0.1, 3)
         }
def argsDict_tranform_lgb(argsDict, isPrint=False):
    argsDict["max_depth"] = argsDict["max_depth"] + 6
    argsDict["n_estimators"] = argsDict['n_estimators'] + 300
    argsDict["learning_rate"] = argsDict["learning_rate"] * 0.02 + 0.05
    argsDict["num_leaves"] = argsDict["num_leaves"] * 8 + 20
    if isPrint:
        print(argsDict)
    else:
        pass
    return argsDict

def get_tranformer_score(tranformer):
    model = tranformer
    prediction = model.predict(x_val, num_iteration=model.best_iteration_)
    return -1*f1_score(y_val,np.round(prediction))

# 创建模型工场
def lightgbm_factory(argsDict):
    argsDict = argsDict_tranform_lgb(argsDict)
    lgb = LGBMClassifier(max_depth=argsDict['max_depth'],n_estimators=argsDict['n_estimators'],
                        learning_rate=argsDict['learning_rate'],subsample=argsDict['subsample'],
                        num_leaves=argsDict['num_leaves'],objective='binary',
                        colsample_bytree=argsDict['colsample_bytree'],reg_alpha=argsDict['reg_alpha'],
                        reg_lambda=argsDict['reg_lambda'],random_seed=66)
    lgb.fit(x_train,y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_names=['train', 'val'],
    eval_metric=f1_metric,
    verbose = 50,
    early_stopping_rounds=50
    )
    return get_tranformer_score(lgb)
    
algo = partial(tpe.suggest, n_startup_jobs=1)
best = fmin(lightgbm_factory, space_lgb, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)
print('The best params are:',best)

In [33]:
best =  {'num_leaves': 7, 'max_depth': 12, 'colsample_bytree': 0.3273079033503896,
'reg_alpha': 2.984483664182527, 'subsample': 0.3126121982112917,'reg_lambda': 0.2449221888407772,
'learning_rate': 0.49692793913575733, 'subsample_freq': 3, 'n_estimators': 3854}
best = argsDict_tranform_lgb(best)
lgb = LGBMClassifier(max_depth=best['max_depth'],n_estimators=best['n_estimators'],
                        learning_rate=best['learning_rate'],subsample=best['subsample'],
                        num_leaves=best['num_leaves'],objective='binary',
                        colsample_bytree=best['colsample_bytree'],reg_alpha=best['reg_alpha'],
                        subsample_freq=best['subsample_freq'],reg_lambda=best['reg_lambda'],random_seed=66)

print('feature percentile=%d' %i)
trainx,predict_x = featureSelect(trainx,data.label[data.label != -1].values,predict_x)
x_train,x_val,y_train,y_val = train_test_split(trainx,data.label[data.label != -1].values,
train_size=0.8,random_state=6)
# lgb
lgb.fit(x_train,y_train,eval_set=[(x_train, y_train), (x_val, y_val)],
eval_names=['train', 'val'],eval_metric=f1_metric,early_stopping_rounds=100,verbose=50)

In [18]:
# XGB 参数调优
# 定义参数空间
space_xgb = {
    'n_estimators':hp.randint('n_estimators',2000),
    'max_depth':hp.randint('max_depth',10),
    'subsample':hp.uniform('subsample',0.5,1),
    'min_child_weight':hp.randint('min_child_weight',500),
    'eta':hp.uniform('eta',0,0.4),
    'colsample_bytree':hp.uniform('colsample_bytree',0.5,1),
    "reg_alpha": hp.uniform("reg_alpha", 0.1, 3),
    "reg_lambda": hp.uniform("reg_lambda", 0.1, 3)
}

def argsDict_tranform_xgb(argsDict, isPrint=False):
    argsDict["max_depth"] = argsDict["max_depth"] + 6
    argsDict["n_estimators"] = argsDict['n_estimators'] + 500
    if isPrint:
        print(argsDict)
    else:
        pass
    return argsDict

def get_tranformer_score(tranformer):
    model = tranformer
    prediction = model.predict(x_val)
    return -1*f1_score(y_val,prediction)
    
# 创建模型工场
def xgb_factory(argsDict):
    argsDict = argsDict_tranform_xgb(argsDict)
    xgb = XGBClassifier(max_depth=argsDict['max_depth'],n_estimators=argsDict['n_estimators'],
                        learning_rate=argsDict['eta'],subsample=argsDict['subsample'],
                        colsample_bytree=argsDict['colsample_bytree'],reg_alpha=argsDict['reg_alpha'],
                        reg_lambda=argsDict['reg_lambda'],random_seed=66,n_jobs=8)
    xgb.fit(x_train,y_train,
    eval_set=[(x_train, y_train), (x_val, y_val)],
    eval_metric=f1_xgb,
    verbose = 50,
    early_stopping_rounds=50)
    return get_tranformer_score(xgb)
algo = partial(tpe.suggest, n_startup_jobs=1)
best_xgb = fmin(xgb_factory, space_xgb, algo=algo, max_evals=20, pass_expr_memo_ctrl=None)
print('The best params are:',best_xgb)

In [22]:
best_xgb = {'min_child_weight': 334, 'n_estimators': 69, 'max_depth': 2,
            'eta': 0.12371645516780436, 'reg_alpha': 2.9121568142476426,
            'subsample': 0.9430955192093492, 'reg_lambda': 1.4594989969245042,
            'colsample_bytree': 0.9982012497911378}
best_xgb = argsDict_tranform_xgb(best_xgb)
xgb = XGBClassifier(max_depth=best_xgb['max_depth'],n_estimators=best_xgb['n_estimators'],
                        learning_rate=best_xgb['eta'],subsample=best_xgb['subsample'],
                        colsample_bytree=best_xgb['colsample_bytree'],reg_alpha=best_xgb['reg_alpha'],
                        reg_lambda=best_xgb['reg_lambda'],random_seed=66,n_jobs=8)

In [27]:
# Extra Tree
from sklearn.ensemble import ExtraTreesClassifier
param_grid = {'n_estimators':[300,600,1200]}
model2 = ExtraTreesClassifier(verbose=10,n_jobs=8)
kfold = StratifiedKFold(n_splits=3, shuffle = True,random_state=7)
gsv2 = GridSearchCV(model2,param_grid,scoring = 'f1',cv = kfold)
print('fitting......')
gsv2_result = gsv2.fit(x_train,y_train)
print("Best of ETC: %f using %s" % (gsv2_result.best_score_,gsv2_result.best_params_))

In [19]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier(n_estimators=1000,n_jobs=8,verbose=10)

In [20]:
# stacking Class
class Ensemble(object):
    def __init__(self, n_splits, stacker, base_models):
        self.n_splits = n_splits
        self.stacker = stacker
        self.base_models = base_models

    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=66).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        
        for i, clf in enumerate(self.base_models):
            S_test_i = np.zeros((T.shape[0], self.n_splits))
            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]
                print ("Fit Model %d fold %d" % (i, j))
                if str(clf)[:3].lower() == 'xgb':
                    clf.fit(X_train, y_train,
                    eval_set=[(X_train, y_train), (X_holdout, y_holdout)],
                    eval_metric=f1_xgb,verbose = 50,early_stopping_rounds=50)
                elif str(clf)[:3].lower == 'lgb':
                    clf.fit(X_train,y_train,eval_set=[(X_train, y_train), (X_holdout, y_holdout)],
                    eval_names=['train', 'val'],eval_metric=f1_metric,early_stopping_rounds=100,verbose=50)
                else:
                    clf.fit(X_train,y_train)
                try:
                    y_pred = clf.predict_proba(X_holdout)[:]
                    S_train[test_idx, i] = y_pred[:,1]
                except:
                    y_pred = clf.predict(X_holdout)[:]
                    S_train[test_idx, i] = y_pred
                try:
                    S_test_i[:, j] = clf.predict_proba(T)[:,1][:]
                except:
                    S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(axis=1)
        self.stacker.fit(S_train, y)
        res = self.stacker.predict(S_test)[:]
        return res

In [24]:
from sklearn.linear_model import LinearRegression
stack = Ensemble(n_splits=3,
        stacker=LinearRegression(),
        base_models=(xgb,etc,lgb))
# y_val_pre = stack.fit_predict(x_train, y_train, x_val)
# f1_metric(y_val,np.round(y_val_pre))

In [25]:
result = np.round(stack.fit_predict(trainx,data.label[data.label != -1].values, predict_x))
submit = pd.DataFrame()
submit['sid'] = data[data.label == -1]['sid']
submit['label'] = result
filename = 'submission'+ datetime.now().strftime('%m%d_%H%M')+'.csv'
submit.to_csv(filename,index=False)