In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [24]:
train = pd.read_csv('C:/Users/Administrator/Desktop/移动广告反欺诈算法挑战/移动广告反欺诈算法挑战赛-0621/移动广告反欺诈算法挑战赛/round1_iflyad_anticheat_traindata.txt',encoding='utf-8',sep='\t')
test = pd.read_csv('C:/Users/Administrator/Desktop/移动广告反欺诈算法挑战/移动广告反欺诈算法挑战赛-0621/移动广告反欺诈算法挑战赛/round1_iflyad_anticheat_testdata_feature.txt',encoding='utf-8',sep='\t')

In [3]:
def _process(df):
    #缺失值处理
    #city用众数填充
    df['city'] = df['city'].fillna('北京市')
    #lan用众数填充
    language = ['zh-CN','Zh-CN', 'zh-cn', 'cn', 'zh_CN', 'zh', 'ZH', 'CN', 'tw','zh_CN_#Hans','zh-TW','zh-HK','zh-US','zh_HK_#Hant',
               'zh-MO','zh-','zh_TW']
    df.loc[df['lan'].isin(language),'lan'] = 'cn'
    df['lan'] = df['lan'].fillna('cn')
    #make用众数填充
    df['make'] = df['make'].fillna('oppo')
    #model用众数填充
    df['model'] = df['model'].fillna('PBAMOO')
    #osv用众数填充
    df['osv'] = df['osv'].fillna('8.1.0')
    #ver用众数填充
    df['ver'] = df['ver'].fillna('30927000')
    
    #无用的特征，先删除
    df.drop(['sid','nginxtime','ip','reqrealip'],axis=1,inplace=True)
    
    #运营商
    df.loc[df['carrier'] == 46000,'carrier'] = 1
    df.loc[df['carrier'] == 46001,'carrier'] = 2
    df.loc[df['carrier'] == 46003,'carrier'] = 3
    df['make'] = df['make'].str.lower()
    df['os'] = df['os'].str.lower()
    
    #将分类太多的特征取数量排前20的值，其他变成others，方便one-hot
    citys = []
    for i in df['city'].value_counts().head(20).index:
        citys.append(i)
    df.loc[~df['city'].isin(citys),'city'] = 'others'

    makes = []
    for i in df['make'].value_counts().head(20).index:
        makes.append(i)
    df.loc[~df['make'].isin(makes),'make'] = 'others'

    models = []
    for i in df['model'].value_counts().head(20).index:
        models.append(i)
    df.loc[~df['model'].isin(models),'model'] = 'others'

    osvs = []
    for i in df['osv'].value_counts().head(20).index:
        osvs.append(i)
    df.loc[~df['osv'].isin(osvs),'osv'] = 'others'
    return df

In [4]:
train_data = _process(train)
test_data = _process(test)

In [5]:
##结合数据，方便提取特征：axis=0 纵向合并；axis=1 横向合并
data=pd.concat([train_data,test_data],axis=0).reset_index(drop=True)

#用sklearn进行labelEncode
def one_hot_col(col):
    '''标签编码'''
    lbl = LabelEncoder()
    lbl.fit(col)
    return lbl

object_cols=list(data.dtypes[data.dtypes==np.object].index) ##返回字段名为object类型的字段
##对object类型的字段进行标签编码：
for col in object_cols:
    if col!='sid':
        data[col]=one_hot_col(data[col].astype(str)).transform(data[col].astype(str))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  


In [6]:
##划分数据：
train1=data[:train_data.shape[0]].drop('label',axis=1)
label1=train['label'].values
test1=data[train_data.shape[0]:].reset_index(drop=True).drop('label',axis=1)

In [7]:
X = train1
y = label1

In [8]:
X_train,X_test,y_train,y_test = train_test_split(X,y,train_size=0.8)



In [52]:
# 训练模型
model1 = xgb.XGBClassifier()
model1.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=3, min_child_weight=1, missing=None,
       n_estimators=100, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [53]:
# 对测试集进行预测
preds = model1.predict(X_test)

predictions = [round(value) for value in preds]

test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

Test Accuracy: 89.14%


In [15]:
model2 = xgb.XGBClassifier(learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
model2.fit(X_train, y_train)

preds = model2.predict(X_test)

predictions = [round(value) for value in preds]

test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

Test Accuracy: 93.14%


In [21]:
model3 = xgb.XGBClassifier(learning_rate =0.1,
 n_estimators=1000,
 max_depth=4,
 min_child_weight=6,                          
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,
 seed=27)
model3.fit(X_train, y_train)

preds = model3.predict(X_test)

predictions = [round(value) for value in preds]

test_accuracy = accuracy_score(y_test, predictions)
print("Test Accuracy: %.2f%%" % (test_accuracy * 100.0))

Test Accuracy: 92.94%


In [22]:
result_label = model2.predict(test1)

In [26]:
sid = test['sid']
xgb_result2 = pd.DataFrame({'sid':sid,'label':result_label})
xgb_result2.to_csv('xgb_result.csv',index=None)