### 前言
这个baseline是参考某大佬[github开源](https://github.com/biaobiao2/DC_phone/blob/master/code/baseline.py)    
在开源基础上进行一些简单的修改，线上实际可达0.92+     
具体.92的模型或参数较乱，可能线上达不到.92，各位大佬自行探索一下     

In [18]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

from scipy.stats import stats
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option("display.max_colwidth",100)
pd.set_option('display.width',1000)
import os
os.chdir("F:/MyCode/SiChuan/诈骗电话识别/")

In [2]:
def get_user_feats(df_train,df_test):
    '''
    @Param：
        训练集和测试集的user表
    @Function：
        对地区特征的处理采用坏样本率替换，取消原baseline的编码替换
    @Return：
        返回地区处理后user表
    '''
    print('USER')
    # 空值替换为未知地域
    df_train[['city_name','county_name']] = df_train[['city_name','county_name']].fillna('未知')
    df_test[['city_name','county_name']] = df_test[['city_name','county_name']].fillna('未知')
    
    # 计算地区的坏样本率：地区坏样本率为某地区的坏样本占该地区所有样本的比值
    # 以及整体的坏样本率：整体坏样本率为该地区的坏样本占所有样本的比值
    city_1 = df_train.groupby(['city_name','label'])['label'].count()[:,1] \
        / df_train.groupby('city_name')['label'].count()
    city_2 = df_train.groupby(['city_name','label'])['label'].count()[:,1] \
        / df_train.groupby('label')['city_name'].count()[1]
    
    df_train["city_name_1"] = df_train["city_name"].replace(city_1.to_dict())
    df_train["city_name_2"] = df_train["city_name"].replace(city_2.to_dict())
    df_test["city_name_1"] = df_test["city_name"].replace(city_1.to_dict())
    df_test["city_name_2"] = df_test["city_name"].replace(city_2.to_dict())
    
    # 计算分公司的坏样本率：同上
    # 及整体的坏样本率：同上
    county_1 = df_train.groupby(['county_name','label'])['label'].count()[:,1] \
        / df_train.groupby('county_name')['label'].count()
    county_2 = df_train.groupby(['county_name','label'])['label'].count()[:,1] \
        / df_train.groupby('label')['county_name'].count()[1]
    
    # 对于county_2计算中无诈骗电话的分公司在county_2中未出现，用0进行填充
    ind = [x for x in county_1.index if x not in county_2.index]
    ind1 = [x for x in set(df_test.county_name) if x not in list(df_train.county_name)]
    # 在测试集中有部分分公司未在训练集中出现，无法采集坏样本率同样用0填充
    ind = ind + ind1
    ser = pd.Series(0,index=ind)
    county_2 = county_2.append(ser)
    county_1 = county_1.append(pd.Series(0,index=ind1))
    county_1 = county_1.fillna(0)
    
    df_train["county_name_1"] = df_train["county_name"].replace(county_1.to_dict())
    df_train["county_name_2"] = df_train["county_name"].replace(county_2.to_dict())
    df_test["county_name_1"] = df_test["county_name"].replace(county_1.to_dict())
    df_test["county_name_2"] = df_test["county_name"].replace(county_2.to_dict())
    
    df_train.drop(['city_name','county_name'],axis=1,inplace=True)
    df_test.drop(['city_name','county_name'],axis=1,inplace=True)
    
    return df_train,df_test

In [3]:
# 流量统计
def get_app_feats(df):
    '''
    @Function：
        流量统计函数，该函数在原有baseline基础上,选用3月份数据
    '''
    print('APP')
    df = df[df['month_id'] == '2020-03']
    phones_app = df[["phone_no_m"]].copy()
    phones_app = phones_app.drop_duplicates(subset=['phone_no_m'], keep='last')
    tmp = df.groupby("phone_no_m")["busi_name"].agg(busi_count="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    '''
    使用的流量统计
    '''
    tmp = df.groupby("phone_no_m")["flow"].agg(flow_mean="mean", 
                                               flow_median = "median", 
                                               flow_min  = "min", 
                                               flow_max = "max", 
                                               flow_var = "var",
                                               flow_sum = "sum")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["month_id"].agg(month_ids ="nunique")
    phones_app = phones_app.merge(tmp, on="phone_no_m", how="left")
    # 月流量使用统计
    phones_app["flow_month"] = phones_app["flow_sum"] / phones_app["month_ids"]

    return phones_app

In [4]:
# 通话记录统计
def get_voc_feat(df):
    '''
    @Function：
        在原有baseline基础上加入week统计指标，同时只选取3月份数据
        参考2016诈骗电话活动规律与行为特征分析报告，指出诈骗电话周一到周五活跃度较高
        
        通话记录还有很多其他强特有待挖掘
    '''
    print('VOC')
    df["start_datetime"] = pd.to_datetime(df['start_datetime'])
    df = df[df['start_datetime'] >= '2020-03-01 00:00:00']
    df["hour"] = df['start_datetime'].dt.hour
    df["day"] = df['start_datetime'].dt.day
    df["week"] = df['start_datetime'].dt.weekday
    
    # print(df.head())
    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    #对话人数和对话次数
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(opposite_count="count", opposite_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """主叫通话
    """
    df_call = df[df["calltype_id"]==1].copy()
    tmp = df_call.groupby("phone_no_m")["imei_m"].agg(voccalltype1="count", imeis="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    phone_no_m["voc_calltype1"] = phone_no_m["voccalltype1"] / phone_no_m["opposite_count"] 
    tmp = df_call.groupby("phone_no_m")["city_name"].agg(city_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df_call.groupby("phone_no_m")["county_name"].agg(county_name_call="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    """和固定通话者的对话统计
    """
    tmp = df.groupby(["phone_no_m","opposite_no_m"])["call_dur"].agg(count="count", sum="sum")
    phone2opposite = tmp.groupby("phone_no_m")["count"].agg(phone2opposite_mean="mean", phone2opposite_median="median", phone2opposite_max="max")
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
    phone2opposite = tmp.groupby("phone_no_m")["sum"].agg(phone2oppo_sum_mean="mean", phone2oppo_sum_median="median", phone2oppo_sum_max="max")
    phone_no_m = phone_no_m.merge(phone2opposite, on="phone_no_m", how="left")
    
    """通话时间长短统计
    """
    tmp = df.groupby("phone_no_m")["call_dur"].agg(call_dur_mean="mean", call_dur_median="median", call_dur_max="max", call_dur_min="min")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    tmp = df.groupby("phone_no_m")["city_name"].agg(city_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["county_name"].agg(county_name_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    tmp = df.groupby("phone_no_m")["calltype_id"].agg(calltype_id_unique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    """通话时间点偏好
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(voc_hour_mode = lambda x:stats.mode(x)[0][0], 
                                               voc_hour_mode_count = lambda x:stats.mode(x)[1][0], 
                                               voc_hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    tmp = df.groupby("phone_no_m")["day"].agg(voc_day_mode = lambda x:stats.mode(x)[0][0], 
                                               voc_day_mode_count = lambda x:stats.mode(x)[1][0], 
                                               voc_day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    tmp = df.groupby("phone_no_m")["week"].agg(voc_week_mode = lambda x:stats.mode(x)[0][0], 
                                               voc_week_mode_count = lambda x:stats.mode(x)[1][0], 
                                               voc_week_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    return phone_no_m


In [5]:
# 短信记录统计
def get_sms_feats(df):
    '''
    @Function：
        在原有baseline基础上同样选取3月份数据，同样加入week特征
    '''
    print('SMS')
    df['request_datetime'] = pd.to_datetime(df['request_datetime'] )
    df = df[df['request_datetime'] >= '2020-03-01 00:00:00']

    
    df["hour"] = df['request_datetime'].dt.hour
    df["day"] = df['request_datetime'].dt.day
    df["week"] = df['request_datetime'].dt.weekday
    #df["month"] = df['request_datetime'].dt.month

    phone_no_m = df[["phone_no_m"]].copy()
    phone_no_m = phone_no_m.drop_duplicates(subset=['phone_no_m'], keep='last')
    # 对话人数和对话次数
    tmp = df.groupby("phone_no_m")["opposite_no_m"].agg(sms_count="count", sms_nunique="nunique")
    tmp["sms_rate"] = tmp["sms_count"]/tmp["sms_nunique"]
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    """短信下行比例
    """
    calltype2 = df[df["calltype_id"]==2].copy()
    calltype2 = calltype2.groupby("phone_no_m")["calltype_id"].agg(calltype_2="count")
    phone_no_m = phone_no_m.merge(calltype2, on="phone_no_m", how="left")
    phone_no_m["calltype_rate"] = phone_no_m["calltype_2"] / phone_no_m["sms_count"]
    """短信时间
    """
    tmp = df.groupby("phone_no_m")["hour"].agg(hour_mode = lambda x:stats.mode(x)[0][0], 
                                               hour_mode_count = lambda x:stats.mode(x)[1][0], 
                                               hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    tmp = df.groupby("phone_no_m")["day"].agg(day_mode = lambda x:stats.mode(x)[0][0], 
                                               day_mode_count = lambda x:stats.mode(x)[1][0], 
                                               day_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    tmp = df.groupby("phone_no_m")["week"].agg(hour_mode = lambda x:stats.mode(x)[0][0], 
                                               hour_mode_count = lambda x:stats.mode(x)[1][0], 
                                               hour_nunique="nunique")
    phone_no_m = phone_no_m.merge(tmp, on="phone_no_m", how="left")
    
    return phone_no_m

In [20]:
# 模型训练
def train_model():
    print('Model')
    test_app_feat=pd.read_csv('user_data/test_app_feat.csv')
    test_voc_feat=pd.read_csv('user_data/test_voc_feat.csv')
    test_sms_feat=pd.read_csv("user_data/test_sms_feat.csv")
    test_user    = pd.read_csv('user_data/test_user_feat.csv')
    test_user = test_user.merge(test_app_feat, on="phone_no_m", how="left")
    test_user = test_user.merge(test_voc_feat, on="phone_no_m", how="left")
    test_user = test_user.merge(test_sms_feat, on="phone_no_m", how="left")
    
    train_app_feat = pd.read_csv("user_data/train_app_feat.csv")
    train_voc_feat = pd.read_csv("user_data/train_voc_feat.csv")
    train_sms_feat = pd.read_csv("user_data/train_sms_feat.csv")
    train_user     = pd.read_csv('user_data/train_user_feat.csv')
    
    drop_r = ["arpu_201908","arpu_201909","arpu_201910","arpu_201911","arpu_201912","arpu_202001","arpu_202002"]
    
    # train_user['arpu_202004'] = train_user[drop_r].min(axis=1)
    
    train_user.drop(drop_r, axis=1,inplace=True)
    train_user.rename(columns={"arpu_202003":"arpu_202004"},inplace=True)
    
    
    train_user = train_user.merge(train_app_feat, on="phone_no_m", how="left")
    train_user = train_user.merge(train_voc_feat, on="phone_no_m", how="left")
    train_user = train_user.merge(train_sms_feat, on="phone_no_m", how="left")
    sub = test_user[["phone_no_m"]].copy()
    
    train_label = train_user[["label"]].copy()
    
    test_user.drop(["phone_no_m"], axis=1,inplace=True)
    train_user.drop(["phone_no_m", "label"], axis=1,inplace=True)
    kf = StratifiedKFold(n_splits = 5, shuffle=True, random_state=1024)
    test_p = []
    threshold = 0
    scores = 0
    
    
#     train_user.fillna(0,inplace=True)
#     test_user.fillna(0,inplace=True)

    model = lgb.LGBMClassifier(n_estimators=1000)
    # model = XGBClassifier()
    # model = GradientBoostingClassifier(random_state=20200618,
    #                                     learning_rate=0.1,
    #                                     max_depth=7,
    #                                     n_estimators=1000,
                                       
    #                                     )
    for i, (train_index, vaild_index) in enumerate(kf.split(train_user, train_label["label"])):
        print('\nFold_{} Training ================================\n'.format(i+1))
        train_x = train_user.iloc[train_index]
        train_y = train_label.iloc[train_index]
        valid_x = train_user.iloc[vaild_index]
        valid_y = train_label.iloc[vaild_index]
        
        # LGB
        model = model.fit(train_x,
                          train_y, 
                          eval_names=['train', 'valid'],
                          eval_set=[(train_x, train_y), (valid_x, valid_y)], 
                          early_stopping_rounds=10000,
                          eval_metric='auc',
                          categorical_feature=[],
                          verbose=500
                          )
        
        # XGB
        # model = model.fit(train_x,train_y)
        
        
        # GBDT
        # model = model.fit(train_x, 
        #                   train_y,                          
        #                   )
        # model.score(valid_x, valid_y)
        
        vaild_preds = model.predict(valid_x, 
                                    num_iteration=model.best_iteration_
                                  )
        bp, bestf1= score_vail(vaild_preds, valid_y)
        print("score: ", bestf1 , bp)
        scores += bestf1
        threshold += bp
        test_pre = model.predict(test_user, 
                                num_iteration=model.best_iteration_
                               )
        test_pre = MinMaxScaler().fit_transform(test_pre.reshape(-1, 1))
        test_pre = test_pre.reshape(-1, )
        test_p.append(test_pre)
    threshold = threshold/len(test_p)
    print(threshold)
    print("五折平均分数: ", scores/len(test_p))
    sc = scores/len(test_p)
    test_p = np.array(test_p)
    test_p = test_p.mean(axis=0)
    sub["prob"] = test_p
    sub["label"] = sub["prob"] > round(np.percentile(sub["prob"], threshold), 4)
    sub[["phone_no_m", "label"]].to_csv('submissions/xgb_{}.csv'.format(sc),index=False,encoding='utf-8')

In [15]:
def score_vail(vaild_preds, real):
    """f1阈值搜索
    """
#     import matplotlib.pylab as plt
#     plt.figure(figsize=(16,5*10))
    best = 0
    bp = 0
    score = []
    for i in range(600):
        p = 32+i*0.08
        threshold_test = round(np.percentile(vaild_preds, p), 4)
        pred_int = vaild_preds>threshold_test
        ff = f1_score(pred_int,real)
        score.append(ff)
        
        if ff>=best:
            best = ff
            bp = p
#     plt.plot(range(len(score)), score)
#     plt.show()
    return bp, best

In [16]:
def feats():
    
    test_voc=pd.read_csv('test/test_voc.csv',)
    test_voc_feat = get_voc_feat(test_voc)
    test_voc_feat.to_csv("user_data/test_voc_feat.csv", index=False)

    test_app=pd.read_csv('test/test_app.csv',)
    test_app['month_id'] = '2020-03'
    test_app_feat = get_app_feats(test_app)
    test_app_feat.to_csv("user_data/test_app_feat.csv", index=False)
    
    test_sms=pd.read_csv('test/test_sms.csv',)
    test_sms_feat = get_sms_feats(test_sms)
    test_sms_feat.to_csv("user_data/test_sms_feat.csv", index=False)
     
    train_voc=pd.read_csv('train/train_voc.csv',)
    train_voc_feat = get_voc_feat(train_voc)
    train_voc_feat.to_csv("user_data/train_voc_feat.csv", index=False)

    train_app=pd.read_csv('train/train_app.csv',)
    train_app_feat = get_app_feats(train_app)
    train_app_feat.to_csv("user_data/train_app_feat.csv", index=False)

    train_sms=pd.read_csv('train/train_sms.csv',)
    train_sms_feat = get_sms_feats(train_sms)
    train_sms_feat.to_csv("user_data/train_sms_feat.csv", index=False)
    
    train_user = pd.read_csv('train/train_user.csv')
    test_user = pd.read_csv('test/test_user.csv')
    train_user_feat, test_user_feat = get_user_feats(train_user, test_user)
    train_user_feat.to_csv("user_data/train_user_feat.csv", index=False)
    test_user_feat.to_csv("user_data/test_user_feat.csv", index=False)

In [12]:

feats()

VOC
APP
SMS


  if (await self.run_code(code, result,  async_=asy)):


VOC


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  del sys.path[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app


APP
SMS


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


user


In [21]:

train_model()

Model


Training until validation scores don't improve for 10000 rounds


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[500]	train's auc: 1	train's binary_logloss: 0.000298692	valid's auc: 0.970862	valid's binary_logloss: 0.348924
[1000]	train's auc: 1	train's binary_logloss: 0.000285364	valid's auc: 0.972047	valid's binary_logloss: 0.407378
Did not meet early stopping. Best iteration is:
[141]	train's auc: 1	train's binary_logloss: 0.0105144	valid's auc: 0.969475	valid's binary_logloss: 0.184055
score:  0.9054054054054055 71.6


Training until validation scores don't improve for 10000 rounds


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


[500]	train's auc: 1	train's binary_logloss: 0.000292733	valid's auc: 0.955313	valid's binary_logloss: 0.485305
[1000]	train's auc: 1	train's binary_logloss: 0.000285203	valid's auc: 0.955251	valid's binary_logloss: 0.562786
Did not meet early stopping. Best iteration is:
[127]	train's auc: 1	train's binary_logloss: 0.0118361	valid's auc: 0.95538	valid's binary_logloss: 0.231246
score:  0.8876712328767123 72.32


Training until validation scores don't improve for 10000 rounds

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))



[500]	train's auc: 1	train's binary_logloss: 0.000295559	valid's auc: 0.95923	valid's binary_logloss: 0.459909
[1000]	train's auc: 1	train's binary_logloss: 0.000285296	valid's auc: 0.959408	valid's binary_logloss: 0.53849
Did not meet early stopping. Best iteration is:
[136]	train's auc: 1	train's binary_logloss: 0.0107576	valid's auc: 0.961947	valid's binary_logloss: 0.218963
score:  0.890125173852573 73.2


Training until validation scores don't improve for 10000 rounds

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))



[500]	train's auc: 1	train's binary_logloss: 7.14129e-06	valid's auc: 0.956282	valid's binary_logloss: 0.495367
[1000]	train's auc: 1	train's binary_logloss: 1.41684e-06	valid's auc: 0.956214	valid's binary_logloss: 0.567218
Did not meet early stopping. Best iteration is:
[93]	train's auc: 1	train's binary_logloss: 0.0229136	valid's auc: 0.954943	valid's binary_logloss: 0.213066
score:  0.8845618915159944 73.2




  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 10000 rounds
[500]	train's auc: 1	train's binary_logloss: 9.95188e-06	valid's auc: 0.963356	valid's binary_logloss: 0.422107
[1000]	train's auc: 1	train's binary_logloss: 1.52234e-06	valid's auc: 0.963885	valid's binary_logloss: 0.49094
Did not meet early stopping. Best iteration is:
[108]	train's auc: 1	train's binary_logloss: 0.018931	valid's auc: 0.966648	valid's binary_logloss: 0.183391
score:  0.9046979865771813 71.2
72.304
五折平均分数:  0.8944923380455734
