# 特征处理与特征构造

In [1]:
import pandas as pd

# Raise pandas' display limits (column width, column count, row count) to 200
# so wide or long frames are shown without truncation.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, 200)

from columns import base_columns

# Per-column overview of base_info. For every column the table holds:
#   '列名'     column name
#   '缺失量'   share of missing values
#   '类别数'   number of distinct non-null values
#   '字段类型' pandas dtype
#   '含义'     description looked up in base_columns (None when absent)
# The share of missing values is computed against the actual number of rows
# rather than a hard-coded 24865, so it stays correct if the data changes.
n_rows = len(base_info)
li = []
for col in base_info.columns:
    n_missing = base_info[col].isnull().sum()
    ratio = n_missing / n_rows if n_rows else 0.0
    num = len(base_info[col].value_counts())
    dict_ = {'列名':col,'缺失量':ratio,'类别数':num,'字段类型':base_info[col].dtypes}
    dict_['含义'] = base_columns.get(dict_['列名'])
    li.append(dict_)
pd.DataFrame(li)

## 非类别特征衍生

In [3]:
def _end_year(opto_value):
    """Return the first four characters of opto_value as an int; missing values pass through."""
    if pd.isna(opto_value):
        return opto_value
    return int(opto_value[:4])


# Split the '-'-separated start date into numeric year and month.
opfrom_parts = base_info['opfrom'].apply(lambda text: text.split('-'))
base_info['year'] = opfrom_parts.apply(lambda parts: int(parts[0]))
base_info['month'] = opfrom_parts.apply(lambda parts: int(parts[1]))

# Operating range = end year - start year; rows without an end date get 999.
years_open = base_info['opto'].apply(_end_year) - base_info['year'].apply(int)
base_info['oprange'] = years_open.apply(lambda span: 999 if pd.isna(span) else span)

# The raw date columns are no longer needed once the features are derived.
base_info.drop(columns=['opto', 'opfrom'], inplace=True)

# Keep only whether reccap was filled in at all.
base_info['reccap'] = base_info['reccap'].notna()

from columns import base_columns

# Same per-column overview as for the whole frame, restricted to the
# columns listed in category3_columns. Keys of each record:
#   '列名' column name, '缺失量' share of missing values,
#   '类别数' number of distinct non-null values, '字段类型' dtype,
#   '含义' description from base_columns (None when absent).
# The share of missing values uses the actual row count instead of a
# hard-coded 24865.
n_rows = len(base_info)
li = []
for col in category3_columns:
    n_missing = base_info[col].isnull().sum()
    ratio = n_missing / n_rows if n_rows else 0.0
    num = len(base_info[col].value_counts())
    dict_ = {'列名':col,'缺失量':ratio,'类别数':num,'字段类型':base_info[col].dtypes}
    dict_['含义'] = base_columns.get(dict_['列名'])
    li.append(dict_)
pd.DataFrame(li)

## 类别特征衍生

In [4]:
# Categorical columns, grouped by how many distinct categories they have.
category1_columns = ['state', 'regtype', 'compform', 'venind']  # fewer than 10 categories
category2_columns = ['oplocdistrict', 'industryphy', 'enttype']  # 10-30 categories
category3_columns = [  # more than 30 categories
    'industryco',
    'enttypeitem',
    'opform',
    'enttypeminu',
    'enttypegb',
]
category4_columns = ['dom', 'opscope', 'orgid', 'jobid', 'oploc']  # values with a regular structure

# All categorical columns, in group order.
category_columns = [
    *category1_columns,
    *category2_columns,
    *category3_columns,
    *category4_columns,
]

def _to_str_keep_missing(value):
    """Return str(value), leaving missing values untouched."""
    return value if pd.isna(value) else str(value)


# Store every categorical value as text while keeping missing entries missing.
for col in category_columns:
    base_info[col] = base_info[col].map(_to_str_keep_missing)

## base数据特征工程

### 规律变量特征衍生

In [5]:
# Derive coarser levels from the structured code columns by keeping only the
# leading characters: "<column>_<k>" holds the k-th prefix length listed below.
# The trailing comments repeat the numbers noted in the original cell
# (presumably the distinct-value count of each derived column; unverified).
# The last field says whether the source column is dropped afterwards.
_prefix_plan = (
    ('dom', (1, 16, 32, 48), True),    # 16, 53, 412, 1792
    ('orgid', (4, 7, 12), False),      # 3, 12, 41
    ('jobid', (2, 10, 14), False),     # 3, 7, 46
    ('oploc', (1, 2, 16, 32), True),   # 16, 165, 231, 752
)

for source_col, prefix_lengths, drop_source in _prefix_plan:
    for level, length in enumerate(prefix_lengths, start=1):
        base_info[source_col + '_' + str(level)] = base_info[source_col].str[:length]
    if drop_source:
        base_info.drop([source_col], axis=1, inplace=True)

# Structured columns that remain, plus every derived prefix column.
category5_columns = [
    'dom_1', 'dom_2', 'dom_3', 'dom_4',
    'orgid', 'orgid_1', 'orgid_2', 'orgid_3',
    'jobid', 'jobid_1', 'jobid_2', 'jobid_3',
    'oploc_1', 'oploc_2', 'oploc_3', 'oploc_4',
]

In [None]:
# Load the stop-word list, one word per line.
# The file is read as text rather than bytes: the segmented corpus consists of
# str tokens, and a bytes stop word never compares equal to a str token, so a
# bytes list would silently filter nothing. The context manager also
# guarantees the file is closed if reading fails.
# NOTE(review): assumes stop_words.txt is UTF-8 encoded; confirm.
stpwrdpath = "stop_words.txt"
with open(stpwrdpath, encoding='utf-8') as stpwrd_dic:
    stpwrd_content = stpwrd_dic.read()
stpwrdlst = stpwrd_content.splitlines()

In [None]:
import jieba

# Segment each opscope text with jieba and join the tokens with single
# spaces, giving one whitespace-separated document per row.
corpus = [' '.join(jieba.cut(scope_text)) for scope_text in base_info['opscope']]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Term-count matrix over the segmented corpus, with the stop words passed in.
cntVector = CountVectorizer(stop_words=stpwrdlst)
cntTf = cntVector.fit_transform(corpus)

# Fit a 20-component LDA model on the counts; docres is the transformed
# matrix returned by fit_transform.
lda = LatentDirichletAllocation(
    n_components=20,
    learning_offset=50.0,
    random_state=0,
)
docres = lda.fit_transform(cntTf)


In [None]:
# Wrap the LDA output in a frame with one column per topic. The number of
# columns is taken from docres itself instead of a hard-coded 20.
# The 'lad_' prefix (sic) is kept so existing column names do not change.
lda_feature = pd.DataFrame(
    docres,
    columns=['lad_' + str(i) for i in range(1, docres.shape[1] + 1)],
)
# docres rows follow base_info's row order, so ids are attached by position.
# Assigning the Series directly would align on index labels and misplace ids
# whenever base_info does not carry a default 0..n-1 index.
lda_feature['id'] = base_info['id'].values

In [None]:
# Attach the LDA topic columns to base_info by id (left join keeps every
# base_info row), then build the modelling frame without the raw opscope text.
base_info = pd.merge(base_info, lda_feature, how='left', on=['id'])
base_feature = base_info.drop(columns=['opscope'])


## 训练模型

In [6]:
# Low-cardinality features: store them with pandas' 'category' dtype.
for col in [*category1_columns, *category2_columns]:
    base_info[col] = base_info[col].astype('category')

# High-cardinality features
def categorys_count_rank(data,col):
    """Encode a column by the dense rank of each value's frequency.

    Every distinct value is counted, the counts are ranked with
    ``method='dense'`` (the rarest values get rank 1.0, values with equal
    counts share a rank), and each row is replaced by the rank of its value.

    Args:
        data: DataFrame containing ``col``.
        col: Name of the column to encode.

    Returns:
        A float Series indexed like ``data[col]``. Missing values stay NaN
        because ``value_counts`` does not count them.
    """
    frequency_rank = data[col].value_counts().rank(method='dense').to_dict()
    # A dictionary lookup via map is a single pass over the column, whereas
    # replace() with a large mapping is far slower. The explicit cast keeps
    # the result numeric whatever the input dtype (object, category, ...).
    return data[col].map(frequency_rank).astype(float)
# Add a "<column>_count_rank" feature for every 10-30 category, 30+ category
# and prefix-derived column.
for col in [*category2_columns, *category3_columns, *category5_columns]:
    base_info[col + '_count_rank'] = categorys_count_rank(base_info, col)


def categorys_label_feature(data,col):
    """Compute two label-based statistics for every value of a categorical column.

    Feature 1: among the rows with ``label == 1``, the share that carry this
    value (the denominator is the number of all ``label == 1`` rows).
    Feature 2: among the rows carrying this value, the share with
    ``label == 1`` (the denominator counts every row with the value,
    including rows whose label is missing).

    Both statistics are built from two ``value_counts`` passes instead of
    re-filtering the whole frame for every category. A zero denominator
    (no positive rows at all, or a categorical level without any rows)
    yields 0.0 rather than raising ZeroDivisionError.

    Args:
        data: DataFrame with the column ``col`` and a ``label`` column.
        col: Name of the categorical column.

    Returns:
        A tuple ``(category_dict1, category_dict2)`` mapping each value of
        ``col`` to feature 1 and feature 2 respectively.
    """
    is_positive = data['label'] == 1
    positives_total = int(is_positive.sum())
    rows_per_value = data[col].value_counts()
    positives_per_value = data.loc[is_positive, col].value_counts().to_dict()

    category_dict1, category_dict2 = {}, {}
    for value, n_rows in rows_per_value.items():
        # Plain Python ints keep the divisions identical to int / int.
        n_rows = int(n_rows)
        n_positive = int(positives_per_value.get(value, 0))
        category_dict1[value] = n_positive / positives_total if positives_total else 0.0
        category_dict2[value] = n_positive / n_rows if n_rows else 0.0
    return category_dict1,category_dict2

# Map every value of the encoded columns to its two label statistics.
# Values without an entry in the dictionaries are passed through unchanged.
encoded_columns = category2_columns + category3_columns + category5_columns
for col in encoded_columns:
    category_dict1, category_dict2 = categorys_label_feature(base_info, col)
    source_values = base_info[col]
    base_info[col + '_label_feature1'] = source_values.apply(lambda x: category_dict1.get(x, x))
    base_info[col + '_label_feature2'] = source_values.apply(lambda x: category_dict2.get(x, x))

# The raw 30+ category and prefix columns are dropped; only their encodings stay.
base_info.drop(columns=category3_columns + category5_columns, inplace=True)

In [None]:
import lightgbm as lgb

# 'id' and 'label' are removed from the model inputs; 'label' is the target.
non_feature_columns = ['id', 'label']
X_train = train_data.drop(columns=non_feature_columns)
y_train = train_data['label']
X_test = test_data.drop(columns=non_feature_columns)
lgb_train = lgb.Dataset(X_train, label=y_train)

In [None]:
# Booster settings for the cross-validation run.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.01,
    num_leaves=50,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)

# 5-fold shuffled CV, up to 2000 rounds with early stopping after 400 rounds,
# reporting log-loss and AUC every 50 rounds.
cv_results = lgb.cv(
    params,
    lgb_train,
    num_boost_round=2000,
    nfold=5,
    shuffle=True,
    metrics='binary_logloss,auc',
    early_stopping_rounds=400,
    verbose_eval=50,
    categorical_feature=category1_columns + category2_columns,
    show_stdv=True,
    seed=0,
)

In [None]:
# The recorded AUC curve has one entry per boosting round kept by the CV run;
# its length is reported as the round count and its last entry as the score.
auc_curve = cv_results['auc-mean']
print('best n_estimators:', len(auc_curve))
print('best cv score:', auc_curve[-1])

In [None]:
from sklearn.model_selection import GridSearchCV

# Final classifier, fitted on the full training set.
# `n_jobs` replaces the misspelled `njobs`: the scikit-learn wrapper only
# recognises `n_jobs`, so the thread setting was not applied before.
# NOTE(review): LightGBM documents that bagging_fraction only takes effect
# together with a non-zero bagging_freq; confirm whether bagging is intended.
estimator = lgb.LGBMClassifier(
    boosting_type='gbdt', 
    objective='binary', 
    metrics='binary_logloss,auc',
    learning_rate=0.01, 
    bagging_fraction=0.8, 
    feature_fraction=0.8,
    n_estimators=706,
    n_jobs=2,
    max_depth=6,
    num_leaves=25
)
estimator.fit(X_train,y_train,categorical_feature=category1_columns + category2_columns)

In [None]:
# predict_proba returns one column per class; column 1 is written out as the
# score next to each test id.
y_pred = estimator.predict_proba(X_test)
result = pd.DataFrame({'id': test_data['id'], 'score': y_pred[:, 1]})
result.to_csv('result.csv', index=False)

In [2]:
# Load the company base table and the label table, then attach the labels by
# id with a left join so rows without a label are kept.
base_info = pd.read_csv('./train/base_info.csv')
label = pd.read_csv('./train/entprise_info.csv')
base_info = pd.merge(base_info, label, how='left', on=['id'])

# Too many missing values in these columns; they are discarded outright.
drop_columns = [
    'parnum', 'exenum', 'ptbusscope', 'midpreindcode',
    'protype', 'forreccap', 'forregcap', 'congro',
]
base_info = base_info.drop(columns=drop_columns)

In [3]:
# Mapping from base_info column name to its description.
# This must be a dict: the original wrapped the key:value pairs in list
# brackets, which is a syntax error, and the column-summary cells look
# entries up with base_columns.get(...).
# Trailing notes are translated from the original comments; the bare numbers
# are presumably missing-value ratios (unverified).
base_columns = {'id':'企业唯一标识', 'oplocdistrict':'行政区划代码', 'industryphy':'行业类别代码', 'industryco':'行业细类代码', 
                'dom':'经营地址',
                'opscope':'经营范围', # processed with LDA
                'enttype':'企业类型', 'enttypeitem':'企业类型小类', # 0.33
                'opfrom':'经营期限起', 'opto':'经营期限止', # 0.65; fill missing values with "9999-12-31"
                'state':'状态', 'orgid':'机构标识', 'jobid':'职位标识', 'adbusign':'是否广告经营', 'townsign':'是否城镇', 'regtype':'主题登记类型',
                'empnum':'从业人数', # 0.21
                'compform':'组织形式', # 0.57; treat missing values as a third category
                'parnum':'合伙人数', # drop; 0.91; convert to a bool variable
                'exenum':'执行人数', # drop
                'opform':'经营方式', # 0.64; drop
                'ptbusscope':'兼营范围', # 1; drop
                'venind':'风险行业', # 0.66
                'enttypeminu':'企业类型细类', # 0.71; drop
                'midpreindcode':'中西部优势产业代码', # 1; drop
                'protype':'项目类型', # drop
                'oploc':'经营场所', # drop
                'regcap':'注册资本（金）', # 0.01
                'reccap':'实缴资本', # 0.72; convert to "is it empty"
                'forreccap':'实缴资本（外方）', # drop
                'forregcap':'注册资本（外方）', # drop
                'congro':'投资总额', # drop
                'enttypegb':'企业（机构）类型'
                }

(24865, 34)

In [None]:
# Very many categories (> 100)
category1 = ['industryco',]
# Fairly many categories (10-30)
# 'enttypeitem' was listed twice; the duplicate is removed so the column is
# not processed twice by anything iterating this list.
# NOTE(review): the second entry may have been meant as another column
# (e.g. 'enttypeminu'); confirm.
category2 = ['enttypeitem','enttypegb']
# Categories with a regular structure
category3 = ['dom','opscope','orgid','jobid','oploc']
# Few categories (< 10)
category4 = ['oplocdistrict','industryphy','enttype','state','adbusign','townsign','regtype','compform','venind']

# Keep the first 10 characters of the start date and the first 4 characters
# of the end date; rows without an end date get '9999'.
base_info['opfrom'] = base_info['opfrom'].str.slice(0, 10)
base_info['opto'] = base_info['opto'].str.slice(0, 4).fillna('9999')


In [22]:
# Show how many rows carry each enttype code.
base_info['enttype'].value_counts()

1100    14085
9600     8193
4500     2180
2100      141
1200       81
2200       40
3100       37
3200       29
9100       23
5100       20
6100       19
4400        8
5800        3
3500        2
6800        2
3400        1
5400        1
Name: enttype, dtype: int64

In [8]:
# Rows with a label form the training set; rows without one form the test set.
has_label = base_feature['label'].notnull()
train_data = base_feature[has_label]
test_data = base_feature[~has_label]

## 参数1:n_estimators

In [10]:
# Booster settings used while looking for the number of boosting rounds.
params = dict(
    boosting_type='gbdt',
    objective='binary',
    learning_rate=0.1,
    num_leaves=50,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
)

# 5-fold shuffled CV, up to 1000 rounds with early stopping after 100 rounds,
# reporting log-loss and AUC every 50 rounds.
# NOTE(review): short_categorys is not defined anywhere in the visible cells;
# the other cells pass category1_columns + category2_columns here. Confirm.
cv_results = lgb.cv(
    params,
    lgb_train,
    num_boost_round=1000,
    nfold=5,
    shuffle=True,
    metrics='binary_logloss,auc',
    early_stopping_rounds=100,
    verbose_eval=50,
    categorical_feature=short_categorys,
    show_stdv=True,
    seed=0,
)

## 参数2:max_depth,num_leaves

In [None]:
# Base classifier for the max_depth / num_leaves grid search.
# `n_jobs` replaces the misspelled `njobs`: the scikit-learn wrapper only
# recognises `n_jobs`, so the thread setting was not applied before.
estimator = lgb.LGBMClassifier(
    boosting_type='gbdt', 
    objective='binary', 
    metrics='binary_logloss,auc',
    learning_rate=0.1, 
    bagging_fraction=0.8, 
    feature_fraction=0.8,
    n_estimators=98,
    n_jobs=2
)
# Candidate tree shapes: every combination of the values below is tried.
param_grid = {
    'max_depth': [2,3,4],
    'num_leaves':[10,15,20]
}
# 5-fold grid search. No `scoring` is given, so GridSearchCV falls back to the
# estimator's own score method.
# NOTE(review): the lgb.cv cells track AUC; confirm the default scorer is
# what should be optimised here.
gbm = GridSearchCV(estimator,param_grid,cv=5)
gbm.fit(X_train,y_train)
gbm.best_params_,gbm.best_score_

In [7]:
import pandas as pd
# Load result.csv (the file written by the prediction cell) back into a frame.
x = pd.read_csv('result.csv')

In [8]:
# Count how often each id occurs; a count above 1 means the id is duplicated.
x['id'].value_counts()

e9f7b28ec10e04707ba878b89e6c2d362b107a817342f9c6    2
f000950527a6feb6021414e7554476d0cf027e4d1dc3e864    1
f000950527a6feb6cb47be6daff20d71af700226c9047ea7    1
e9f7b28ec10e047036ecf3701c5e3d58d703c0b8192f1e68    1
d8071a739aa75a3bf05a79bf7f26ecf8ad9b52d060b8faf2    1
e9f7b28ec10e0470114e0094ac7afee09f69ba51f808d722    1
f1c1045b13d18329d0913745443552947bcc4bbc3a2d1e2a    1
f000950527a6feb638468abc79eda66fd48fc018b8b43229    1
e9f7b28ec10e0470df8f4662efb233711b33c9c0809662ab    1
f1c1045b13d18329d447d7013c1b322c46ac671936e88474    1
ed38190adf12fceb52155e51e3e6576fda26e707ac90c29e    1
ed38190adf12fceb56e6c30fb2c24feb8e5a75cb72530fd6    1
da8691b210adb3f6aef7f82ff03bded18841fd7b1b405451    1
59b38c56de3836835cdfe9f4e2d51d7450911703ecdbfcf7    1
d8071a739aa75a3b4e60b4c2c41fb122d4a26f8680e3038e    1
755db3b5c5f74eb409854d84d1a303b795fc3a2812568998    1
beb4aaaa89e0a0aeca97c467a612082a7c5722d9250d6557    1
4a8fc4f3fb4d8a0fda7f2062186b69d2e35d4950f70dd02d    1
d8071a739aa75a3bf0bd2aca4b60