### AD+LR版本

In [3]:
# -*- coding: utf-8 -*-
import time
import warnings 
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')

In [4]:
# load data: read the three pre-processed CSV tables
train = pd.read_csv('../data/pre/train.csv')
test = pd.read_csv('../data/pre/test.csv')
ad = pd.read_csv('../data/pre/ad.csv')

# process data: attach the ad table's columns to every row, keyed on creativeID
# (pandas' default join type, i.e. an inner join)
train = train.merge(ad, on='creativeID')
test = test.merge(ad, on='creativeID')

# training target, taken after the merge so it lines up with the merged rows
y_train = train['label'].values

In [5]:
# Preview the first five rows of the merged training frame.
train.head(n=5)

Unnamed: 0,label,clickTime,conversionTime,creativeID,userID,positionID,connectionType,telecomsOperator,adID,camgaignID,advertiserID,appID,appPlatform
0,0,170000,,3089,2798058,293,1,1,1321,83,10,434,1
1,0,170001,,3089,195578,3659,0,2,1321,83,10,434,1
2,0,170014,,3089,1462213,3659,0,3,1321,83,10,434,1
3,0,170030,,3089,1985880,5581,1,1,1321,83,10,434,1
4,0,170047,,3089,2152167,5581,1,1,1321,83,10,434,1


In [6]:
# feature engineering/encoding
# Ad-side ID columns, each one-hot encoded independently:
# creative ID, ad ID, campaign ID, advertiser (account) ID, app ID, app platform.
# Hierarchy of the ID columns: advertiser -> campaign -> ad -> creative.
features = ['creativeID', 'adID', 'camgaignID', 'advertiserID', 'appID', 'appPlatform']

# handle_unknown='ignore': a value that appears in `test` but was never seen in
# `train` is encoded as an all-zero row for that feature instead of making
# `transform` raise ValueError (the default handle_unknown='error' behaviour).
encoder = OneHotEncoder(handle_unknown='ignore')

# Collect one sparse block per feature; the encoder is re-fitted on the
# training column each time and then applied to the matching test column.
train_blocks, test_blocks = [], []
for feat in features:
    train_blocks.append(encoder.fit_transform(train[feat].values.reshape(-1, 1)))
    test_blocks.append(encoder.transform(test[feat].values.reshape(-1, 1)))

# Stack all blocks side by side in a single call (rather than re-stacking the
# growing matrix on every iteration) and request CSR output for row-wise access.
X_train = sparse.hstack(train_blocks, format='csr')
X_test = sparse.hstack(test_blocks, format='csr')

In [None]:
# model training: fit a default logistic regression on the one-hot features
# and time the fit + predict step
st_time = time.time()

lr = LogisticRegression()
lr.fit(X_train, y_train)

# keep the second probability column returned by predict_proba
proba_all = lr.predict_proba(X_test)
proba_test = proba_all[:, 1]

elapsed = time.time() - st_time
print('训练共耗时: {}'.format(elapsed))

In [None]:
# submission: pair each test instanceID with its predicted probability,
# order the rows by instanceID, and write the CSV without the index column
df = pd.DataFrame({'instanceID': test['instanceID'].values, 'proba': proba_test})
df = df.sort_values('instanceID')
df.to_csv('../data/submission.csv', index=False)