In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import GradientBoostingClassifier

In [3]:
# 全量数据

train_data = pd.read_csv('./data_format1/train_format1.csv')
test_data = pd.read_csv('./data_format1/test_format1.csv')
user_info = pd.read_csv('./data_format1/user_info_format1.csv')
user_log = pd.read_csv('./data_format1/user_log_format1.csv')


In [3]:
# 样本提取
'''
user_info.set_index('user_id', inplace=True)
user_log.set_index('user_id', inplace=True)
train_data_sample = train_data.sample(n=15000)
test_data_sample = test_data.sample(n=5000)
sample_user_id = train_data_sample['user_id'].tolist() + test_data_sample['user_id'].tolist()
user_info_sample = user_info.loc[sample_user_id]
user_log_sample = user_log.loc[sample_user_id]
train_data_sample.set_index('user_id', inplace=True)
test_data_sample.set_index('user_id', inplace=True)
'''

In [4]:
# 保存样本
'''
train_data_sample.to_csv('./sample/train_data_sample.csv')
test_data_sample.to_csv('./sample/test_data_sample.csv')
user_info_sample.to_csv('./sample/user_info_sample.csv')
user_log_sample.to_csv('./sample/user_log_sample.csv')
'''

In [2]:
# 加载样本
'''
train_data = pd.read_csv('./sample/train_data_sample.csv')
test_data = pd.read_csv('./sample/test_data_sample.csv')
user_info = pd.read_csv('./sample/user_info_sample.csv')
user_log = pd.read_csv('./sample/user_log_sample.csv')
'''

In [4]:
user_log.rename(columns={'seller_id': 'merchant_id'}, inplace=True)

In [5]:
# 数据清洗
user_log['brand_id'].fillna(0, inplace=True)
user_log['brand_id'] = user_log['brand_id'].astype('int64')
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
user_info['age_range'].fillna(0, inplace=True)
user_info = user_info.replace([8], [7])
user_info['age_range'] = user_info['age_range'].astype('int64')
user_info['gender'].fillna(2, inplace=True)
user_info['gender'] = user_info['gender'].astype('int64')

In [6]:
print(train_data.info())
print(user_info.info())
print(user_log.info(null_counts=True))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 260864 entries, 0 to 260863
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype
---  ------       --------------   -----
 0   user_id      260864 non-null  int64
 1   merchant_id  260864 non-null  int64
 2   label        260864 non-null  int64
dtypes: int64(3)
memory usage: 6.0 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 424170 entries, 0 to 424169
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype
---  ------     --------------   -----
 0   user_id    424170 non-null  int64
 1   age_range  424170 non-null  int64
 2   gender     424170 non-null  int64
dtypes: int64(3)
memory usage: 9.7 MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54925330 entries, 0 to 54925329
Data columns (total 7 columns):
 #   Column       Non-Null Count     Dtype         
---  ------       --------------     -----         
 0   user_id      54925330 non-null  int64         
 1   item_id      549253

In [7]:
train_data['origin'] = 'train'
test_data['origin'] = 'test'
matrix = pd.concat([train_data, test_data], ignore_index=True, sort=False)
matrix.drop('prob', axis=1, inplace=True)
matrix = matrix.merge(user_info, on='user_id', how='left')

In [8]:
# 用户特征处理
groups = user_log.groupby(['user_id'])
temp = groups.size().reset_index().rename(columns={0: 'u1'})
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['item_id'].agg([('u2', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['cat_id'].agg([('u3', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['merchant_id'].agg([('u4', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['brand_id'].agg([('u5', 'nunique')]).reset_index()
matrix = matrix.merge(temp, on='user_id', how='left')
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
temp['u6'] = (temp['L_time'] - temp['F_time']).dt.seconds/3600
matrix = matrix.merge(temp[['user_id', 'u6']], on='user_id', how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0: 'u7', 1: 'u8', 2: 'u9', 3: 'u10'})
matrix = matrix.merge(temp, on='user_id', how='left')

# 商户特征处理
groups = user_log.groupby(['merchant_id'])
temp = groups.size().reset_index().rename(columns={0: 'm1'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
temp = groups['user_id', 'item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'user_id': 'm2', 'item_id': 'm3', 'cat_id': 'm4', 'brand_id': 'm5'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0: 'm6', 1: 'm7', 2: 'm8', 3: 'm9'})
matrix = matrix.merge(temp, on='merchant_id', how='left')
temp = train_data[train_data['label'] == -1].groupby('merchant_id').size().reset_index().rename(columns={0: 'm10'})
matrix = matrix.merge(temp, on='merchant_id', how='left')



In [9]:
groups = user_log.groupby(['user_id', 'merchant_id'])
temp = groups.size().reset_index().rename(columns={0: 'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['item_id', 'cat_id', 'brand_id'].nunique().reset_index().rename(columns={'item_id': 'um2', 'cat_id': 'um3', 'brand_id': 'um4'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0: 'um5', 1: 'um6', 2: 'um7', 3: 'um8'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
temp['um9'] = (temp['last'] - temp['first']).dt.seconds/3600
temp.drop(['last', 'first'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

  after removing the cwd from sys.path.


In [10]:
matrix['r1'] = matrix['u9'] / matrix['u7']
matrix['r2'] = matrix['m8'] / matrix['m6']
matrix['r3'] = matrix['um7'] / matrix['um5']
matrix.fillna(0, inplace=True)
temp = pd.get_dummies(matrix['age_range'], prefix='age')
matrix = pd.concat([matrix, temp], axis=1)
temp = pd.get_dummies(matrix['gender'], prefix='gen')
matrix = pd.concat([matrix, temp], axis=1)
matrix.drop(['age_range', 'gender'], axis=1, inplace=True)

In [11]:
train_data = matrix[matrix['origin'] == 'train'].drop('origin', axis=1)
test = matrix[matrix['origin'] == 'test'].drop(['origin', 'label'], axis=1)
x_train, y_train = train_data.drop('label', axis=1), train_data['label']

In [24]:
train_x, vali_x, train_y, vali_y = train_test_split(x_train, y_train, test_size=0.2)
train_x_1, train_x_2, train_y_1, train_y_2 = train_test_split(train_x, train_y, test_size=0.5)

model = GradientBoostingClassifier(n_estimators=1000)
model.fit(train_x_1, train_y_1)

gbdt_enc = OneHotEncoder(categories='auto')
gbdt_enc.fit(model.apply(train_x_1)[:,:,0])
gbdt_lr = LogisticRegression(solver='lbfgs', max_iter=100)
gbdt_lr.fit(gbdt_enc.transform(model.apply(train_x_2)[:,:,0]), train_y_2)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [25]:
prob = gbdt_lr.predict_proba(gbdt_enc.transform(model.apply(test)[:,:,0]))[:,1]
test_data['prob'] = pd.Series(prob)
test_data.drop(['origin'], axis=1, inplace=True)

KeyError: "['origin'] not found in axis"

In [27]:
test_data.to_csv('prediction_gbdt_lr.csv', index=False)

In [26]:
test_data

Unnamed: 0,user_id,merchant_id,prob
0,163968,4605,0.050149
1,360576,1581,0.057705
2,98688,1964,0.066401
3,98688,3645,0.034876
4,295296,3361,0.029333
...,...,...,...
261472,228479,3111,0.066125
261473,97919,2341,0.026716
261474,97919,3971,0.104288
261475,32639,3536,0.041227
