In [2]:
import pandas as pd
import numpy as np
import pickle

In [4]:
raw = pd.read_csv('train.csv')
train_raw = raw[raw['order_pay_time'] <= '2013-07-31 23:59:59']
raw.sort_values(by='order_pay_time', ascending=True, inplace=True)

label_raw = set(raw[raw['order_pay_time'] > '2013-07-31 23:59:59']['customer_id'].dropna())

# print the keys
print(raw.keys())

# print the first 3 rows
print(raw.head(3))

# print max 'order_pay_time'
print(raw['order_pay_time'].max())

# print how many data
print(len(raw))

Index(['order_detail_id', 'order_id', 'order_total_num', 'order_amount',
       'order_total_payment', 'order_total_discount', 'order_pay_time',
       'order_status', 'order_count', 'is_customer_rate',
       'order_detail_status', 'order_detail_goods_num', 'order_detail_amount',
       'order_detail_payment', 'order_detail_discount', 'customer_province',
       'customer_city', 'member_id', 'customer_id', 'customer_gender',
       'member_status', 'is_member_actived', 'goods_id', 'goods_class_id',
       'goods_price', 'goods_status', 'goods_has_discount', 'goods_list_time',
       'goods_delist_time'],
      dtype='object')
         order_detail_id  order_id  order_total_num  order_amount  \
2021321          3349677   2975399              1.0         199.0   
898720           2042828   1890070              1.0         239.9   
1064326          2236278   2054851              1.0         199.0   

         order_total_payment  order_total_discount       order_pay_time  \
2021321      

In [5]:
def preprocess_data(raw):
    # gender fill 0 if null
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))

    # last user-good activity
    data[['goods_id_last','goods_status_last','goods_price_last','goods_has_discount_last','goods_list_time_last',
          'goods_delist_time_last']]= \
          raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount', 'goods_list_time', 'goods_delist_time']].last()
    
    # last user-order activity
    data[['order_total_num_last','order_amount_last','order_total_payment_last','order_total_discount_last','order_pay_time_last',
          'order_status_last','order_count_last','is_customer_rate_last','order_detail_status_last', 'order_detail_goods_num_last', 
          'order_detail_amount_last','order_detail_payment_last', 'order_detail_discount_last']]= \
        raw.groupby('customer_id')[['order_total_num', 'order_amount','order_total_payment', 'order_total_discount', 'order_pay_time',
               'order_status', 'order_count', 'is_customer_rate','order_detail_status', 'order_detail_goods_num', 
                'order_detail_amount','order_detail_payment', 'order_detail_discount']].last()  

    # last user-member activity
    data[['member_id_last','member_status_last','is_member_actived_last']]= \
        raw.groupby('customer_id')[['member_id','member_status','is_member_actived']].last()
    
    # goods_price
    data[['goods_price_mean', 'goods_price_std', 'goods_price_max', 'goods_price_min']] = \
        raw.groupby('customer_id', as_index=False)['goods_price'].agg(['mean', 'std', 'max', 'min']).drop(['customer_id'], axis=1)
    
    # order_total_payment
    data[['order_total_payment_mean', 'order_total_payment_std', 'order_total_payment_max', 'order_total_payment_min']] = \
        raw.groupby('customer_id', as_index=False)['order_total_payment'].agg(['mean', 'std', 'max', 'min']).drop(['customer_id'], axis=1)
   
    # user total order count
    data[['order_count']] = raw.groupby('customer_id', as_index=False)['order_id'].count().drop(['customer_id'], axis=1)

    # user total goods count
    data[['goods_count']] = raw.groupby('customer_id', as_index=False)['goods_id'].count().drop(['customer_id'], axis=1)

    # # customer_province
    # data[['customer_province']] = raw.groupby('customer_id')['customer_province'].transform('last')

    # # customer_city
    # data[['customer_city']] = raw.groupby('customer_id')['customer_city'].last()

    # is_customer_rate
    data[['customer_rate_mean', 'customer_rate_sum']] =  raw.groupby('customer_id', as_index=False)['is_customer_rate'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order discount
    data[['order_detail_discount_mean', 'order_detail_discount_sum']] =  raw.groupby('customer_id', as_index=False)['order_detail_discount'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # member_status
    data[['member_status_mean', 'member_status_sum']] =  raw.groupby('customer_id', as_index=False)['member_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # goods_status
    data[['goods_status_mean', 'goods_status_sum']] =  raw.groupby('customer_id', as_index=False)['goods_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # is_member_actived
    data[['is_member_actived_mean', 'is_member_actived_sum']] =  raw.groupby('customer_id', as_index=False)['is_member_actived'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_status
    data[['order_status_mean', 'order_status_sum']] =  raw.groupby('customer_id', as_index=False)['order_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # user goods count
    data[['order_detail_goods_num_mean', 'order_detail_goods_num_sum']] =  raw.groupby('customer_id', as_index=False)['order_detail_goods_num'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # goods_has_discount
    data[['goods_has_discount_mean', 'goods_has_discount_sum']] =  raw.groupby('customer_id', as_index=False)['goods_has_discount'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_total_payment
    data[['order_total_payment_mean', 'order_total_payment_sum']] =  raw.groupby('customer_id', as_index=False)['order_total_payment'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_total_num
    data[['order_total_num_mean', 'order_total_num_sum']] =  raw.groupby('customer_id', as_index=False)['order_total_num'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # time
    data['order_pay_time_last'] = pd.to_datetime(data['order_pay_time_last'])
    data['order_pay_time_last_m'] = data['order_pay_time_last'].dt.month
    data['order_pay_time_last_d'] = data['order_pay_time_last'].dt.day
    data['order_pay_time_last_h'] = data['order_pay_time_last'].dt.hour
    data['order_pay_time_last_min'] = data['order_pay_time_last'].dt.minute
    data['order_pay_time_last_s'] = data['order_pay_time_last'].dt.second
    data['order_pay_time_last_weekday'] = data['order_pay_time_last'].dt.weekday
    
    # order_pay_time_last diff
    t_min=pd.to_datetime('2012-10-11 00:00:00')
    data['order_pay_time_last_diff'] = (data['order_pay_time_last']-t_min).dt.days
    
    # goods_list_time last diff    
    data['goods_list_time_last'] =pd.to_datetime(data['goods_list_time_last'])    
    data['goods_list_time_diff'] = (data['goods_list_time_last']-t_min).dt.days
    
    # goods_delist_time last diff
    data['goods_delist_time_last'] =pd.to_datetime(data['goods_delist_time_last'])    
    data['goods_delist_time_diff'] = (data['goods_delist_time_last']-t_min).dt.days
    
    # goods_time_diff
    data['goods_time_diff'] =  data['goods_delist_time_diff']-data['goods_list_time_diff']

    return data

In [6]:
# preprocess data
train_data = preprocess_data(train_raw)
train_data['label'] = train_data.index.isin(label_raw).astype(int)
train_data.drop(['goods_list_time_last','goods_delist_time_last','order_pay_time_last'],axis=1,inplace=True)

# print the keys
print(train_data.keys())
# print the first 3 rows
print(train_data.head(3))
# print the shape of the data
print(train_data.shape)

Index(['customer_gender', 'goods_id_last', 'goods_status_last',
       'goods_price_last', 'goods_has_discount_last', 'order_total_num_last',
       'order_amount_last', 'order_total_payment_last',
       'order_total_discount_last', 'order_status_last', 'order_count_last',
       'is_customer_rate_last', 'order_detail_status_last',
       'order_detail_goods_num_last', 'order_detail_amount_last',
       'order_detail_payment_last', 'order_detail_discount_last',
       'member_id_last', 'member_status_last', 'is_member_actived_last',
       'goods_price_mean', 'goods_price_std', 'goods_price_max',
       'goods_price_min', 'order_total_payment_mean',
       'order_total_payment_std', 'order_total_payment_max',
       'order_total_payment_min', 'order_count', 'goods_count',
       'customer_rate_mean', 'customer_rate_sum', 'order_detail_discount_mean',
       'order_detail_discount_sum', 'member_status_mean', 'member_status_sum',
       'goods_status_mean', 'goods_status_sum', 'is_membe

In [8]:
from sklearn.feature_selection import mutual_info_classif

def fill_nulls_with_mean(df):
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:  # Only apply to numeric columns
            if df[column].isnull().any():
                mean_value = df[column].mean()
                df[column] = df[column].fillna(mean_value)  # Avoid using inplace=True
    return df

def select_features_by_mutual_info(data, target_feature, top_n=20):
    data = fill_nulls_with_mean(data)
    X = data.drop(columns=[target_feature])
    y = data[target_feature].astype('int')

    mi = mutual_info_classif(X, y)
    mi_df = pd.DataFrame({'feature': X.columns, 'mi': mi})

    mi_df = mi_df.sort_values(by='mi', ascending=False)
    top_features = mi_df.head(top_n)

    return top_features


In [9]:
top_features = select_features_by_mutual_info(train_data, 'label', top_n=30)
print(top_features)

                        feature        mi
25      order_total_payment_std  0.261924
34           member_status_mean  0.250750
38       is_member_actived_mean  0.250708
19       is_member_actived_last  0.250488
18           member_status_last  0.250209
21              goods_price_std  0.236049
42  order_detail_goods_num_mean  0.213699
2             goods_status_last  0.210493
24     order_total_payment_mean  0.206818
37             goods_status_sum  0.205719
40            order_status_mean  0.202382
58              goods_time_diff  0.193617
29                  goods_count  0.190058
28                  order_count  0.189645
47         order_total_num_mean  0.185707
20             goods_price_mean  0.176011
22              goods_price_max  0.173705
33    order_detail_discount_sum  0.166564
36            goods_status_mean  0.149263
35            member_status_sum  0.145332
26      order_total_payment_max  0.144665
39        is_member_actived_sum  0.144280
23              goods_price_min  0

In [11]:
train_data = train_data[top_features['feature'].tolist() + ['label']]
print(train_data.head())
# print how many data
print(len(train_data))

             order_total_payment_std  member_status_mean  \
customer_id                                                
1000000                    18.355503                 1.0   
1000034                    18.355503                 1.0   
1000046                    18.355503                 1.0   
1000048                    18.355503                 1.0   
1000069                    18.355503                 1.0   

             is_member_actived_mean  is_member_actived_last  \
customer_id                                                   
1000000                         1.0                     1.0   
1000034                         1.0                     1.0   
1000046                         1.0                     1.0   
1000048                         1.0                     1.0   
1000069                         1.0                     1.0   

             member_status_last  goods_price_std  order_detail_goods_num_mean  \
customer_id                                             

In [12]:
test_data = preprocess_data(raw)
test_data.drop(['goods_list_time_last','goods_delist_time_last','order_pay_time_last'],axis=1,inplace=True)
test_data = test_data[top_features['feature'].tolist()]
# print the shape of the test data
print(test_data.shape)

(1585986, 30)


In [13]:
# save train_data and test_data to pickle files
with open('train_data.pkl', 'wb') as f:
    pickle.dump(train_data, f)
with open('test_data.pkl', 'wb') as f:
    pickle.dump(test_data, f)

In [22]:
# load the data to train
with open('train_data.pkl', 'rb') as f:
    train_data = pickle.load(f)
with open('test_data.pkl', 'rb') as f:
    test_data = pickle.load(f)

print(train_data.keys())


Index(['order_total_payment_std', 'member_status_mean',
       'is_member_actived_mean', 'is_member_actived_last',
       'member_status_last', 'goods_price_std', 'order_detail_goods_num_mean',
       'goods_status_last', 'order_total_payment_mean', 'goods_status_sum',
       'order_status_mean', 'goods_time_diff', 'goods_count', 'order_count',
       'order_total_num_mean', 'goods_price_mean', 'goods_price_max',
       'order_detail_discount_sum', 'goods_status_mean', 'member_status_sum',
       'order_total_payment_max', 'is_member_actived_sum', 'goods_price_min',
       'order_detail_discount_mean', 'order_detail_goods_num_sum',
       'order_status_sum', 'order_total_num_sum', 'order_total_payment_min',
       'order_total_payment_sum', 'order_detail_goods_num_last', 'label'],
      dtype='object')


In [23]:
import lightgbm as lgb
clf = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2500, subsample=1, colsample_bytree=1,
        )
clf.fit(train_data.drop(['label'], axis=1), train_data['label'])

[LightGBM] [Info] Number of positive: 22803, number of negative: 1412601
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.053959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3613
[LightGBM] [Info] Number of data points in the train set: 1435404, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015886 -> initscore=-4.126296
[LightGBM] [Info] Start training from score -4.126296


In [32]:
y_pred = clf.predict_proba(test_data)[:, 1]
print(y_pred.shape)
result = pd.read_csv('submission.csv')
result['result'] = y_pred
final_result = result.sort_values(by='result', ascending=False).copy()
buy_num = 400000
final_result.index = range(len(final_result))
final_result.loc[result.index <= buy_num, 'result'] = 1
final_result.loc[result.index > buy_num, 'result'] = 0
final_result.sort_values(by='customer_id', inplace=True)
final_result.to_csv('submission_test.csv', index=False)

(1585986,)


In [31]:
print(final_result.head(10))
print(y_pred[:10])
print(test_data.head(10))

        customer_id  result
282116      1000000     1.0
336888      1000014     1.0
311252      1000034     1.0
143215      1000046     1.0
67361       1000048     1.0
456443      1000069     0.0
295771      1000084     1.0
295731      1000099     1.0
152875      1000105     1.0
185830      1000109     1.0
[0.01363345 0.01205543 0.0127731  0.01758268 0.02224426 0.00935132
 0.01329217 0.01329217 0.01726277 0.01649906]
             order_total_payment_std  member_status_mean  \
customer_id                                                
1000000                          NaN                 NaN   
1000014                          NaN                 NaN   
1000034                          NaN                 NaN   
1000046                          NaN                 NaN   
1000048                          NaN                 NaN   
1000069                          NaN                 NaN   
1000084                          NaN                 NaN   
1000099                          NaN   