In [2]:
import pandas as pd
import numpy as np
import pickle
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
from sklearn.model_selection import train_test_split


In [5]:
raw = pd.read_csv('train.csv')
raw.sort_values(by='order_pay_time', ascending=True, inplace=True)

# # print the keys
# print(raw.keys())

# # print max 'order_pay_time'
# print(raw['order_pay_time'].max())

# # print how many data
# print(len(raw))

print(raw.info())
print(raw.describe())

print(raw.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2306871 entries, 2021321 to 492847
Data columns (total 29 columns):
 #   Column                  Dtype  
---  ------                  -----  
 0   order_detail_id         int64  
 1   order_id                int64  
 2   order_total_num         float64
 3   order_amount            float64
 4   order_total_payment     float64
 5   order_total_discount    float64
 6   order_pay_time          object 
 7   order_status            int64  
 8   order_count             float64
 9   is_customer_rate        float64
 10  order_detail_status     float64
 11  order_detail_goods_num  float64
 12  order_detail_amount     float64
 13  order_detail_payment    float64
 14  order_detail_discount   float64
 15  customer_province       object 
 16  customer_city           object 
 17  member_id               float64
 18  customer_id             int64  
 19  customer_gender         float64
 20  member_status           float64
 21  is_member_actived       fl

In [3]:
def preprocess_data(raw):
    # gender fill 0 if null
    data = pd.DataFrame(raw.groupby('customer_id')['customer_gender'].last().fillna(0))

    # last user-good activity
    data[['goods_id_last','goods_status_last','goods_price_last','goods_has_discount_last','goods_list_time_last',
          'goods_delist_time_last']]= \
          raw.groupby('customer_id')[['goods_id', 'goods_status', 'goods_price', 'goods_has_discount', 'goods_list_time', 'goods_delist_time']].last()
    
    # last user-order activity
    data[['order_total_num_last','order_amount_last','order_total_payment_last','order_total_discount_last','order_pay_time_last',
          'order_status_last','order_count_last','is_customer_rate_last','order_detail_status_last', 'order_detail_goods_num_last', 
          'order_detail_amount_last','order_detail_payment_last', 'order_detail_discount_last']]= \
        raw.groupby('customer_id')[['order_total_num', 'order_amount','order_total_payment', 'order_total_discount', 'order_pay_time',
               'order_status', 'order_count', 'is_customer_rate','order_detail_status', 'order_detail_goods_num', 
                'order_detail_amount','order_detail_payment', 'order_detail_discount']].last()  

    # last user-member activity
    data[['member_id_last','member_status_last','is_member_actived_last']]= \
        raw.groupby('customer_id')[['member_id','member_status','is_member_actived']].last()
    
    # goods_price
    data[['goods_price_mean', 'goods_price_std', 'goods_price_max', 'goods_price_min']] = \
        raw.groupby('customer_id', as_index=False)['goods_price'].agg(['mean', 'std', 'max', 'min']).drop(['customer_id'], axis=1)
    
    # order_total_payment
    data[['order_total_payment_mean', 'order_total_payment_std', 'order_total_payment_max', 'order_total_payment_min']] = \
        raw.groupby('customer_id', as_index=False)['order_total_payment'].agg(['mean', 'std', 'max', 'min']).drop(['customer_id'], axis=1)
   
    # user total order count
    data[['order_count']] = raw.groupby('customer_id', as_index=False)['order_id'].count().drop(['customer_id'], axis=1)

    # user total goods count
    data[['goods_count']] = raw.groupby('customer_id', as_index=False)['goods_id'].count().drop(['customer_id'], axis=1)

    # # customer_province
    # data[['customer_province']] = raw.groupby('customer_id')['customer_province'].transform('last')

    # # customer_city
    # data[['customer_city']] = raw.groupby('customer_id')['customer_city'].last()

    # is_customer_rate
    data[['customer_rate_mean', 'customer_rate_sum']] =  raw.groupby('customer_id', as_index=False)['is_customer_rate'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order discount
    data[['order_detail_discount_mean', 'order_detail_discount_sum']] =  raw.groupby('customer_id', as_index=False)['order_detail_discount'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # member_status
    data[['member_status_mean', 'member_status_sum']] =  raw.groupby('customer_id', as_index=False)['member_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # goods_status
    data[['goods_status_mean', 'goods_status_sum']] =  raw.groupby('customer_id', as_index=False)['goods_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # is_member_actived
    data[['is_member_actived_mean', 'is_member_actived_sum']] =  raw.groupby('customer_id', as_index=False)['is_member_actived'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_status
    data[['order_status_mean', 'order_status_sum']] =  raw.groupby('customer_id', as_index=False)['order_status'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # user goods count
    data[['order_detail_goods_num_mean', 'order_detail_goods_num_sum']] =  raw.groupby('customer_id', as_index=False)['order_detail_goods_num'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # goods_has_discount
    data[['goods_has_discount_mean', 'goods_has_discount_sum']] =  raw.groupby('customer_id', as_index=False)['goods_has_discount'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_total_payment
    data[['order_total_payment_mean', 'order_total_payment_sum']] =  raw.groupby('customer_id', as_index=False)['order_total_payment'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # order_total_num
    data[['order_total_num_mean', 'order_total_num_sum']] =  raw.groupby('customer_id', as_index=False)['order_total_num'].agg(['mean', 'sum']).drop(['customer_id'], axis=1)

    # time
    data['order_pay_time_last'] = pd.to_datetime(data['order_pay_time_last'])
    data['order_pay_time_last_m'] = data['order_pay_time_last'].dt.month
    data['order_pay_time_last_d'] = data['order_pay_time_last'].dt.day
    data['order_pay_time_last_h'] = data['order_pay_time_last'].dt.hour
    data['order_pay_time_last_min'] = data['order_pay_time_last'].dt.minute
    data['order_pay_time_last_s'] = data['order_pay_time_last'].dt.second
    data['order_pay_time_last_weekday'] = data['order_pay_time_last'].dt.weekday
    
    # order_pay_time_last diff
    t_min=pd.to_datetime('2012-10-11 00:00:00')
    data['order_pay_time_last_diff'] = (data['order_pay_time_last']-t_min).dt.days
    
    # goods_list_time last diff    
    data['goods_list_time_last'] =pd.to_datetime(data['goods_list_time_last'])    
    data['goods_list_time_diff'] = (data['goods_list_time_last']-t_min).dt.days
    
    # goods_delist_time last diff
    data['goods_delist_time_last'] =pd.to_datetime(data['goods_delist_time_last'])    
    data['goods_delist_time_diff'] = (data['goods_delist_time_last']-t_min).dt.days
    
    # goods_time_diff
    data['goods_time_diff'] =  data['goods_delist_time_diff']-data['goods_list_time_diff']
    
    data.drop(['goods_list_time_last', 'goods_delist_time_last', 'order_pay_time_last'], axis=1, inplace=True)

    return data

In [4]:
# Build dataset for with data until a specific time.
# If is_train, the returned dataset contains column 'labels'
def build_dataset(raw, until, is_train):
    data = raw[raw['order_pay_time'] <= until]
    data = preprocess_data(data)
    if is_train:
        label_raw = set(raw[raw['order_pay_time'] > until]['customer_id'].dropna())
        data['label'] = data.index.isin(label_raw).astype(int)
    return data

In [21]:
train_cutoff = '2013-07-31 23:59:59'
train_data = build_dataset(raw, train_cutoff, is_train=True)

# print the keys
print(train_data.keys())
# print the first 3 rows
print(train_data.head(3))
# print the shape of the data
print(train_data.shape)

Index(['customer_gender', 'goods_id_last', 'goods_status_last',
       'goods_price_last', 'goods_has_discount_last', 'order_total_num_last',
       'order_amount_last', 'order_total_payment_last',
       'order_total_discount_last', 'order_status_last', 'order_count_last',
       'is_customer_rate_last', 'order_detail_status_last',
       'order_detail_goods_num_last', 'order_detail_amount_last',
       'order_detail_payment_last', 'order_detail_discount_last',
       'member_id_last', 'member_status_last', 'is_member_actived_last',
       'goods_price_mean', 'goods_price_std', 'goods_price_max',
       'goods_price_min', 'order_total_payment_mean',
       'order_total_payment_std', 'order_total_payment_max',
       'order_total_payment_min', 'order_count', 'goods_count',
       'customer_rate_mean', 'customer_rate_sum', 'order_detail_discount_mean',
       'order_detail_discount_sum', 'member_status_mean', 'member_status_sum',
       'goods_status_mean', 'goods_status_sum', 'is_membe

In [6]:
def fill_nulls_with_mean(df):
    for column in df.columns:
        if df[column].dtype in ['float64', 'int64']:  # Only apply to numeric columns
            if df[column].isnull().any():
                mean_value = df[column].mean()
                df[column] = df[column].fillna(mean_value)  # Avoid using inplace=True
    return df

def select_features_by_mutual_info(data, target_feature, top_n=20):
    data = fill_nulls_with_mean(data)
    X = data.drop(columns=[target_feature])
    y = data[target_feature].astype('int')

    mi = mutual_info_classif(X, y)
    mi_df = pd.DataFrame({'feature': X.columns, 'mi': mi})

    mi_df = mi_df.sort_values(by='mi', ascending=False)
    top_features = mi_df.head(top_n)

    return top_features

def select_features_by_anova(data, target_feature, top_n=20):
    data = fill_nulls_with_mean(data)
    X = data.drop(columns=[target_feature])
    y = data[target_feature].astype(int)

    selector = SelectKBest(score_func=f_classif, k=top_n)
    selector.fit(X, y)

    scores = selector.scores_
    feature_names = X.columns
    return pd.DataFrame({'feature': feature_names, 'score': scores}).sort_values(by='score', ascending=False).head(top_n)

In [7]:
top_features = select_features_by_anova(train_data, 'label', top_n=30)
print(top_features)

                        feature         score
55     order_pay_time_last_diff  20176.161882
4       goods_has_discount_last  12517.664029
17               member_id_last   7836.551920
0               customer_gender   2878.713261
2             goods_status_last   2177.192464
58              goods_time_diff   1366.361933
3              goods_price_last    815.713403
50        order_pay_time_last_d    541.959637
9             order_status_last    397.122302
12     order_detail_status_last    377.775460
54  order_pay_time_last_weekday    374.359746
14     order_detail_amount_last     99.696211
15    order_detail_payment_last     90.223244
8     order_total_discount_last     83.614045
1                 goods_id_last     82.798681
16   order_detail_discount_last     80.091719
52      order_pay_time_last_min     71.252507
53        order_pay_time_last_s     57.111136
39        is_member_actived_sum     56.821013
35            member_status_sum     56.821013
7      order_total_payment_last   

  f = msb / msw


In [1]:
# Filter the train and test data to keep only the top features
APPLY_FEATURE_SELECTION = False

In [20]:
test_cutoff = '2013-08-31 23:59:59'
test_data = build_dataset(raw, test_cutoff, is_train=False)

if APPLY_FEATURE_SELECTION:
    selected_features = top_features['feature'].tolist()
    train_data = train_data[selected_features + ['label']]
    test_data = test_data[selected_features]
    
print(train_data.head())
# print how many data
print(len(train_data))
# print the shape of the test data
print(test_data.shape)

             order_pay_time_last_diff  goods_has_discount_last  \
customer_id                                                      
1000000                            21                      0.0   
1000034                           148                      0.0   
1000046                           239                      1.0   
1000048                            62                      0.0   
1000069                           275                      1.0   

             member_id_last  customer_gender  goods_status_last  \
customer_id                                                       
1000000                 0.0              0.0                1.0   
1000034                 0.0              0.0                2.0   
1000046                 0.0              0.0                1.0   
1000048                 0.0              0.0                1.0   
1000069                 0.0              0.0                1.0   

             goods_time_diff  goods_price_last  order_pay_time_last

In [22]:
# save train_data and test_data to pickle files
if APPLY_FEATURE_SELECTION:
    with open('MI_train_data.pkl', 'wb') as f:
        pickle.dump(train_data, f)
    with open('MI_test_data.pkl', 'wb') as f:
        pickle.dump(test_data, f)
else:
    with open('train_data.pkl', 'wb') as f:
        pickle.dump(train_data, f)
    with open('test_data.pkl', 'wb') as f:
        pickle.dump(test_data, f)

In [3]:
# load the data to train
if APPLY_FEATURE_SELECTION:
    with open('MI_train_data.pkl', 'rb') as f:
        train_data = pickle.load(f)
    with open('MI_test_data.pkl', 'rb') as f:
        test_data = pickle.load(f)
else:
    with open('train_data.pkl', 'rb') as f:
        train_data = pickle.load(f)
    with open('test_data.pkl', 'rb') as f:
        test_data = pickle.load(f)

print(train_data.keys())
value_counts = train_data['label'].value_counts()
print(value_counts)

Index(['order_pay_time_last_diff', 'goods_has_discount_last', 'member_id_last',
       'customer_gender', 'goods_status_last', 'goods_time_diff',
       'goods_price_last', 'order_pay_time_last_d', 'order_status_last',
       'order_detail_status_last', 'order_pay_time_last_weekday',
       'order_detail_amount_last', 'order_detail_payment_last',
       'order_total_discount_last', 'goods_id_last',
       'order_detail_discount_last', 'order_pay_time_last_min',
       'order_pay_time_last_s', 'is_member_actived_sum', 'member_status_sum',
       'order_total_payment_last', 'order_amount_last',
       'order_pay_time_last_h', 'order_detail_discount_sum',
       'goods_status_sum', 'order_count', 'goods_count',
       'goods_delist_time_diff', 'order_pay_time_last_m',
       'order_total_payment_max', 'label'],
      dtype='object')
label
0    1412601
1      22803
Name: count, dtype: int64


In [5]:
import lightgbm as lgb
clf = lgb.LGBMClassifier(
            num_leaves=2**5-1, reg_alpha=0.25, reg_lambda=0.25, objective='binary',
            max_depth=-1, learning_rate=0.005, min_child_samples=3, random_state=2021,
            n_estimators=2500, subsample=1, colsample_bytree=1,
        )

In [6]:
X = train_data.drop('label', axis=1)
y = train_data['label']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2, 
    stratify=y, # This is the key parameter to ensure proportional splitting
    random_state=42 # For reproducible results
)

clf.fit(X_train, y_train)

y_valid_pred_proba = clf.predict_proba(X_val)[:, 1]

[LightGBM] [Info] Number of positive: 18242, number of negative: 1130081
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.030373 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3549
[LightGBM] [Info] Number of data points in the train set: 1148323, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015886 -> initscore=-4.126318
[LightGBM] [Info] Start training from score -4.126318


In [None]:
# predict labels based on a threshold
threshold = 0.1
y_pred = (y_valid_pred_proba > threshold).astype(int)
auc_bin = balanced_accuracy_score(y_val, y_pred)
print(f'The balanced accuracy score on the validation set is: {auc_bin:.4f}')

The balanced accuracy score on the validation set is: 0.6422


In [8]:
# predict labels based on top n probabilities
for n in [10000, 20000, 30000, 35000, 40000, 45000, 50000, 60000]:
    top_indices = np.argsort(y_valid_pred_proba)[::-1][:n]
    y_final_pred = np.zeros_like(y_valid_pred_proba, dtype=int)
    y_final_pred[top_indices] = 1
    y_true = y_val
    score = balanced_accuracy_score(y_true, y_final_pred)
    print(f"n={n}, score: {score:.4f}")


n=10000, score: 0.6521
n=20000, score: 0.6954
n=30000, score: 0.7153
n=35000, score: 0.7243
n=40000, score: 0.7291
n=45000, score: 0.7334
n=50000, score: 0.7393
n=60000, score: 0.7403


In [None]:
# using test_data to train and predict
clf.fit(train_data.drop(['label'], axis=1), train_data['label'])
y_pred = clf.predict_proba(test_data)[:, 1]
result = pd.read_csv('submission.csv')
result['result'] = y_pred
final_result = result.sort_values(by='result', ascending=False).copy()
buy_num = 450000
final_result.index = range(len(final_result))
final_result.loc[result.index <= buy_num, 'result'] = 1
final_result.loc[result.index > buy_num, 'result'] = 0
final_result.sort_values(by='customer_id', inplace=True)
final_result.to_csv('submission_test.csv', index=False)

[LightGBM] [Info] Number of positive: 22803, number of negative: 1412601
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.138470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6908
[LightGBM] [Info] Number of data points in the train set: 1435404, number of used features: 59
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.015886 -> initscore=-4.126296
[LightGBM] [Info] Start training from score -4.126296
customer_id  result
1000000      0.0       1
2217950      1.0       1
2217924      1.0       1
2217923      1.0       1
2217922      1.0       1
                      ..
1606344      0.0       1
1606343      0.0       1
1606342      0.0       1
1606341      0.0       1
2826574      1.0       1
Name: count, Length: 1585986, dtype: int64
