In [1]:
import sys
sys.path.append('../../..')

import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import pandas as pd, numpy as np, gc
from tqdm import tqdm
import joblib
import xgboost as xgb
import pickle
pd.set_option('display.max_columns', 500)


from utils.util import *
from utils.evaluate import calculate_ctr, compute_rce, average_precision_score
from utils.dataiter import Dataiter
from utils.preprocessing import *
from utils.target_encode import MTE_one_shot

import core.config as conf

In [2]:
path = f'{conf.raw_lzo_path}/part-00000'
train = read_data(path)
path = f'{conf.raw_lzo_path}/part-00001'
train2 = read_data(path)
path = f'{conf.raw_lzo_path}/part-00002'
valid = read_data(path)
gc.collect()
save_memory( train )

In [3]:
import numpy as np
class MTE_one_shot:
    
    def __init__(self, folds, smooth, seed=42):
        self.folds = folds
        self.seed = seed
        self.smooth = smooth
        # self.agg_all = pd.DataFrame()
        
    def fit_transform(self, train, x_col, y_col, y_mean=None, out_col = None, out_dtype=None):
        
        self.y_col = y_col
        np.random.seed(self.seed)
        
        if 'fold' not in train.columns:
            fsize = len(train)//self.folds
            train['fold'] = 1
            train['fold'] = train['fold'].cumsum()
            train['fold'] = train['fold']//fsize
            train['fold'] = train['fold']%self.folds
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        
        if y_mean is None:
            y_mean = train[y_col].mean()#.compute().astype('float32')
        self.mean = y_mean # mean도 누적해서 바꿔주면 좋을듯
        
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        
        agg_each_fold = train.groupby(cols).agg({y_col:['count','sum']}).reset_index()
        agg_each_fold.columns = cols + ['count_y','sum_y']
        
        agg_all = agg_each_fold.groupby(x_col).agg({'count_y':'sum','sum_y':'sum'}).reset_index()
        cols = [x_col] if isinstance(x_col,str) else x_col
        agg_all.columns = cols + ['count_y_all','sum_y_all']
        
        agg_each_fold = agg_each_fold.merge(agg_all,on=x_col,how='left')
        agg_each_fold['count_y_all'] = agg_each_fold['count_y_all'] - agg_each_fold['count_y']
        agg_each_fold['sum_y_all'] = agg_each_fold['sum_y_all'] - agg_each_fold['sum_y']
        agg_each_fold[out_col] = (agg_each_fold['sum_y_all']+self.smooth*self.mean)/(agg_each_fold['count_y_all']+self.smooth)
        agg_each_fold = agg_each_fold.drop(['count_y_all','count_y','sum_y_all','sum_y'],axis=1)
        
        agg_all[out_col] = (agg_all['sum_y_all']+self.smooth*self.mean)/(agg_all['count_y_all']+self.smooth)
        agg_all = agg_all.drop(['count_y_all','sum_y_all'],axis=1)
        
        if hasattr(self, 'agg_all'):
            print('train2')
            self.agg_all = pd.concat([self.agg_all, agg_all])
            
        else:
            print('train1')
            self.agg_all = agg_all
        
        self.agg_all = self.agg_all.drop_duplicates(keep='last')
        # agg_all.to_csv('agg_all.csv', index=False)

        train.columns
        cols = ['fold',x_col] if isinstance(x_col,str) else ['fold']+x_col
        train = train.merge(agg_each_fold,on=cols,how='left')
        del agg_each_fold
        #self.agg_each_fold = agg_each_fold
        #train[out_col] = train.map_partitions(lambda cudf_df: cudf_df[out_col].nans_to_nulls())
        train[out_col] = train[out_col].fillna(self.mean)
        
        if out_dtype is not None:
            train[out_col] = train[out_col].astype(out_dtype)
        return train
    
    def transform(self, test, x_col, out_col = None, out_dtype=None):
        
        # self.agg_all = pd.read_csv('agg_all.csv')
        print(len(self.agg_all))
        
        if out_col is None:
            tag = x_col if isinstance(x_col,str) else '_'.join(x_col)
            out_col = f'TE_{tag}_{self.y_col}'
        test = test.merge(self.agg_all,on=x_col,how='left')
        test[out_col] = test[out_col].fillna(self.mean)
        # test[out_col] = test[out_col].fillna(0)
        if out_dtype is not None:
            test[out_col] = test[out_col].astype(out_dtype)
        return test

In [4]:
def set_dataframe_types(df, train):
    df['id']   = np.arange( df.shape[0] )
    df['id']   = df['id'].astype(np.uint32)

    if train:
        df['reply_timestamp']   = df['reply_timestamp'].fillna(0)
        df['retweet_timestamp'] = df['retweet_timestamp'].fillna(0)
        df['comment_timestamp'] = df['comment_timestamp'].fillna(0)
        df['like_timestamp']    = df['like_timestamp'].fillna(0)

        df['reply_timestamp']   = df['reply_timestamp'].astype(np.uint32)
        df['retweet_timestamp'] = df['retweet_timestamp'].astype(np.uint32)
        df['comment_timestamp'] = df['comment_timestamp'].astype(np.uint32)
        df['like_timestamp']    = df['like_timestamp'].astype(np.uint32)

    df['tweet_timestamp']         = df['tweet_timestamp'].astype( np.uint32 )
    df['creator_follower_count']  = df['creator_follower_count'].astype( np.uint32 )
    df['creator_following_count'] = df['creator_following_count'].astype( np.uint32 )
    df['creator_account_creation']= df['creator_account_creation'].astype( np.uint32 )
    df['engager_follower_count']  = df['engager_follower_count'].astype( np.uint32 )
    df['engager_following_count'] = df['engager_following_count'].astype( np.uint32 )
    df['engager_account_creation']= df['engager_account_creation'].astype( np.uint32 )

    return df



def preprocess(df, target, train):
    df = set_dataframe_types(df, train)
    # df = df.set_index('id')
    # df.columns = conf.raw_features + conf.labels
    df = df.drop('text_tokens', axis=1)
    
    df = feature_extraction(df, features=conf.used_features, train=train) # extract 'used_features'
    cols = []
    print('target_encode')
    for c in tqdm([
        # ['engager_id'],
        ['engager_id','tweet_type','language'],
        # ['creator_id'],
        # ['domains','media','tweet_type','language']
        ]):
        out_col = 'TE_'+'_'.join(c)+'_'+target
        if os.path.exists('./encoder.pkl'):
            with open('./encoder.pkl', 'rb') as f:
                encoder = pickle.load(f)
        else:
            encoder = MTE_one_shot(folds=5,smooth=20)

        if train:
            
            df = encoder.fit_transform(df, c, target, out_col=out_col, out_dtype='float32')
            with open('encoder.pkl', 'wb') as f:
                pickle.dump(encoder, f)
        else:
                df = encoder.transform(df, c, out_col=out_col, out_dtype='float32')
        

        cols.append(out_col)
        del encoder


    return df

In [5]:

TARGET = 'like'
train = preprocess(train, TARGET, True)


  0%|          | 0/1 [00:00<?, ?it/s]target_encode
train1
100%|██████████| 1/1 [00:17<00:00, 17.38s/it]


In [6]:
train2 = preprocess(train2, TARGET, True)


  0%|          | 0/1 [00:00<?, ?it/s]target_encode
train2
100%|██████████| 1/1 [00:19<00:00, 19.42s/it]


In [7]:
valid = preprocess(valid, TARGET, False)


  0%|          | 0/1 [00:00<?, ?it/s]target_encode
5188745
100%|██████████| 1/1 [00:04<00:00,  4.69s/it]


In [13]:
label_names = ['reply', 'retweet', 'comment', 'like']
DONT_USE = ['timestamp','creator_account_creation','engager_account_creation','engage_time',
            'creator_account_creation', 'engager_account_creation',
            'fold','tweet_id', 
            'tr','dt_day','','ypred',
            'engager_id','creator_id','engager_is_verified',
            'elapsed_time',
            'links','domains','hashtags0','hashtags1',
            'hashtags','tweet_hash','dt_second','id',
            'tw_hash0',
            'tw_hash1',
            'tw_rt_uhash',
            'same_language', 'nan_language','language',
            'tw_hash', 'tw_freq_hash','tw_first_word', 'tw_second_word', 'tw_last_word', 'tw_llast_word',
            'ypred','creator_count_combined','creator_user_fer_count_delta_time','creator_user_fing_count_delta_time','creator_user_fering_count_delta_time','creator_user_fing_count_mode','creator_user_fer_count_mode','creator_user_fering_count_mode'
            'reply', 'retweet', 'comment', 'like', 'pred'
            
           ]
DONT_USE += label_names
DONT_USE += conf.labels


In [14]:
xgb_parms = { 
                'max_depth':8, 
                'learning_rate':0.025, 
                'subsample':0.85,
                'colsample_bytree':0.35, 
                'eval_metric':'logloss',
                'objective':'binary:logistic',
                'tree_method':'gpu_hist',
                #'predictor': 'gpu_predictor',
                'seed': 1,
            }
LR = [0.05,0.03,0.07,0.01]
xgb_parms['learning_rate'] = LR[3]
RMV = [c for c in DONT_USE if c in train.columns]
dtrain = xgb.DMatrix(data=train.drop(RMV, axis=1) ,label=train[TARGET].values)
model = xgb.train(xgb_parms, 
                        dtrain=dtrain,
                        num_boost_round=500,
                        ) 

In [15]:
train.drop(RMV, axis=1).columns

Index(['tweet_type', 'creator_follower_count', 'creator_following_count',
       'media', 'tweet_timestamp', 'dt_dow', 'dt_hour', 'len_domains',
       'TE_engager_id_tweet_type_language_like'],
      dtype='object')

In [16]:
RMV = [c for c in DONT_USE if c in valid.columns]
dvalid = xgb.DMatrix(data=valid.drop(RMV, axis=1) )
valid['pred'] = model.predict(dvalid)

In [24]:
valid.columns

Index(['creator_id', 'engager_id', 'tweet_id', 'tweet_type', 'language',
       'creator_follower_count', 'creator_following_count', 'domains', 'media',
       'tweet_timestamp', 'dt_day', 'dt_dow', 'dt_hour', 'len_domains',
       'TE_engager_id_like', 'pred'],
      dtype='object')

NameError: name 'df' is not defined

In [17]:
valid['pred']

0          0.615871
1          0.615871
2          0.498268
3          0.363982
4          0.429048
             ...   
5635399    0.409211
5635400    0.272296
5635401    0.439516
5635402    0.296145
5635403    0.296145
Name: pred, Length: 5635404, dtype: float32

In [36]:
agg_all = pd.read_csv('agg_all.csv')

In [37]:
agg_all

Unnamed: 0,engager_id,tweet_type,language,TE_engager_id_tweet_type_language_like
0,0000030E0DCCFDF9DBF2DDC031E6DA58,1,19,0.379077
1,0000055BD24EE1ED318EAC7970A78849,1,10,0.379077
2,000005BCF00DCCEABCF7F82BDCFB3543,1,10,0.379077
3,000013136E63BA1782731FA3E59F7A30,2,19,0.426697
4,00001353EE2339E074323A06CCC5D89E,2,61,0.426697
...,...,...,...,...
2593956,FFFFE27E2E0BDCCDC5D8E1BF1A50E5B2,1,61,0.379077
2593957,FFFFE6F4A73D42C558AB2BB375109AC5,2,46,0.426697
2593958,FFFFE7D1852F5FC078F31004CDC344D6,2,48,0.426697
2593959,FFFFED3C5DFF871BFE0C8FF745396A7D,1,61,0.379077


In [None]:
a = np.load('agg_all.npy')

In [54]:
int('000005BCF00DCCEABCF7F82BDCFB3543', 16)

116381235657025742637517049115971