In [12]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

pickle_path = "../pickle"
deal_device_new_app = pd.read_pickle("{}/deal_device_new_app.pickle".format(pickle_path))
device_new_app = pd.read_pickle("{}/device_new_app.pickle".format(pickle_path))

In [13]:
deal_device_new_app.head()

Unnamed: 0,deviceid,applist
0,000046581b8a28c431be90c278674925,app_133
1,000046581b8a28c431be90c278674925,app_1
2,00016381ab699d4e76dc99291e79e7a1,app_133
3,0001c7e6a85a3a4498fe0c5f29f3a379,app_133
4,000207c515d01c00e9144c6866b546a7,app_133


In [14]:
device_new_app.head()

Unnamed: 0,deviceid,applist,app_len
0,000046581b8a28c431be90c278674925,"[app_133, app_1]",2
1,00016381ab699d4e76dc99291e79e7a1,[app_133],1
2,0001c7e6a85a3a4498fe0c5f29f3a379,[app_133],1
3,000207c515d01c00e9144c6866b546a7,"[app_133, app_1]",2
4,000355d66e3fe127c8c2dd1ef60322a3,"[app_84, app_85, app_4, app_5, app_86, app_87,...",86


In [15]:
from tqdm import tqdm

In [16]:
from collections import Counter

def Gini(pr):
    cate=Counter(pr)
    gini=1
    total=len(pr)
    for i in cate:
        gini=gini-(cate[i]/total)**2
    return gini

import math
# Calc Entropy
def entropy(pr):
    cate = Counter(pr)
    log2 = math.log2
    total = len(pr)
    ent = 0
    for i in cate:
        p = float(cate[i] / total)
        if p==0:
            ent=0
            continue
        ent = ent - p * (log2(p))
    return ent

def get_small(x,a,b):
    return int((x>a) & (x<b))

def get_feature_flatten(df):
    
    fea = []
    # Count Encoder
    t1 = time.time()
    df['appid_count'] = df.groupby(['applist'])['deviceid'].transform('count')
#     print(df.isnull().any())
#     print(df[['deviceid','appid_count']].groupby(['deviceid'])['appid_count'].transform('count').value_counts())
    fea.append(df[['deviceid','appid_count']].groupby(['deviceid'])['appid_count'].agg({'mean','std','min','max','median'}))
    fea.append(pd.DataFrame(df[['deviceid','applist']].groupby(['deviceid'])['applist'].apply(Gini)).rename(columns = {'applist':"Gini"}))
    fea.append(pd.DataFrame(df[['deviceid','applist']].groupby(['deviceid'])['applist'].apply(entropy)).rename(columns = {'applist':"entropy"}))
    df['appid_count_0_1e3'] = df['appid_count'].map(lambda x:get_small(x,0,1e3))
    df['appid_count_1e3_1e4'] = df['appid_count'].map(lambda x:get_small(x,1e3,1e4))
    df['appid_count_1e4_2e5'] = df['appid_count'].map(lambda x:get_small(x,1e4,2e5))
    tmp = df.groupby(['deviceid']).agg({'appid_count_0_1e3' : ['mean','sum','std'],
                          'appid_count_1e3_1e4' : ['mean','sum','std'],
                          'appid_count_1e4_2e5' : ['mean','sum','std']
                          })

    tmp.columns = ['_'.join(col).strip() for col in tmp.columns.values]
    fea.append(tmp)
    print("Count Active Finish... :",time.time()-t1)
    
    return fea

In [17]:
deviceid = device_new_app[['deviceid']]

In [18]:
deviceid.shape

(114584, 1)

In [19]:
fea0 = get_feature_flatten(deal_device_app)

Count Active Finish... : 24.356213569641113


In [20]:
for i in tqdm(fea0):
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print(i.columns)

100%|██████████| 4/4 [00:00<00:00, 1284.13it/s]

~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index(['std', 'max', 'mean', 'min', 'median'], dtype='object')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index(['Gini'], dtype='object')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index(['entropy'], dtype='object')
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Index(['appid_count_0_1e3_mean', 'appid_count_0_1e3_sum',
       'appid_count_0_1e3_std', 'appid_count_1e3_1e4_mean',
       'appid_count_1e3_1e4_sum', 'appid_count_1e3_1e4_std',
       'appid_count_1e4_2e5_mean', 'appid_count_1e4_2e5_sum',
       'appid_count_1e4_2e5_std'],
      dtype='object')





In [21]:
for i in tqdm(fea0):
    deviceid = deviceid.merge(i,how='left',on='deviceid')

100%|██████████| 4/4 [00:00<00:00, 18.48it/s]


In [22]:
deviceid.head()

Unnamed: 0,deviceid,std,max,mean,min,median,Gini,entropy,appid_count_0_1e3_mean,appid_count_0_1e3_sum,appid_count_0_1e3_std,appid_count_1e3_1e4_mean,appid_count_1e3_1e4_sum,appid_count_1e3_1e4_std,appid_count_1e4_2e5_mean,appid_count_1e4_2e5_sum,appid_count_1e4_2e5_std
0,000046581b8a28c431be90c278674925,19969.402607,82968,68847.5,54727,68847.5,0.5,1.0,0.0,0,0.0,0.0,0,0.0,1.0,2,0.0
1,00016381ab699d4e76dc99291e79e7a1,,82968,82968.0,82968,82968.0,0.0,0.0,0.0,0,,0.0,0,,1.0,1,
2,0001c7e6a85a3a4498fe0c5f29f3a379,,82968,82968.0,82968,82968.0,0.0,0.0,0.0,0,,0.0,0,,1.0,1,
3,000207c515d01c00e9144c6866b546a7,19969.402607,82968,68847.5,54727,68847.5,0.5,1.0,0.0,0,0.0,0.0,0,0.0,1.0,2,0.0
4,000355d66e3fe127c8c2dd1ef60322a3,8285.166665,54727,7697.232558,8,5298.0,0.988372,6.426265,0.151163,13,0.360308,0.593023,51,0.494152,0.255814,22,0.438877


In [23]:
deviceid.shape

(114584, 17)

In [24]:
deviceid = deviceid.set_index('deviceid').add_prefix("app_").reset_index()

In [25]:
deviceid.head()

Unnamed: 0,deviceid,app_std,app_max,app_mean,app_min,app_median,app_Gini,app_entropy,app_appid_count_0_1e3_mean,app_appid_count_0_1e3_sum,app_appid_count_0_1e3_std,app_appid_count_1e3_1e4_mean,app_appid_count_1e3_1e4_sum,app_appid_count_1e3_1e4_std,app_appid_count_1e4_2e5_mean,app_appid_count_1e4_2e5_sum,app_appid_count_1e4_2e5_std
0,000046581b8a28c431be90c278674925,19969.402607,82968,68847.5,54727,68847.5,0.5,1.0,0.0,0,0.0,0.0,0,0.0,1.0,2,0.0
1,00016381ab699d4e76dc99291e79e7a1,,82968,82968.0,82968,82968.0,0.0,0.0,0.0,0,,0.0,0,,1.0,1,
2,0001c7e6a85a3a4498fe0c5f29f3a379,,82968,82968.0,82968,82968.0,0.0,0.0,0.0,0,,0.0,0,,1.0,1,
3,000207c515d01c00e9144c6866b546a7,19969.402607,82968,68847.5,54727,68847.5,0.5,1.0,0.0,0,0.0,0.0,0,0.0,1.0,2,0.0
4,000355d66e3fe127c8c2dd1ef60322a3,8285.166665,54727,7697.232558,8,5298.0,0.988372,6.426265,0.151163,13,0.360308,0.593023,51,0.494152,0.255814,22,0.438877


In [26]:
deviceid.to_pickle("../pickle/app_flatten_all_data.pickle")