In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score,f1_score,recall_score

import gc
import time
import os
import sys
import warnings
warnings.filterwarnings('ignore')

pickle_path = "../pickle"

train = pd.read_csv("../data/train.csv")
test = pd.read_csv("../data/test.csv")

app_data_nlp = pd.read_pickle("{}/device_new_app.pickle".format(pickle_path))
user_fav_nlp = pd.read_pickle("{}/tag_weight_new_data.pickle".format(pickle_path))

all_data = train.append(test,sort = False)
# all_data = all_data[['target','deviceid']]
all_data = all_data.reset_index(drop=True)
print(all_data.shape)

(15030273, 15)


In [2]:
app_data_nlp.head()

Unnamed: 0,deviceid,applist,app_len
0,000046581b8a28c431be90c278674925,"[app_133, app_1]",2
1,00016381ab699d4e76dc99291e79e7a1,[app_133],1
2,0001c7e6a85a3a4498fe0c5f29f3a379,[app_133],1
3,000207c515d01c00e9144c6866b546a7,"[app_133, app_1]",2
4,000355d66e3fe127c8c2dd1ef60322a3,"[app_84, app_85, app_4, app_5, app_86, app_87,...",86


In [3]:
app_data_nlp.shape

(114584, 3)

In [4]:
user_fav_nlp.head()

Unnamed: 0,deviceid,all_tag_word,all_tag_weight
0,000046581b8a28c431be90c278674925,"[美食, --其他, 美食攻略, 花絮片段, 玩具, 吃秀, 社会热点, 中医, 片段, 大...","[0.4171913341996304, 0.36140167938226964, 0.35..."
1,00016381ab699d4e76dc99291e79e7a1,[未知],[0]
2,0001c7e6a85a3a4498fe0c5f29f3a379,"[社会热点, --其他, 古代, 范冰冰, 台湾, 李治廷, 彦希, 灰姑娘, 清朝, 总裁...","[0.8310844893612963, 0.3135020218516166, 6.367..."
3,000207c515d01c00e9144c6866b546a7,"[海军, 航母, 导弹, 武器, 武器, 导弹, 洲际导弹, 大妈, 海军, 航母, 网游,...","[17.15805189101101, 13.780793638746603, 13.220..."
4,000355d66e3fe127c8c2dd1ef60322a3,"[东北, 大盘, 菜谱]","[37.141856323864594, 35.747926949211916, 4.949..."


In [5]:
user_fav_nlp.shape

(114584, 3)

In [7]:
train.shape

(11376681, 15)

In [8]:
test.shape

(3653592, 13)

In [9]:
all_data.shape

(15030273, 15)

In [10]:
all_data.head()

Unnamed: 0,id,target,timestamp,deviceid,newsid,guid,pos,app_version,device_vendor,netmodel,osversion,lng,lat,device_version,ts
0,1,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,g4,9,112.5385,37.83793,STF-AL00,1573298086436
1,2,0.0,,8b2d7f2aed47ab32e9c6ae4f5ae00147,8008333091915950969,9a2c909ebc47aec49d9c160cdb4a6572,1,2.1.5,HONOR,w,9,111.7312,35.62274,STF-AL00,1573298087570
2,3,0.0,,832aaa33cdf4a0938ba2c795eb3ffefd,4941885624885390992,d51a157d2b1e0e9aed4dd7f9900b85b2,2,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377075934
3,4,0.0,,832aaa33cdf4a0938ba2c795eb3ffefd,6088376349846612406,d51a157d2b1e0e9aed4dd7f9900b85b2,1,1.9.9,vivo,w,8.1.0,5e-324,5e-324,V1818T,1573377044359
4,5,0.0,,67dd9dac18cce1a6d79e8f20eefd98ab,5343094189765291622,625dc45744f59ddbc3ec8df161217188,0,2.1.1,xiaomi,w,9,116.7509,36.56831,Redmi Note 7,1573380989662


In [13]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer,HashingVectorizer
from sklearn.decomposition import TruncatedSVD,SparsePCA
from sklearn.linear_model import LogisticRegression,BayesianRidge,SGDClassifier,PassiveAggressiveClassifier,RidgeClassifier
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC,NuSVC,SVC
from sklearn.metrics import roc_auc_score,accuracy_score
from sklearn.model_selection import KFold,StratifiedKFold,TimeSeriesSplit
from scipy import sparse
import xgboost as xgb
import lightgbm as lgb
import catboost as cbt

def get_sklearn_embedding(now = None,fea = None,num_classes = 2,n_splits=5,ngram=1,prefix=None):
    
    if os.path.exists("../pickle/{}_tfidf_count_emb_all.pickle".format(prefix)):
        return pd.read_pickle("../pickle/{}_tfidf_count_emb_all.pickle".format(prefix))
    else:
        df = now.copy()
        df[fea] = df[fea].map(lambda x:" ".join(x))
        df = df.merge(all_data,how='right',on='deviceid')
        print(df.head())
        print(df.shape)
        '''
        TF-IDF(term frequency-inverse document frequency)词频-逆向文件频率。
        字词的重要性与其在文本中出现的频率成正比(TF)，与其在语料库中出现的频率成反比(IDF)。
        '''
        tfidf = TfidfVectorizer(ngram_range=(1,ngram))
        tf = tfidf.fit_transform(df[fea].fillna("##").values)
        '''
        CountVectorizer会将文本中的词语转换为词频矩阵，
        它通过fit_transform函数计算各个词语出现的次数
        '''
        count = CountVectorizer(ngram_range=(1,ngram))
        cv = count.fit_transform(df[fea].fillna("##").values)
        all_ = sparse.csr_matrix(sparse.hstack([tf, cv]))
        print("TFIDF & COUNT FINISHED...")
        
        
        tr = df['target'].notnull()
        te = df['target'].isnull()
        y = df[tr]['target']
        X_train = all_[df[tr].index]
        X_test = all_[df[te].index]
        print(X_train.shape)
        print(X_test.shape)

        random_seed = 2019
        model_zoo = [SGDClassifier(n_jobs=10,verbose=1),SGDClassifier(loss='log',n_jobs=10,verbose=1),
                     SGDClassifier(loss='modified_huber',n_jobs=10,verbose=1),
                     PassiveAggressiveClassifier(n_jobs=10,verbose=1),LogisticRegression(C=10),
                     RidgeClassifier(solver='lsqr',fit_intercept=False),LinearSVC(verbose=1,max_iter=500),
                     BernoulliNB(),MultinomialNB()]

        columns = ['SGD_HINGE','SGD_LOG','SGD_HUBER','PAC','LR','RIDGE','LSVC','BNB','MNB']

        oof = []
        count = 0

        for model in model_zoo:
            t1 = time.time()
            cv_pred_stack = np.zeros((X_train.shape[0],num_classes))
            test_pred_stack = np.zeros((X_test.shape[0],num_classes))
            skf = KFold(n_splits=n_splits,random_state=random_seed)
            if os.path.exists("../pickle/{}_TFIDF_COUNT_{}.pickle".format(prefix,columns[count])):
                tmp = pd.read_pickle("../pickle/{}_TFIDF_COUNT_{}.pickle".format(prefix,columns[count]))
            else:
                for index, (train_index, test_index) in enumerate(skf.split(X_train, y)):
                    print(index,model)
                    train_x, test_x, train_y, test_y = X_train[train_index], X_train[test_index], y.iloc[train_index], y.iloc[test_index]
                    model.fit(train_x,train_y)
                    try:
                        y_val = model._predict_proba_lr(test_x)
                    except:
                        y_val = model.predict_proba(test_x)
                    cv_pred_stack[test_index] = y_val
                    print(y_val.shape)
                    try:
                        test_pred_stack += model._predict_proba_lr(X_test) / n_splits
                    except:
                        test_pred_stack += model.predict_proba(X_test) / n_splits
                print(model,'score:',accuracy_score(y,np.argmax(cv_pred_stack,axis=1)))
                print(time.time()-t1)
                a = pd.DataFrame(cv_pred_stack).add_prefix(columns[count]+"_")
                a['deviceid'] = df[tr]['deviceid'].values
                b = pd.DataFrame(test_pred_stack).add_prefix(columns[count]+"_")
                b['deviceid'] = df[te]['deviceid'].values
                tmp = a.append(b).sort_values(by=['deviceid']).reset_index(drop=True)
                tmp.to_pickle("../pickle/{}_TFIDF_COUNT_{}.pickle".format(prefix,columns[count]))
                
            count += 1
            oof.append(tmp)
  
        df_agg = pd.DataFrame()
        for i in tqdm(oof):
            df_agg[i.columns] = i
        df_agg = df_agg.sort_values(by=['deviceid'],ascending=True)
        df_agg.to_pickle("../pickle/{}_tfidf_count_emb_all.pickle".format(prefix))
    
    return df_agg



In [14]:
prob_app_data = get_sklearn_embedding(now = app_data_nlp,fea = 'applist',n_splits=5,ngram=1,prefix='app_data')
prob_user_fav = get_sklearn_embedding(now = user_fav_nlp,fea = 'all_tag_word',n_splits=5,ngram=1,prefix='user_fav')

                           deviceid        applist  app_len       id  target  \
0  000046581b8a28c431be90c278674925  app_133 app_1        2   267793     0.0   
1  000046581b8a28c431be90c278674925  app_133 app_1        2   267794     0.0   
2  000046581b8a28c431be90c278674925  app_133 app_1        2  1027164     0.0   
3  000046581b8a28c431be90c278674925  app_133 app_1        2  1027165     1.0   
4  000046581b8a28c431be90c278674925  app_133 app_1        2  1027166     0.0   

      timestamp               newsid                              guid  pos  \
0           NaN  5560193608293752904  8162329d2d2ad3d13ce8535267901b42    4   
1           NaN  7957896460416082441  8162329d2d2ad3d13ce8535267901b42    0   
2           NaN  1511292148904879652  8162329d2d2ad3d13ce8535267901b42    1   
3  1.573392e+12  5612171532368788498  8162329d2d2ad3d13ce8535267901b42    2   
4           NaN   625815464017909362  8162329d2d2ad3d13ce8535267901b42    0   

  app_version device_vendor netmodel osversi

MemoryError: 