In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import random
import json
import gc
import pickle
import gensim
from gensim.models import Word2Vec
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
click_df=pd.read_csv("./raw_data/train_preliminary/click_log.csv")
ad_df=pd.read_csv("./raw_data/train_preliminary/ad.csv")
train_user=pd.read_csv("./raw_data/train_preliminary/user.csv")

In [5]:
click_df.head(10)

Unnamed: 0,time,user_id,creative_id,click_times
0,9,30920,567330,1
1,65,30920,3072255,1
2,56,30920,2361327,1
3,6,309204,325532,1
4,59,309204,2746730,1
5,12,309204,726402,1
6,79,309204,2851451,1
7,32,309204,1569716,1
8,5,309204,71956,1
9,8,309204,322354,1


In [6]:
ad_df.head(10)

Unnamed: 0,creative_id,ad_id,product_id,product_category,advertiser_id,industry
0,1,1,\N,5,381,78
1,4,4,\N,5,108,202
2,7,7,\N,5,148,297
3,8,8,\N,5,713,213
4,9,9,\N,5,695,213
5,10,10,\N,5,100,73
6,12,12,\N,5,765,6
7,13,13,\N,5,113,267
8,16,16,\N,5,623,1
9,20,20,34647,5,312,267


In [7]:
train_user.head(10)

Unnamed: 0,user_id,age,gender
0,1,4,1
1,2,10,1
2,3,7,2
3,4,5,1
4,5,4,1
5,6,6,1
6,7,6,2
7,8,5,1
8,9,5,1
9,10,9,2


In [None]:
def merge_files():
    #合并点击记录
    print("merge click files...")
    click_df=pd.read_csv("./raw_data/train_preliminary/click_log.csv")
    train_click=pd.read_csv("./raw_data/train_semi_final/click_log.csv")
    train_click=train_click[train_click.user_id>2000000]
    click_df=click_df.append(train_click)
    click_df=click_df.sort_values(by=["time"]).drop_duplicates()   
    
    #合并广告信息
    print("merge ad files...")
    ad_df=pd.read_csv("./raw_data/train_preliminary/ad.csv")
    ad_df=ad_df.append(pd.read_csv("./raw_data/train_semi_final/ad.csv"))
    ad_df=ad_df.drop_duplicates() 
    
    #合并用户信息
    print("merge user files...")
    train_user=pd.read_csv("./raw_data/train_preliminary/user.csv")
    train_user=train_user.reset_index(drop=True)
    train_user['age']=train_user['age']-1
    train_user['gender']=train_user['gender']-1
    test_user=pd.read_csv("./raw_data/train_semi_final/user.csv")[-1000000:].drop_duplicates('user_id')[['user_id']].reset_index(drop=True)
    test_user=test_user.sort_values(by='user_id').reset_index(drop=True)
    test_user['age']=-1
    test_user['gender']=-1

    #合并点击，广告，用户信息
    print("merge all files...")
    click_df=click_df.merge(ad_df,on="creative_id",how='left')
    click_df=click_df.merge(train_user,on="user_id",how='left')
    click_df=click_df.fillna(-1)
    click_df=click_df.replace("\\N",-1)
    for f in click_df:
        click_df[f]=click_df[f].astype(int)
    for i in range(10):
        click_df['age_{}'.format(i)]=(click_df['age']==i).astype(np.int16) 
    for i in range(2):
        click_df['gender_{}'.format(i)]=(click_df['gender']==i).astype(np.int16) 
    
    
    return click_df,train_user,test_user


if __name__ == "__main__":
    click_df,train_user,test_user=merge_files() 
    #保存预处理文件
    print("preprocess done! saving data...")
    click_df.to_pickle("./tmp_data/click.pkl")
    train_user.to_pickle("./tmp_data/train_user.pkl")
    test_user.to_pickle("./tmp_data/test_user.pkl")

In [3]:
def get_agg_features(dfs,f1,f2,agg,log):    
    #判定特殊情况
    if type(f1)==str:
        f1=[f1]
    if agg!='size':
        data=log[f1+[f2]]
    else:
        data=log[f1] 
    f_name='_'.join(f1)+"_"+f2+"_"+agg     
    #聚合操作    
    if agg=="size":
        tmp = pd.DataFrame(data.groupby(f1).size()).reset_index()
    elif agg=="count":
        tmp = pd.DataFrame(data.groupby(f1)[f2].count()).reset_index()
    elif agg=="mean":
        tmp = pd.DataFrame(data.groupby(f1)[f2].mean()).reset_index()
    elif agg=="unique":
        tmp = pd.DataFrame(data.groupby(f1)[f2].nunique()).reset_index()
    elif agg=="max":
        tmp = pd.DataFrame(data.groupby(f1)[f2].max()).reset_index()
    elif agg=="min":
        tmp = pd.DataFrame(data.groupby(f1)[f2].min()).reset_index()
    elif agg=="sum":
        tmp = pd.DataFrame(data.groupby(f1)[f2].sum()).reset_index()
    elif agg=="std":
        tmp = pd.DataFrame(data.groupby(f1)[f2].std()).reset_index()
    elif agg=="median":
        tmp = pd.DataFrame(data.groupby(f1)[f2].median()).reset_index()
    else:
        raise "agg error"   
    #赋值聚合特征
    for df in dfs:
        try:
            del df[f_name]
        except:
            pass
        tmp.columns = f1+[f_name]
        df[f_name]=df.merge(tmp, on=f1, how='left')[f_name] 
    del tmp
    del data
    gc.collect()
    return [f_name]


def sequence_text(dfs,f1,f2,log):
    f_name='sequence_text_'+f1+'_'+f2
    print(f_name)
    #遍历log，获得用户的点击序列
    dic,items={},[]
    for item in log[[f1,f2]].values:
        try:
            dic[item[0]].append(str(item[1]))
        except:
            dic[item[0]]=[str(item[1])]      
    for key in dic:
        items.append([key,' '.join(dic[key])])
    #赋值序列特征
    temp=pd.DataFrame(items)
    temp.columns=[f1,f_name]
    temp = temp.drop_duplicates(f1)
    for df in dfs:
        try:
            del df[f_name]
        except:
            pass
        temp.columns = [f1]+[f_name]
        df[f_name]=df.merge(temp, on=f1, how='left')[f_name]
    gc.collect() 
    del temp
    del items
    del dic
    return [f_name]

def kfold(train_df,test_df,log_data,pivot):
    #先对log做kflod统计，统计每条记录中pivot特征的性别年龄分布
    kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
    log=log_data[kfold_features+['user_id',pivot,'fold']]
    tmps=[]
    for fold in range(6):
        tmp = pd.DataFrame(log[(log['fold'] != fold) & (log['fold'] != 5)].groupby(pivot)[kfold_features].mean()).reset_index()
        tmp.columns=[pivot]+kfold_features
        tmp['fold']=fold
        tmps.append(tmp)
    tmp=pd.concat(tmps,axis=0).reset_index()
    tmp=log[['user_id',pivot,'fold']].merge(tmp,on=[pivot,'fold'],how='left')
    del log
    del tmps
    gc.collect() 
    #获得用户点击的所有记录的平均性别年龄分布
    tmp_mean = pd.DataFrame(tmp.groupby('user_id')[kfold_features].mean()).reset_index()
    tmp_mean.columns=['user_id']+[f+'_'+pivot+'_mean' for f in kfold_features]
    for df in [train_df,test_df]:
        temp=df.merge(tmp_mean,on='user_id',how='left')
        temp=temp.fillna(-1)
        for f1 in [f+'_'+pivot+'_mean' for f in kfold_features]:
            df[f1]=temp[f1]
        del temp
        gc.collect()
    del tmp
    del tmp_mean
    gc.collect()



def kfold_sequence(train_df,test_df,log_data,pivot): 
    #先对log做kflod统计，统计每条记录中pivot特征的性别年龄分布
    kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
    log=log_data[kfold_features+[pivot,'fold','user_id']]
    tmps=[]
    for fold in range(6):
        tmp = pd.DataFrame(log[(log['fold'] != fold) & (log['fold'] != 5)].groupby(pivot)[kfold_features].mean()).reset_index()
        tmp.columns=[pivot]+kfold_features
        tmp['fold']=fold
        tmps.append(tmp)
    tmp=pd.concat(tmps,axis=0).reset_index()
    tmp=log[[pivot,'fold','user_id']].merge(tmp,on=[pivot,'fold'],how='left')
    tmp=tmp.fillna(-1)   
    tmp[pivot+'_fold']=tmp[pivot]*10+tmp['fold']   
    del log
    del tmps
    gc.collect() 
    #获得用户点击记录的年龄性别分布序列
    tmp[pivot+'_fold']=tmp[pivot+'_fold'].astype(int)
    kfold_sequence_features=sequence_text([train_df,test_df],'user_id',pivot+'_fold',tmp)
    tmp=tmp.drop_duplicates([pivot+'_fold']).reset_index(drop=True)
    #对每条记录年龄性别分布进行标准化
    kfold_features=['age_{}'.format(i) for i in range(10)]+['gender_{}'.format(i) for i in range(2)]
    ss=StandardScaler()
    ss.fit(tmp[kfold_features])
    tmp[kfold_features]=ss.transform(tmp[kfold_features])
    for f in kfold_features:
        tmp[f]=tmp[f].apply(lambda x:round(x,4))   
#     #将每条记录年龄性别分布转成w2v形式的文件
#     with open('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),'w') as f:
#         f.write(str(len(tmp))+' '+'12'+'\n')
#         for item in tmp[[pivot+'_fold']+kfold_features].values:
#             f.write(' '.join([str(int(item[0]))]+[str(x) for x in item[1:]])+'\n') 
#     tmp=gensim.models.KeyedVectors.load_word2vec_format('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),binary=False)
#     pickle.dump(tmp,open('data/sequence_text_user_id_'+pivot+'_fold'+".{}d".format(12),'wb'))
#     del tmp
#     gc.collect()  
    return kfold_sequence_features

if __name__ == "__main__":
    #读取数据
    click_log=pd.read_pickle('./tmp_data/click.pkl')
    train_df=pd.read_pickle('./tmp_data/train_user.pkl')
    test_df=pd.read_pickle('./tmp_data/test_user.pkl')
    print(click_log.shape,train_df.shape,test_df.shape)
    ################################################################################
    #获取聚合特征
    print("Extracting aggregate feature...")
    agg_features=[]
    agg_features+=get_agg_features([train_df,test_df],'user_id','','size',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','ad_id','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','creative_id','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','advertiser_id','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','industry','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','product_id','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','time','unique',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','sum',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','mean',click_log)
    agg_features+=get_agg_features([train_df,test_df],'user_id','click_times','std',click_log)
    train_df[agg_features]=train_df[agg_features].fillna(-1)
    test_df[agg_features]=test_df[agg_features].fillna(-1)
    print("Extracting aggregate feature done!")
    print("List aggregate feature names:")
    print(agg_features)
    ################################################################################
    #获取序列特征，用户点击的id序列
    print("Extracting sequence feature...")
    text_features=[]
    text_features+=sequence_text([train_df,test_df],'user_id','ad_id',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','creative_id',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','advertiser_id',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','product_id',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','industry',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','product_category',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','time',click_log)
    text_features+=sequence_text([train_df,test_df],'user_id','click_times',click_log)
    print("Extracting sequence feature done!")
    print("List sequence feature names:")   
    print(text_features)
    ################################################################################
    #获取K折统计特征，求出用户点击的所有记录的年龄性别平均分布
    #赋值index,训练集为0-4，测试集为5
    print("Extracting Kflod feature...")
    log=click_log.drop_duplicates(['user_id','creative_id']).reset_index(drop=True)
    del click_log
    gc.collect()
    log['cont']=1
    train_df['fold']=train_df.index%5
    test_df['fold']=5
    df=train_df.append(test_df)[['user_id','fold']].reset_index(drop=True)
    log=log.merge(df,on='user_id',how='left')
    del df
    gc.collect()
    #获取用户点击某特征的年龄性别平均分布
    for pivot in ['creative_id','ad_id','product_id','advertiser_id','industry']:
        print("Kfold",pivot)
        kfold(train_df,test_df,log,pivot)
    del log
    gc.collect()       
    print("Extracting Kflod feature done!")
    ################################################################################
    #获取K折序列特征,求出用户点击的每一条记录的年龄性别分布
    #赋值index,训练集为0-4，测试集为5
    print("Extracting Kflod sequence feature...")
    click_log=pd.read_pickle('./tmp_data/click.pkl')
    log=click_log.reset_index(drop=True)
    del click_log
    gc.collect()
    log['cont']=1
    train_df['fold']=train_df.index%5
    train_df['fold']=train_df['fold'].astype(int)
    test_df['fold']=5
    df=train_df.append(test_df)[['user_id','fold']].reset_index(drop=True)
    log=log.merge(df,on='user_id',how='left')
#     #获取用户点击某特征的年龄性别分布序列
#     kfold_sequence_features=[] 
#     for pivot in ['creative_id','ad_id','product_id','advertiser_id','industry']:
#         print("Kfold sequence",pivot)
#         kfold_sequence_features+=kfold_sequence(train_df,test_df,log,pivot) 
    del log
#     gc.collect()        
#     print("Extracting Kfold sequence feature done!")
#     print("List Kfold sequence feature names:")   
#     print(kfold_sequence_features)  
    ################################################################################
    print("Extract features done! saving data...")
    train_df.to_pickle('./tmp_data/train_df.pkl')
    test_df.to_pickle('./tmp_data/test_df.pkl')

Extracting Kflod sequence feature...
Extract features done! saving data...


In [3]:
train_df=pd.read_pickle('./tmp_data/train_df.pkl')
test_df=pd.read_pickle('./tmp_data/test_df.pkl')

In [4]:
behavior_list=train_df.sequence_text_user_id_creative_id.tolist()+test_df.sequence_text_user_id_creative_id.tolist()
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
cntv = CountVectorizer(min_df=30)
cntv_user = cntv.fit_transform(behavior_list)

tfv = TfidfVectorizer(min_df=30,max_features=100000)
tfv_user = tfv.fit_transform(behavior_list)

In [5]:
behavior_list_ad=train_df.sequence_text_user_id_advertiser_id.tolist()+test_df.sequence_text_user_id_advertiser_id.tolist()
cntv = CountVectorizer()
cntv_user_ad = cntv.fit_transform(behavior_list_ad)

tfv = TfidfVectorizer()
tfv_user_ad = tfv.fit_transform(behavior_list_ad)

In [6]:
features=['user_id__size', 'user_id_ad_id_unique',
       'user_id_creative_id_unique', 'user_id_advertiser_id_unique',
       'user_id_industry_unique', 'user_id_product_id_unique',
       'user_id_time_unique', 'user_id_click_times_sum',
       'user_id_click_times_mean', 'user_id_click_times_std',
       'age_0_creative_id_mean',
       'age_1_creative_id_mean', 'age_2_creative_id_mean',
       'age_3_creative_id_mean', 'age_4_creative_id_mean',
       'age_5_creative_id_mean', 'age_6_creative_id_mean',
       'age_7_creative_id_mean', 'age_8_creative_id_mean',
       'age_9_creative_id_mean', 'gender_0_creative_id_mean',
       'gender_1_creative_id_mean', 'age_0_ad_id_mean', 'age_1_ad_id_mean',
       'age_2_ad_id_mean', 'age_3_ad_id_mean', 'age_4_ad_id_mean',
       'age_5_ad_id_mean', 'age_6_ad_id_mean', 'age_7_ad_id_mean',
       'age_8_ad_id_mean', 'age_9_ad_id_mean', 'gender_0_ad_id_mean',
       'gender_1_ad_id_mean', 'age_0_product_id_mean', 'age_1_product_id_mean',
       'age_2_product_id_mean', 'age_3_product_id_mean',
       'age_4_product_id_mean', 'age_5_product_id_mean',
       'age_6_product_id_mean', 'age_7_product_id_mean',
       'age_8_product_id_mean', 'age_9_product_id_mean',
       'gender_0_product_id_mean', 'gender_1_product_id_mean',
       'age_0_advertiser_id_mean', 'age_1_advertiser_id_mean',
       'age_2_advertiser_id_mean', 'age_3_advertiser_id_mean',
       'age_4_advertiser_id_mean', 'age_5_advertiser_id_mean',
       'age_6_advertiser_id_mean', 'age_7_advertiser_id_mean',
       'age_8_advertiser_id_mean', 'age_9_advertiser_id_mean',
       'gender_0_advertiser_id_mean', 'gender_1_advertiser_id_mean',
       'age_0_industry_mean', 'age_1_industry_mean', 'age_2_industry_mean',
       'age_3_industry_mean', 'age_4_industry_mean', 'age_5_industry_mean',
       'age_6_industry_mean', 'age_7_industry_mean', 'age_8_industry_mean',
       'age_9_industry_mean', 'gender_0_industry_mean',
       'gender_1_industry_mean']

In [7]:
from scipy import sparse
train_csr = sparse.csr_matrix(train_df[features])
test_csr = sparse.csr_matrix(test_df[features])
# CountVectorizer结果合并
train_csr = sparse.hstack((train_csr, cntv_user[:900000])).tocsr()
test_csr  = sparse.hstack((test_csr , cntv_user[900000:])).tocsr()

train_csr = sparse.hstack((train_csr, tfv_user[:900000])).tocsr()
test_csr  = sparse.hstack((test_csr , tfv_user[900000:])).tocsr()

train_csr = sparse.hstack((train_csr, cntv_user_ad[:900000])).tocsr()
test_csr  = sparse.hstack((test_csr , cntv_user_ad[900000:])).tocsr()

train_csr = sparse.hstack((train_csr, tfv_user_ad[:900000])).tocsr()
test_csr  = sparse.hstack((test_csr , tfv_user_ad[900000:])).tocsr()

In [8]:
testuser=pd.read_csv("./raw_data/train_semi_final/user.csv")[-1000000:]
testuser['age']=testuser['age']-1
testuser['gender']=testuser['gender']-1

In [None]:
tar='age'
params = {
    'num_leaves':2**8,
    'n_estimators':10000,
    'early_stopping_rounds':50,
#     'max_bin':2**10,
    'lambda_l1':1,
    'boosting_type': 'gbdt',
    'objective': 'multiclass',    
    'num_class':10,
    'metric':'multi_error',
    'num_threads':20
}

train_set = lgb.Dataset(train_csr,train_df[tar])
val_set=lgb.Dataset(test_csr, testuser[tar])

model1 = lgb.train(params,
                  train_set,
                  valid_sets=[val_set,train_set],
                  valid_names=['eval','train'],
                  verbose_eval=10)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2743259
[LightGBM] [Info] Number of data points in the train set: 900000, number of used features: 325608
[LightGBM] [Info] Start training from score -3.241491
[LightGBM] [Info] Start training from score -1.796631
[LightGBM] [Info] Start training from score -1.489637
[LightGBM] [Info] Start training from score -1.787914
[LightGBM] [Info] Start training from score -1.929743
[LightGBM] [Info] Start training from score -2.180171
[LightGBM] [Info] Start training from score -2.602025
[LightGBM] [Info] Start training from score -3.337691
[LightGBM] [Info] Start training from score -3.833315
[LightGBM] [Info] Start training from score -4.359352
Training until validation scores don't improve for 50 rounds
[10]	train's multi_error: 0.564277	eval's multi_error: 0.596383
[20]	train's multi_error: 0.537363	eval's multi_error: 0.584892
[30]	train's m

In [69]:
model1.best_score

defaultdict(dict,
            {'train': {'multi_error': 0.27578},
             'eval': {'multi_error': 0.548374}})

In [None]:
tar='gender'

params = {
    'num_leaves':2**7-1,
    'n_estimators':10000,
    'early_stopping_rounds':50,
    'lambda_l1':1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric':'binary_error',
    'num_threads':20
}

train_set = lgb.Dataset(train_csr,train_df[tar])
val_set=lgb.Dataset(test_csr, testuser[tar])

model2 = lgb.train(params,
                  train_set,
                  valid_sets=[val_set,train_set],
                  valid_names=['eval','train'],
                  verbose_eval=10)

In [17]:
model2.best_score

defaultdict(dict,
            {'train': {'binary_error': 0.04469888888888889},
             'eval': {'binary_error': 0.067795}})

In [11]:
train_csr

<900000x437652 sparse matrix of type '<class 'numpy.float64'>'
	with 145440000 stored elements in Compressed Sparse Row format>

In [12]:
test_csr

<1000000x437652 sparse matrix of type '<class 'numpy.float64'>'
	with 161564576 stored elements in Compressed Sparse Row format>

In [25]:
importance = pd.DataFrame([model1.feature_name(),model1.feature_importance()])
importance2 = pd.DataFrame([model2.feature_name(),model2.feature_importance()])
importance.T.sort_values(1)[-50:]

In [77]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

genclf=[
    LogisticRegression(penalty='l1'),
    SGDClassifier(),    
    BernoulliNB()
]

ageclf=[
    LogisticRegression(penalty='l1'),
    SGDClassifier(),
    MultinomialNB()
]
acc={}

for aclf,clf in zip(ageclf,genclf):
    aclf.fit(train_csr,train_df['age'])
    test_result_age=pd.DataFrame(aclf.predict_proba(test_csr),
                                 columns=[aclf.__class__.__name__+'_age_{}'.format(i) for i in range(10)])
    test_df=pd.concat([test_df,test_result_age],axis=1)
    train_result_age=pd.DataFrame(aclf.predict_proba(train_csr),
                                  columns=[aclf.__class__.__name__+'_age_{}'.format(i) for i in range(10)])
    train_df=pd.concat([train_df,train_result_age],axis=1)
    clf.fit(train_csr,train_df['gender'])
    test_result_gen=pd.DataFrame(clf.predict_proba(test_csr),
                                 columns=[clf.__class__.__name__+'_gender_{}'.format(i) for i in range(2)])
    test_df=pd.concat([test_df,test_result_gen],axis=1)
    train_result_gen=pd.DataFrame(clf.predict_proba(train_csr),
                                  columns=[clf.__class__.__name__+'_gender_{}'.format(i) for i in range(2)])
    train_df=pd.concat([train_df,train_result_gen],axis=1)
    test_result_agenp=aclf.predict(test_csr)
    test_result_gennp=clf.predict(test_csr)
    acc[clf.__class__.__name__]=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]

In [41]:
acc={}
from sklearn.metrics import accuracy_score
prob=test_df.iloc[:,-12:].values+test_df.iloc[:,-24:-12].values+test_df.iloc[:,-36:-24].values
test_result_gennp=pd.DataFrame(prob[:,-2:]).idxmax(1)
test_result_agenp=pd.DataFrame(prob[:,:10]).idxmax(1)
acc['sum']=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]

prob=test_df.iloc[:,-12:].values
test_result_gennp=pd.DataFrame(prob[:,-2:]).idxmax(1)
test_result_agenp=pd.DataFrame(prob[:,:10]).idxmax(1)
acc['NB']=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]
prob=test_df.iloc[:,-24:-12].values
test_result_gennp=pd.DataFrame(prob[:,-2:]).idxmax(1)
test_result_agenp=pd.DataFrame(prob[:,:10]).idxmax(1)
acc['SGD']=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]
prob=test_df.iloc[:,-36:-24].values
test_result_gennp=pd.DataFrame(prob[:,-2:]).idxmax(1)
test_result_agenp=pd.DataFrame(prob[:,:10]).idxmax(1)
acc['LR_l1']=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]

In [48]:
acc

{'sum': [0.919099, 0.399813],
 'NB': [0.90695, 0.377696],
 'SGD': [0.910405, 0.328027],
 'LR_l1': [0.917684, 0.388937]}

In [48]:
# train_df.to_pickle('./tmp_data/train_df_tf.pkl')
# test_df.to_pickle('./tmp_data/test_df_tf.pkl')

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


genclf=[
    LogisticRegression(penalty='l1'),
    SGDClassifier(),    
    BernoulliNB(),
    LinearSVC()
]

ageclf=[
    LogisticRegression(penalty='l1'),
    SGDClassifier(),
    MultinomialNB(),
    LinearSVC()
]
acc={}

for aclf,clf in zip(ageclf,genclf):
    aclf.fit(train_csr,train_df['age'])
    clf.fit(train_csr,train_df['gender'])
    test_result_agenp=aclf.predict(test_csr)
    test_result_gennp=clf.predict(test_csr)
    acc[clf.__class__.__name__]=[accuracy_score(testuser['gender'],test_result_gennp),accuracy_score(testuser['age'],test_result_agenp)]



{'LogisticRegression': [0.918158, 0.400817]}
{'LogisticRegression': [0.918158, 0.400817], 'SGDClassifier': [0.917283, 0.381357]}
{'LogisticRegression': [0.918158, 0.400817], 'SGDClassifier': [0.917283, 0.381357], 'BernoulliNB': [0.897853, 0.381901]}


In [10]:
acc

{'LogisticRegression': [0.918158, 0.400817],
 'SGDClassifier': [0.917283, 0.381357],
 'BernoulliNB': [0.897853, 0.381901],
 'LinearSVC': [0.893187, 0.342516]}