# 华为算法精英赛

In [17]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

读取数据（`user_app_usage.csv` 除外）

In [None]:
# Column names are supplied explicitly through `names=` for every raw table.
BASIC_INFO_COLUMNS = [
    'uid', 'gender', 'city', 'prodName', 'ramCapacity', 'ramLeftRation',
    'romCapacity', 'romLeftRation', 'color', 'fontSize', 'ct', 'carrier', 'os',
]
BEHAVIOR_INFO_COLUMNS = [
    'uid', 'bootTimes', 'AFuncTimes', 'BFuncTimes', 'CFuncTimes',
    'DFuncTimes', 'EFuncTimes', 'FFuncTimes', 'FFuncSum',
]

# Labelled / unlabelled user ids.
age_train = pd.read_csv("age_train.csv", names=['uid', 'age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])

# Per-user device profile and usage counters.
user_basic_info = pd.read_csv("user_basic_info.csv", names=BASIC_INFO_COLUMNS)
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=BEHAVIOR_INFO_COLUMNS)

# Activated apps per user and the app -> category lookup.
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid', 'appId'])
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])

In [None]:
# Global lookup tables filled by mergeBasicTables:
#   class2id['<col>2id'] maps a category string to its integer id,
#   id2class['id2<col>'] is the inverse mapping.
class2id = {}
id2class = {}

def mergeBasicTables(baseTable):
    """Left-join the profile and behaviour tables onto ``baseTable`` and
    integer-encode the categorical profile columns.

    The id vocabulary of every categorical column is built from the complete
    ``user_basic_info`` table rather than from the rows of ``baseTable``, so
    the train and test tables share one encoding and a later call does not
    overwrite ``class2id`` / ``id2class`` with a different mapping.

    Parameters
    ----------
    baseTable : pandas.DataFrame
        Table with a ``uid`` column (e.g. ``age_train`` or ``age_test``).

    Returns
    -------
    pandas.DataFrame
        ``baseTable`` joined with ``user_basic_info`` and
        ``user_behavior_info``; the columns listed in ``cat_columns`` hold
        integer ids starting at 1. Values that do not occur in
        ``user_basic_info`` (for example the ``'nan'`` produced for a uid
        without a profile row) are encoded as 0.
    """
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(user_behavior_info, how='left', on='uid', suffixes=('_base1', '_ubef'))

    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        # Stringify first so numbers and NaN sort together with real strings.
        vocabulary = sorted(set(user_basic_info[c].apply(str)))
        to_id = dict(zip(vocabulary, range(1, len(vocabulary) + 1)))
        class2id[c+'2id'] = to_id
        id2class['id2'+c] = {idx: name for name, idx in to_id.items()}

        # 0 is reserved for values outside the shared vocabulary.
        resTable[c] = resTable[c].apply(str).apply(lambda value: to_id.get(value, 0))

    return resTable

In [None]:
# Build the merged, integer-encoded feature table for the labelled (age_train)
# and the unlabelled (age_test) users.
train_basic_info = mergeBasicTables(age_train)
test_basic_info = mergeBasicTables(age_test)

In [None]:
train_basic_info.head(2)

`user_basic_info.csv` 中的 `ramCapacity`、`ramLeftRation`、`romCapacity`、`romLeftRation` 和 `fontSize` 列都包含 `NaN` 值，这里用 `user_basic_info` 全表中对应列的平均值进行填充。

In [None]:
# Impute missing numeric profile values with the column mean of the full
# `user_basic_info` table. Each column gets ONE fill value that is shared by
# the train and test tables; previously the train `ramLeftRation` mean was
# rounded to an integer while the test one was rounded to two decimals.
#
# Value = number of decimals kept for the mean (None -> round to an integer).
fill_precision = {
    'ramCapacity': None,
    'ramLeftRation': 2,
    'romCapacity': None,
    'romLeftRation': 2,
    'fontSize': 2,
}

for column, ndigits in fill_precision.items():
    fill_value = round(user_basic_info[column].mean(), ndigits)
    train_basic_info[column] = train_basic_info[column].fillna(fill_value)
    test_basic_info[column] = test_basic_info[column].fillna(fill_value)

In [92]:
# index=False: the row index is not a feature, and writing it makes the file
# come back with an extra "Unnamed: 0" column when it is read again.
train_basic_info.to_csv('train_basic_info.csv', index=False)
test_basic_info.to_csv('test_basic_info.csv', index=False)

In [18]:
# Reload the cached basic-info feature tables.
# NOTE(review): these are read from ./data/, while the cell that writes them
# saves into the working directory — confirm the files are moved in between.
train_basic_info = pd.read_csv('./data/train_basic_info.csv')
test_basic_info = pd.read_csv('./data/test_basic_info.csv')

---

In [9]:
# Attach each user's activated-app string; only `uid` is needed from the
# basic-info tables, so select it before joining.
train_app_info = train_basic_info[['uid']].merge(user_app_actived, on='uid', how='left')
test_app_info = test_basic_info[['uid']].merge(user_app_actived, on='uid', how='left')

In [10]:
# appId -> category lookup. An appId that is listed under several categories
# gets them concatenated with '#', in file order.
app_info_dict = {}
# Walk the two columns directly; `.iloc[i]` would materialise a Series object
# for every row, which is needlessly slow on a large table.
for app_id, category in zip(app_info['appId'], app_info['category']):
    if app_id in app_info_dict:
        app_info_dict[app_id] = app_info_dict[app_id] + '#' + category
    else:
        app_info_dict[app_id] = category

In [11]:
def getCategoryStr(appIdStr):
    """Translate a '#'-joined appId string into a '#'-joined category string.

    Each appId is looked up in the global ``app_info_dict``; appIds without an
    entry are skipped.

    Parameters
    ----------
    appIdStr : str
        App ids separated by '#'. Any non-string value (e.g. the NaN that a
        left merge or a CSV round trip leaves for users without apps) is
        treated as "no apps".

    Returns
    -------
    str
        The categories of the known apps joined with '#', or '' when there is
        nothing to translate.
    """
    if not isinstance(appIdStr, str):
        return ''
    return '#'.join(
        app_info_dict[app_id]
        for app_id in appIdStr.split('#')
        if app_id in app_info_dict
    )

# Per user: the '#'-joined categories of the apps listed in `appId`.
train_app_info['categoryId'] = train_app_info.appId.apply(getCategoryStr)
test_app_info['categoryId'] = test_app_info.appId.apply(getCategoryStr)

In [12]:
train_app_info.head(2)

Unnamed: 0,uid,appId
0,1000001,a00140327#a00170298#a00184278#a00187480#a00239...
1,1000011,a00158535#a00163116#a00170432#a00187480#a00224...


In [14]:
# NOTE(review): repeats the `categoryId` assignment already made in the cell
# that defines getCategoryStr; re-running is harmless but redundant.
train_app_info['categoryId'] = train_app_info.appId.apply(getCategoryStr)
test_app_info['categoryId'] = test_app_info.appId.apply(getCategoryStr)

In [15]:
# Distinct category strings, numbered from 0 in sorted order, plus the inverse map.
cats = list(set(app_info['category']))
ordered_cats = sorted(cats)
category2id = {name: idx for idx, name in enumerate(ordered_cats)}
id2category = {idx: name for idx, name in enumerate(ordered_cats)}

In [65]:
def getTopNCategory(categoryStr, n, pad_id=40):
    """Return the ids of the ``n`` most frequent categories in ``categoryStr``.

    Parameters
    ----------
    categoryStr : str
        Category names separated by '#'. An empty string, or any non-string
        value such as NaN, means "no categories".
    n : int
        Number of ranks to return.
    pad_id : int, optional
        Id emitted for ranks that have no category. Defaults to 40, the value
        that used to be hard-coded here (presumably one past the largest id in
        ``category2id`` — TODO confirm).

    Returns
    -------
    str
        ``n`` ids joined with '#', most frequent first, padded with ``pad_id``.

    Raises
    ------
    KeyError
        If a category name is not a key of the global ``category2id``.
    """
    parts = categoryStr.split('#') if isinstance(categoryStr, str) else ['']
    if parts[0] == '':
        return '#'.join([str(pad_id)] * n)

    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # the result is ordered by descending frequency.
    tops = pd.Series(parts).value_counts()
    ret = [str(category2id[name]) for name in tops.index[:n]]
    ret.extend([str(pad_id)] * (n - len(ret)))
    return '#'.join(ret)

In [87]:
# Per user: ids of the three most frequent app categories, joined with '#'.
train_app_info['top3Category'] = train_app_info.categoryId.apply(lambda x: getTopNCategory(x, 3))
test_app_info['top3Category'] = test_app_info.categoryId.apply(lambda x: getTopNCategory(x, 3))

In [103]:
# NOTE(review): this cell defines getTopNCategory a second time; the later
# definition is the one in effect for the statements below it.
def getTopNCategory(categoryStr, n, pad_id=40):
    """Return the ids of the ``n`` most frequent categories in ``categoryStr``.

    Parameters
    ----------
    categoryStr : str
        Category names separated by '#'. An empty string, or any non-string
        value such as NaN, means "no categories".
    n : int
        Number of ranks to return.
    pad_id : int, optional
        Id emitted for ranks that have no category. Defaults to 40, the value
        that used to be hard-coded here (presumably one past the largest id in
        ``category2id`` — TODO confirm).

    Returns
    -------
    str
        ``n`` ids joined with '#', most frequent first, padded with ``pad_id``.

    Raises
    ------
    KeyError
        If a category name is not a key of the global ``category2id``.
    """
    parts = categoryStr.split('#') if isinstance(categoryStr, str) else ['']
    if parts[0] == '':
        return '#'.join([str(pad_id)] * n)

    # Series.value_counts replaces the deprecated top-level pd.value_counts;
    # the result is ordered by descending frequency.
    tops = pd.Series(parts).value_counts()
    ret = [str(category2id[name]) for name in tops.index[:n]]
    ret.extend([str(pad_id)] * (n - len(ret)))
    return '#'.join(ret)

train_app_info['top3Category'] = train_app_info.categoryId.apply(lambda cat_str: getTopNCategory(cat_str, 3))
test_app_info['top3Category'] = test_app_info.categoryId.apply(lambda cat_str: getTopNCategory(cat_str, 3))

# Break the '#'-joined top-3 string into one integer column per rank.
for rank, column in enumerate(['1stCategory', '2ndCategory', '3rdCategory']):
    train_app_info[column] = train_app_info['top3Category'].apply(
        lambda joined, rank=rank: int(joined.split('#')[rank]))

In [104]:
# Same split as for the training table: one integer column per top-3 rank.
for rank, column in enumerate(['1stCategory', '2ndCategory', '3rdCategory']):
    test_app_info[column] = test_app_info['top3Category'].apply(
        lambda joined, rank=rank: int(joined.split('#')[rank]))

In [108]:
# index=False: the row index is not a feature, and writing it makes the file
# come back with an extra "Unnamed: 0" column when it is read again.
train_app_info.to_csv('train_app_info.csv', index=False)
test_app_info.to_csv('test_app_info.csv', index=False)

In [19]:
# Reload the cached app feature tables.
# NOTE(review): these are read from ./data/, while the cell that writes them
# saves into the working directory — confirm the files are moved in between.
train_app_info = pd.read_csv('./data/train_app_info.csv')
test_app_info = pd.read_csv('./data/test_app_info.csv')

In [249]:
from sklearn.feature_extraction.text import CountVectorizer

def _split_on_hash(text):
    """Tokenize a '#'-joined string into its individual ids."""
    return text.split('#')

# Bag-of-apps counts: vocabulary is learned on the training users only and
# then applied unchanged to the test users.
countVzer = CountVectorizer(
    tokenizer=_split_on_hash,
    min_df=1,
    max_df=0.7,
    max_features=3000,
)
train_app_counts = countVzer.fit_transform(train_app_info['appId'])
test_app_counts = countVzer.transform(test_app_info['appId'])

from sklearn.feature_extraction.text import TfidfTransformer

# TF-IDF re-weighting of the app counts; the IDF weights are fitted on the
# training matrix and reused for the test matrix.
tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_app_tfidfs = tfidfTfmer.fit_transform(train_app_counts)
test_app_tfidfs = tfidfTfmer.transform(test_app_counts)


In [250]:
from sklearn.feature_extraction.text import TfidfTransformer

# NOTE(review): identical to the TF-IDF statements at the end of the previous
# cell; re-running is harmless but redundant.
tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_app_tfidfs = tfidfTfmer.fit_transform(train_app_counts)
test_app_tfidfs = tfidfTfmer.transform(test_app_counts)

In [119]:
# Replace missing categoryId values with '' so that the '#'-splitting tokenizer
# used on this column can call .split on every value.
# NOTE(review): presumably the NaNs are the empty strings that went through the
# CSV round trip — confirm.
train_app_info['categoryId'] = train_app_info['categoryId'].fillna('')

In [120]:
# Replace missing categoryId values with '' so that the '#'-splitting tokenizer
# used on this column can call .split on every value.
test_app_info['categoryId'] = test_app_info['categoryId'].fillna('')

In [224]:
# Bag-of-categories counts, fitted on the training users only.
# min_df=1 keeps every term exactly like the former min_df=0 did (a vocabulary
# term always occurs in at least one document), but is a value that
# scikit-learn's parameter validation accepts: an integer min_df must be >= 1.
# NOTE(review): this rebinds `countVzer`, which previously held the app-id
# vectorizer; cells that need the app vocabulary must run before this one.
countVzer = CountVectorizer(min_df=1, max_df=1.0, tokenizer=lambda x: x.split('#'))
train_category_counts = countVzer.fit_transform(train_app_info['categoryId'])
test_category_counts = countVzer.transform(test_app_info['categoryId'])

In [227]:
# Use a dedicated transformer (same hyper-parameters as `tfidfTfmer`) for the
# category counts. Refitting `tfidfTfmer` here would overwrite the IDF weights
# it learned from the app counts and leave it unusable for app matrices.
category_tfidfTfmer = TfidfTransformer(smooth_idf=True, sublinear_tf=True, use_idf=True)
train_category_tfidfs = category_tfidfTfmer.fit_transform(train_category_counts)
test_category_tfidfs = category_tfidfTfmer.transform(test_category_counts)

In [127]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix

0.1

In [110]:
from sklearn.metrics import accuracy_score
from sklearn import svm

# Linear SVM on the raw app-count features, trained on every labelled user and
# applied to every test user.
train_labels = train_basic_info['age_group']
SVM = svm.LinearSVC(C=0.01).fit(train_app_counts, train_labels)
SVM_preds = SVM.predict(test_app_counts)

In [111]:
# Write the SVM predictions as an (id, label) submission file.
submission_columns = {
    'id': test_basic_info['uid'],
    'label': SVM_preds,
}
df = pd.DataFrame(submission_columns)
df.to_csv('submission.csv', index=False)

In [172]:
from sklearn.metrics import accuracy_score
from sklearn import svm

# Hold-out check: fit on the first 50,000 users, score on the last 5,000,
# using app counts and category counts side by side.
fit_features = sp.hstack((train_app_counts[:50000], train_category_counts[:50000]), format='csr')
holdout_features = sp.hstack((train_app_counts[-5000:], train_category_counts[-5000:]), format='csr')
age_labels = train_basic_info['age_group']

SVM = svm.LinearSVC(C=0.007)
SVM.fit(fit_features, age_labels[:50000])
SVM_preds = SVM.predict(holdout_features)
accuracy_score(age_labels[-5000:], SVM_preds)



0.56559999999999999

In [257]:
# Hold-out check on the TF-IDF app features: fit on the first 50,000 users,
# score on the last 5,000.
age_labels = train_basic_info['age_group']

SVM = svm.LinearSVC(C=0.08)
SVM.fit(train_app_tfidfs[:50000], age_labels[:50000])
SVM_preds = SVM.predict(train_app_tfidfs[-5000:])
accuracy_score(age_labels[-5000:], SVM_preds)

0.55940000000000001

In [255]:
from sklearn.metrics import accuracy_score
from sklearn import svm

# Hold-out check on the raw app counts: fit on the first 50,000 users, score on
# the last 5,000.
age_labels = train_basic_info['age_group']

SVM = svm.LinearSVC(C=0.008)
SVM.fit(train_app_counts[:50000], age_labels[:50000])
SVM_preds = SVM.predict(train_app_counts[-5000:])
accuracy_score(age_labels[-5000:], SVM_preds)

0.56359999999999999

In [301]:
sp.hstack((csr_matrix(train_basic_info[['age_group']])[:10000], train_app_tfidfs[:10000]), format='csr')

<10000x3001 sparse matrix of type '<class 'numpy.float64'>'
	with 353329 stored elements in Compressed Sparse Row format>

In [326]:
# Single-column sparse matrix holding `age_group`.
# NOTE(review): `age_group` is the prediction target; stacking this matrix into
# a feature matrix leaks the label into the model.
age_info = csr_matrix((train_basic_info[['age_group']]))

In [327]:
# NOTE(review): target leakage — `age_info` is built from
# train_basic_info[['age_group']], so the label itself is the first feature
# column of both the fit and the hold-out matrix. The accuracy this cell
# returns therefore does not measure what the app features can predict.
SVM = svm.LinearSVC(C=0.03)
SVM.fit(sp.hstack((age_info[:10000], train_app_tfidfs[:10000]), format='csr'), train_basic_info['age_group'][:10000])
SVM_preds = SVM.predict(sp.hstack((age_info[-5000:], train_app_tfidfs[-5000:]), format='csr'))
accuracy_score(SVM_preds, train_basic_info['age_group'][-5000:])

0.80500000000000005

In [277]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

In [286]:
train_basic_info[['city']].values

array([[ 51],
       [ 30],
       [228],
       ..., 
       [ 84],
       [ 30],
       [143]])

In [287]:
scaler.fit_transform(train_basic_info[['city']].values)



array([[ 0.13850416],
       [ 0.08033241],
       [ 0.62880886],
       ..., 
       [ 0.2299169 ],
       [ 0.08033241],
       [ 0.3933518 ]])

---

In [21]:
# Training labels.
# NOTE(review): later cells slice this in step with the rows of
# train_app_counts — assumes both keep the row order of train_basic_info; confirm.
y_train = train_basic_info['age_group']

In [11]:
import torch.nn as nn
import torch
from torch.autograd import Variable
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

class FM(nn.Module):
    """Second-order factorization machine producing one logit per class.

    For an input row ``x`` the model computes

        logits = W x + b + 0.5 * sum_f [ (x V)_f^2 - (x^2 V^2)_f ]

    where the last term equals ``sum_{i<j} <V_i, V_j> x_i x_j``.

    Parameters
    ----------
    n : int
        Number of input features.
    k : int
        Size of the latent factor vector kept for every feature.
    n_classes : int, optional
        Number of output logits. Defaults to 6, the value that used to be
        hard-coded in the linear layer.
    """
    def __init__(self, n, k, n_classes=6):
        super(FM, self).__init__()
        self.n = n
        self.k = k
        self.linear = nn.Linear(self.n, n_classes)   # bias + first-order (linear) terms
        self.V = nn.Parameter(torch.randn(self.n, self.k))   # latent factors for the pairwise interactions

    def fm_layer(self, x):
        """Return the FM logits for a dense ``(batch, n)`` float tensor ``x``.

        The result has shape ``(batch, n_classes)``.
        """
        linear_part = self.linear(x)
        sum_then_square = torch.pow(torch.mm(x, self.V), 2)                     # (batch, k)
        square_then_sum = torch.mm(torch.pow(x, 2), torch.pow(self.V, 2))       # (batch, k)
        # Reduce over the factor dimension only, so every sample keeps its own
        # interaction value; summing over the whole batch would collapse it
        # into a single scalar shared by all rows.
        interaction = 0.5 * torch.sum(sum_then_square - square_then_sum, dim=1, keepdim=True)
        # NOTE(review): the interaction is one scalar per sample added to every
        # class logit, so it shifts all logits of a row equally and does not
        # change a softmax over them; class-specific factors would be needed
        # for it to influence the predicted class.
        return linear_part + interaction

    def forward(self, x):
        return self.fm_layer(x)

In [12]:
def save_model(model, save_path):
    """Write the model's ``state_dict`` (not the module object) to ``save_path``."""
    weights = model.state_dict()
    torch.save(weights, save_path)

In [13]:
train_app_counts

<2010000x9395 sparse matrix of type '<class 'numpy.int64'>'
	with 73110632 stored elements in Compressed Sparse Row format>

In [16]:
%%time
num_epochs = 5
batch_size = 256
lr = 5e-4
weight_decay = 1e-5
log_interval = 10
k = 100

# Prefer the second GPU as before, but fall back instead of failing when it is
# not present.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Best hold-out accuracy seen so far; decides when a checkpoint is written.
test_acc = 0

clf_FM = FM(train_app_counts.shape[1], k)
clf_FM.to(device)
optimizer = torch.optim.Adam(clf_FM.parameters(), lr=lr, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()
kf = KFold(n_splits=3, shuffle=True, random_state=0)

# Per-epoch history, stored as plain Python floats.
fm_train_losses = []
fm_train_accs = []
fm_test_losses = []
fm_test_accs = []

# Training pool: the first 100,000 users. Labels are shifted down by one to
# obtain the 0-based class indices CrossEntropyLoss expects.
cv_counts = train_app_counts[:100000]
cv_targets = y_train[:100000].values - 1

# NOTE(review): `val_index` is never used and the model/optimizer are not
# re-created per fold, so the folds only change which rows are trained on;
# evaluation always uses the last 10,000 users.
for f, (train_index, val_index) in enumerate(kf.split(cv_counts)):
    print('Fold[{}]'.format(f+1))
    # Select the fold's rows once instead of re-slicing the pool for every batch.
    fold_counts = cv_counts[train_index]
    fold_targets = cv_targets[train_index]
    n_batches = len(train_index) // batch_size   # a trailing partial batch is dropped

    for epoch in range(num_epochs):
        clf_FM.train()
        fm_loss = 0
        fm_acc = 0
        for i in range(n_batches):
            start = i * batch_size
            batch_train_counts = torch.FloatTensor(
                fold_counts[start:start+batch_size].toarray()).to(device)
            batch_train_targets = torch.LongTensor(
                fold_targets[start:start+batch_size]).to(device)
            output = clf_FM(batch_train_counts)

            loss = loss_fn(output, batch_train_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_train_predicts = torch.max(output, dim=1)[1]
            acc = accuracy_score(batch_train_targets.cpu(), batch_train_predicts.cpu())

            # .item() detaches the value; accumulating the tensor itself would
            # keep every batch's autograd graph alive for the whole epoch.
            loss_value = loss.item()
            fm_loss += loss_value
            fm_acc += acc
            if i % log_interval == 0:
                print('Batch[{}] Train loss: {:.4f}\tTrain acc: {:.4f}'.format(i, loss_value, acc))

        fm_loss = fm_loss/n_batches
        fm_acc = fm_acc/n_batches
        fm_train_losses.append(fm_loss)
        fm_train_accs.append(fm_acc)
        print('Epoch[{}/{}] Train loss: {:.4f}\tTrain acc: {:.4f}'.format(epoch+1, num_epochs, fm_loss, fm_acc))

        clf_FM.eval()
        with torch.no_grad():
            holdout_targets = torch.LongTensor(y_train[-10000:].values-1)
            output = clf_FM(torch.FloatTensor(train_app_counts[-10000:].toarray()).to(device))
            fm_loss = loss_fn(output, holdout_targets.to(device)).item()
            test_predicts = torch.max(output, dim=1)[1]
            fm_acc = accuracy_score(holdout_targets, test_predicts.cpu())

            fm_test_losses.append(fm_loss)
            fm_test_accs.append(fm_acc)
            print('           Test loss: {:.4f}\tTest acc: {:.4f}'.format(fm_loss, fm_acc))

            # Checkpoint only when the hold-out accuracy improves. The previous
            # test compared the last training-batch accuracy with a `test_acc`
            # that was never updated, so it saved after every epoch.
            if fm_acc > test_acc:
                test_acc = fm_acc
                save_model(clf_FM, './state_dict/fm.pt')

Fold[1]
Batch[0] Train loss: 1.8029	Train acc: 0.0820
Batch[10] Train loss: 1.7351	Train acc: 0.3203
Batch[20] Train loss: 1.6660	Train acc: 0.3516
Batch[30] Train loss: 1.6429	Train acc: 0.3398
Batch[40] Train loss: 1.5813	Train acc: 0.3438
Batch[50] Train loss: 1.5449	Train acc: 0.4258
Batch[60] Train loss: 1.5394	Train acc: 0.4102
Batch[70] Train loss: 1.5002	Train acc: 0.4453
Batch[80] Train loss: 1.4719	Train acc: 0.4688
Batch[90] Train loss: 1.4712	Train acc: 0.4531
Batch[100] Train loss: 1.4630	Train acc: 0.4648
Batch[110] Train loss: 1.4070	Train acc: 0.4766
Batch[120] Train loss: 1.4323	Train acc: 0.4609
Batch[130] Train loss: 1.4056	Train acc: 0.4922
Batch[140] Train loss: 1.4103	Train acc: 0.4883
Batch[150] Train loss: 1.3680	Train acc: 0.5195
Batch[160] Train loss: 1.3982	Train acc: 0.4844
Batch[170] Train loss: 1.3856	Train acc: 0.4727
Batch[180] Train loss: 1.3777	Train acc: 0.5117
Batch[190] Train loss: 1.3474	Train acc: 0.5195
Batch[200] Train loss: 1.3527	Train acc: 0.

KeyboardInterrupt: 

In [37]:
# Restore the saved checkpoint. map_location moves the stored tensors onto the
# current `device`, so the file also loads on a machine that lacks the GPU it
# was saved from.
clf_FM.load_state_dict(torch.load('./state_dict/fm.pt', map_location=device))
clf_FM.eval()
with torch.no_grad():
    # NOTE(review): only the first 10 test users are scored here.
    output = clf_FM(torch.FloatTensor(test_app_counts[:10].toarray()).to(device))
    # Arg-max over the class dimension -> 0-based class indices, moved to the CPU.
    clf_FM_preds = torch.max(output, dim=1)[1].cpu()

In [34]:
# The FM is trained on `age_group - 1`, so its arg-max output is a 0-based
# class index; add 1 to get back to the original `age_group` labels before
# writing them out.
df = pd.DataFrame({'id':test_basic_info['uid'][:10],'label':clf_FM_preds.numpy() + 1})
df.to_csv('asd.csv',index=False)

---