# 华为算法精英赛

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import accuracy_score
from sklearn import svm
pd.set_option('display.max_columns',None)

读取数据（`user_app_usage.csv` 除外）

In [3]:
age_train = pd.read_csv("age_train.csv", names=['uid','age_group'])
age_test = pd.read_csv("age_test.csv", names=['uid'])
user_basic_info = pd.read_csv("user_basic_info.csv", names=['uid','gender','city','prodName','ramCapacity','ramLeftRation','romCapacity','romLeftRation','color','fontSize','ct','carrier','os'])
user_behavior_info = pd.read_csv("user_behavior_info.csv", names=['uid','bootTimes','AFuncTimes','BFuncTimes','CFuncTimes','DFuncTimes','EFuncTimes','FFuncTimes','FFuncSum'])
user_app_actived = pd.read_csv("user_app_actived.csv", names=['uid','appId'])
app_info = pd.read_csv("app_info.csv", names=['appId', 'category'])

In [4]:
class2id = {}
id2class = {}

def mergeBasicTables(baseTable):
    
    resTable = baseTable.merge(user_basic_info, how='left', on='uid', suffixes=('_base0', '_ubaf'))
    resTable = resTable.merge(user_behavior_info, how='left', on='uid', suffixes=('_base1', '_ubef'))
    
    cat_columns = ['city','prodName','color','carrier','os','ct']
    for c in cat_columns:
        resTable[c] = resTable[c].apply(lambda x: x if type(x)==str else str(x))
        
        sort_temp = sorted(list(set(resTable[c])))  
        class2id[c+'2id'] = dict(zip(sort_temp, range(1, len(sort_temp)+1)))
        id2class['id2'+c] = dict(zip(range(1,len(sort_temp)+1), sort_temp))
        resTable[c] = resTable[c].apply(lambda x: class2id[c+'2id'][x])
        
    return resTable

In [5]:
train_basic = mergeBasicTables(age_train)
test_basic = mergeBasicTables(age_test)

In [6]:
train_basic.head()

Unnamed: 0,uid,age_group,gender,city,prodName,ramCapacity,ramLeftRation,romCapacity,romLeftRation,color,fontSize,ct,carrier,os,bootTimes,AFuncTimes,BFuncTimes,CFuncTimes,DFuncTimes,EFuncTimes,FFuncTimes,FFuncSum
0,1000001,4,0,51,78,3.0,0.43,32.0,0.46,80,1.15,5,1,14,108,0.0,0.0,1.0,0.07,0.0,0.0,3319
1,1000011,3,0,30,138,,,,,16,,6,1,15,0,0.0,0.0,0.0,0.0,0.0,0.0,220
2,1000015,5,1,228,78,3.0,0.34,32.0,0.06,80,1.3,8,2,14,12,0.0,0.0,0.03,0.13,0.0,0.0,21881
3,1000019,3,0,57,166,2.0,,17.0,,100,,7,3,15,0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,1000023,2,1,293,164,2.0,0.34,16.0,0.06,119,1.0,8,2,12,5,0.0,0.0,0.0,0.13,0.0,0.0,0


`user_basic_info.csv` 中的 `ramCapacity`，`ramLeftRation`，`romCapacity`，`romLeftRation` 和 `fontSize` 列都包含 NAN 值，采用平均值进行填充 

In [7]:
train_basic['ramCapacity'] = train_basic['ramCapacity'].fillna(round(user_basic_info['ramCapacity'].mean()))
test_basic['ramCapacity'] = test_basic['ramCapacity'].fillna(round(user_basic_info['ramCapacity'].mean()))

train_basic['ramLeftRation'] = train_basic['ramLeftRation'].fillna(round(user_basic_info['ramLeftRation'].mean()))
test_basic['ramLeftRation'] = test_basic['ramLeftRation'].fillna(round(user_basic_info['ramLeftRation'].mean(), 2))

train_basic['romCapacity'] = train_basic['romCapacity'].fillna(round(user_basic_info['romCapacity'].mean()))
test_basic['romCapacity'] = test_basic['romCapacity'].fillna(round(user_basic_info['romCapacity'].mean()))

train_basic['romLeftRation'] = train_basic['romLeftRation'].fillna(round(user_basic_info['romLeftRation'].mean(), 2))
test_basic['romLeftRation'] = test_basic['romLeftRation'].fillna(round(user_basic_info['romLeftRation'].mean(), 2))

train_basic['fontSize'] = train_basic['fontSize'].fillna(round(user_basic_info['fontSize'].mean(), 2))
test_basic['fontSize'] = test_basic['fontSize'].fillna(round(user_basic_info['fontSize'].mean(), 2))

In [8]:
train_app = train_basic.merge(user_app_actived, how='left', on='uid')[['uid', 'appId']]
test_app = test_basic.merge(user_app_actived, how='left', on='uid')[['uid', 'appId']]

In [11]:
vectorizer = CountVectorizer(min_df=1, max_df=0.7, tokenizer=lambda x:x.split('#'))
train_app_counts = vectorizer.fit_transform(train_app['appId'])
test_app_counts = vectorizer.transform(test_app['appId'])

In [None]:
from sklearn.decomposition import LatentDirichletAllocation



In [11]:
import scipy.sparse as sp
from scipy.sparse import csr_matrix

X_train = sp.hstack((csr_matrix(train_basic.drop(['uid', 'age_group'], axis=1).values), train_app_counts), format='csr')
X_test = sp.hstack((csr_matrix(test_basic.drop(['uid'], axis=1).values), test_app_counts), format='csr')
y_train = train_basic['age_group']

In [17]:
from sklearn.naive_bayes import MultinomialNB
SVM = svm.LinearSVC(C=0.0001)
SVM.fit(X_train[:10000], y_train[:10000])
SVM_preds = SVM.predict(X_train[-5000:])
accuracy_score(SVM_preds, y_train[-5000:])

0.34839999999999999

0.06: 59.219%

In [27]:
SVM = svm.LinearSVC(C=0.06)
SVM.fit(train_app_counts, train_basic['age_group'])
SVM_preds = SVM.predict(test_app_counts)

In [28]:
df = pd.DataFrame({'id':test_basic['uid'],'label':SVM_preds})
df.to_csv('submission.csv',index=False)

In [None]:
df = pd.DataFrame({'id':test_basic['uid'],'label':NB_preds})
df.to_csv('submission.csv',index=False)

In [26]:
from sklearn.neighbors import KNeighborsClassifier

clf_KNN = KNeighborsClassifier(30)
clf_KNN.fit(train_app_counts[:10000], y_train[:10000])
KNN_preds = clf_KNN.predict(train_app_counts[-5000:])
accuracy_score(KNN_preds, y_train[-5000:])

0.36959999999999998

---

In [25]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

from sklearn.metrics import accuracy_score
from sklearn import svm
from torch.autograd import Variable
from sklearn.model_selection import KFold

读取处理好的 `basic` 以及 `app` 数据

In [2]:
train_basic_info = pd.read_csv('./data/train_basic_info.csv')
test_basic_info = pd.read_csv('./data/test_basic_info.csv')
train_app_info = pd.read_csv('./data/train_app_info.csv')
test_app_info = pd.read_csv('./data/test_app_info.csv')

In [3]:
X_train = np.concatenate((train_basic_info.drop(['uid', 'age_group'], axis=1).values, train_app_info[['1stCategory', '2ndCategory', '3rdCategory']].values), axis=1)
X_test = np.concatenate((test_basic_info.drop(['uid'], axis=1).values, test_app_info[['1stCategory', '2ndCategory', '3rdCategory']].values), axis=1)

In [4]:
y_train = train_basic_info['age_group'].values

In [43]:
from sklearn.metrics import accuracy_score
from sklearn import svm

SVM = svm.LinearSVC(C=0.1)
SVM.fit(train_app_counts[:10000], y_train[:10000])
SVM_preds = SVM.predict(train_app_counts[-5000:])
accuracy_score(SVM_preds, y_train[-5000:])

0.50160000000000005

In [8]:
import torch.nn as nn
import torch

class MLP(nn.Module):
    def __init__(self, n_feature, n_hidden, n_output):

        super(MLP, self).__init__()
        # define a liner layer
        self.hidden = nn.Linear(n_feature, n_hidden)
        # define sigmoid activation 
        self.sigmoid = nn.Sigmoid()
        self.predict = nn.Linear(n_hidden, n_output)

    def forward(self, x):
        # hidden layer
        h1 = self.hidden(x)
        # activate function
        h2 = self.sigmoid(h1)
        # output layer
        out = self.predict(h2)
        return out

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

countVzer = CountVectorizer(min_df=1, max_df=0.7, tokenizer=lambda x: x.split('#'))
train_app_counts = countVzer.fit_transform(train_app_info['appId'])
test_app_counts = countVzer.transform(test_app_info['appId'])

In [30]:
batch_size = 256
device = torch.device("cuda:1")
lr = 1e-2
weight_decay = 1e-5

clf_MLP = MLP(train_app_counts[:2000000].shape[1], 1000, 6)
clf_MLP.to(device)
optimizer = torch.optim.Adam(clf_MLP.parameters(), lr=lr, weight_decay=weight_decay)
loss_fn = nn.CrossEntropyLoss()
kf = KFold(n_splits=3, shuffle=True, random_state=0)

for f, (train_index, val_index) in enumerate(kf.split(train_app_counts[:2000000])):
    print('Fold[{}]'.format(f+1))
    for epoch in range(10):
        clf_MLP.train()
        for i in range(int(len(train_index)/batch_size)):
            batch_X_train = Variable(torch.FloatTensor(train_app_counts[:2000000][train_index][i*batch_size:(i+1)*batch_size].toarray())).to(device)
            batch_y_train = Variable(torch.LongTensor(y_train[:2000000][train_index][i*batch_size:(i+1)*batch_size])).to(device)
            output = clf_MLP(batch_X_train)

            loss = loss_fn(output, batch_y_train-1)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            batch_y_preds = torch.max(output, dim=1)[1]
            acc = accuracy_score((batch_y_train-1).cpu(), batch_y_preds.cpu())

            if i % 10 == 0:
                print('Batch[{}] Train loss: {:.4f}\tTrain acc: {:.4f}'.format(i, loss, acc))

        clf_MLP.eval()
        with torch.no_grad():
            output = clf_MLP(torch.FloatTensor(train_app_counts[-10000:].toarray()).to(device))
            loss = loss_fn(output, torch.LongTensor(y_train[-10000:]-1).to(device))
            MLP_preds = torch.max(output, dim=1)[1]
            acc = accuracy_score(torch.LongTensor(y_train[-10000:]-1).cpu(), MLP_preds.cpu())

            print('           Test loss: {:.4f}\tTest acc: {:.4f}'.format(loss, acc))

Fold[1]
Batch[0] Train loss: 1.8615	Train acc: 0.0195
Batch[10] Train loss: 1.7259	Train acc: 0.3398
Batch[20] Train loss: 1.6279	Train acc: 0.3789
Batch[30] Train loss: 1.3335	Train acc: 0.4766
Batch[40] Train loss: 1.1965	Train acc: 0.5000
Batch[50] Train loss: 1.0711	Train acc: 0.5312
Batch[60] Train loss: 1.0606	Train acc: 0.5469
Batch[70] Train loss: 1.0578	Train acc: 0.5859
Batch[80] Train loss: 1.1155	Train acc: 0.4844
Batch[90] Train loss: 0.9897	Train acc: 0.5820
Batch[100] Train loss: 1.0166	Train acc: 0.5469
Batch[110] Train loss: 1.0605	Train acc: 0.5586
Batch[120] Train loss: 1.1247	Train acc: 0.5000
Batch[130] Train loss: 1.1033	Train acc: 0.5469
Batch[140] Train loss: 1.0275	Train acc: 0.5820
Batch[150] Train loss: 1.0346	Train acc: 0.5234
Batch[160] Train loss: 1.1717	Train acc: 0.5000
Batch[170] Train loss: 1.0382	Train acc: 0.5625
Batch[180] Train loss: 1.1221	Train acc: 0.5547
Batch[190] Train loss: 0.9984	Train acc: 0.5703
Batch[200] Train loss: 1.0961	Train acc: 0.

KeyboardInterrupt: 

In [31]:
from sklearn.neural_network import MLPClassifier

In [41]:
from sklearn.neural_network import MLPClassifier

clf_MLP = MLPClassifier(solver='adam', activation='relu', hidden_layer_sizes=(100, 100), verbose=True)
clf_MLP.fit(train_app_counts[:10000], y_train[:10000])
MLP_preds = clf_MLP.predict(train_app_counts[-5000:])
accuracy_score(MLP_preds, y_train[-5000:])

Iteration 1, loss = 1.37662281
Iteration 2, loss = 0.95313958
Iteration 3, loss = 0.71360190
Iteration 4, loss = 0.49706417
Iteration 5, loss = 0.32677944
Iteration 6, loss = 0.20624829
Iteration 7, loss = 0.13652077
Iteration 8, loss = 0.08753950
Iteration 9, loss = 0.05881813
Iteration 10, loss = 0.04267850
Iteration 11, loss = 0.03232963
Iteration 12, loss = 0.02556241
Iteration 13, loss = 0.02036107
Iteration 14, loss = 0.01697590
Iteration 15, loss = 0.01396573
Iteration 16, loss = 0.01243395
Iteration 17, loss = 0.01049332
Iteration 18, loss = 0.00919446
Iteration 19, loss = 0.00837245
Iteration 20, loss = 0.00746220
Iteration 21, loss = 0.00665187
Iteration 22, loss = 0.00618236
Iteration 23, loss = 0.00603475
Iteration 24, loss = 0.00580351
Iteration 25, loss = 0.00487482
Iteration 26, loss = 0.00467258




0.4768