In [1]:
import threading
import warnings
import time
import gc

import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

dataPath = '../dataSet/'
warnings.filterwarnings('ignore')

In [2]:
train = pd.read_csv(dataPath+'age_train.csv', names=['uid','age'])
test = pd.read_csv(dataPath+'age_test.csv', names=['uid'])

app_package = pd.read_csv(dataPath+'user_app_actived.csv', names=['uid','appid'])

In [3]:
# Do tfidf-features stacking or not
do_stacking = True

# app_infos translation
def get_str(df):
    res = ''
    for ele in df.split("#"):
        res += ele + ' '
    
    return res

if do_stacking:
    app_package['app_str'] = app_package['appid'].apply(lambda x: get_str(x), 1)

    # Try Tfidf
    tfidf = CountVectorizer()
    train_str_app = pd.merge(train[['uid']], app_package[['uid','app_str']], on='uid', how='left')
    test_str_app = pd.merge(test[['uid']], app_package[['uid','app_str']], on='uid', how='left')
    app_package['app_str'] = tfidf.fit_transform(app_package['app_str'])
    train_app = tfidf.transform(list(train_str_app['app_str'])).tocsc()
    test_app = tfidf.transform(list(test_str_app['app_str'])).tocsc()

In [4]:
trainData = train_app.tocsc()
testData = test_app.tocsc()

trainLabel = train['age'] - 1

In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

do_stacking = False

# According to app_features, do stacking
if do_stacking:
    train_feature = train_app
    test_feature = test_app

    df_stack = pd.DataFrame()

    all_id = pd.concat([train[['uid']], test[['uid']]])
    n_folds = 10
    df_stack['uid'] = all_id['uid']

    labels = train['age'] - 1

    print('LR Stacking')
    stack_train = np.zeros((len(train), 1))
    stack_test = np.zeros((len(test), 1))

    score_va = 0

    kfold = StratifiedKFold(n_splits=n_folds, random_state=0, shuffle=True)

    for i, (tr, va) in enumerate(kfold.split(labels, labels)):
        print('LR Stacking: %d/%d' % ((i+1), n_folds))
        clf = LogisticRegression(solver='sag', n_jobs=-1)
        clf.fit(train_feature[tr], labels[tr])
        score_va = clf.predict(train_feature[va])

        score_te = clf.predict(test_feature)
        print('Mean_Squared_Error: ' + str(mean_squared_error(labels[va], score_va)))
        print('Accuracy: ' + str(accuracy_score(labels[va], score_va)))
        
        stack_train[va, 0] = score_va + 1
        stack_test[:, 0] += score_te + 1

    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack['pack_tfidf_lr_classify_{}'.format('age')] = stack[:, 0]

    print('SGD Stacking')
    stack_train = np.zeros((len(train), 1))
    stack_test = np.zeros((len(test), 1))

    score_va = 0

    kfold = StratifiedKFold(n_splits=n_folds, random_state=1024, shuffle=True)

    for i, (tr, va) in enumerate(kfold.split(labels, labels)):
        print('SGD Stacking: %d/%d' % ((i+1), n_folds))
        sgd = SGDClassifier(loss='log', n_jobs=-1)
        sgd.fit(train_feature[tr], labels[tr])
        score_va = sgd.predict(train_feature[va])

        score_te = sgd.predict(test_feature)
        print('Mean_Squared_Error: ' + str(mean_squared_error(labels[va], sgd.predict(train_feature[va]))))
        print('Accuracy: ' + str(accuracy_score(labels[va], score_va)))
        
        stack_train[va, 0] = score_va + 1
        stack_test[:, 0] += score_te + 1

    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack['pack_tfidf_sgd_classify_{}'.format('age')] = stack[:, 0]

    print('PAC Stacking')
    stack_train = np.zeros((len(train), 1))
    stack_test = np.zeros((len(test), 1))

    score_va = 0

    kfold = StratifiedKFold(n_splits=n_folds, random_state=1024, shuffle=True)

    for i, (tr, va) in enumerate(kfold.split(labels, labels)):
        print('PAC Stacking: %d/%d' % ((i+1), n_folds))
        pac = PassiveAggressiveClassifier(n_jobs=-1)
        pac.fit(train_feature[tr], labels[tr])
        score_va = pac.predict(train_feature[va])

        score_te = pac.predict(test_feature)
        print('Mean_Squared_Error: ' + str(mean_squared_error(labels[va], pac.predict(train_feature[va]))))
        print('Accuracy: ' + str(accuracy_score(labels[va], score_va)))
        
        stack_train[va, 0] = score_va + 1
        stack_test[:, 0] += score_te + 1

    stack_test /= n_folds
    stack = np.vstack([stack_train, stack_test])

    df_stack['pack_tfidf_pac_classify_{}'.format('age')] = stack[:, 0]

    df_stack.to_csv(dataPath+'tfidf_classfiy.csv', index=None, encoding='utf8')
    print('Tfidf Features Stacking is Done~')

In [6]:
# Establish lightGBM to check K-Fold score
params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'learning_rate': 0.10,
    'num_class': 6,
    'nthread': 8
}

do_KFold = False

if do_KFold:
    kfold = StratifiedKFold(n_splits=5, random_state=0, shuffle=True)
    
    for index, (trainIndex, testIndex) in enumerate(kfold.split(trainData, trainLabel)):
        tr_x = trainData[trainIndex].astype(float)
        tr_y = trainLabel[trainIndex].astype(float)
        te_x = trainData[testIndex].astype(float)
        te_y = trainLabel[testIndex].astype(float)

        trainDataSet = lgb.Dataset(tr_x, label=tr_y)
        testDataSet = lgb.Dataset(te_x, label=te_y)

        model = lgb.train(params, trainDataSet, num_boost_round=5000,
                          valid_sets=testDataSet, verbose_eval=100, early_stopping_rounds=100)

        prediction = model.predict(te_x, num_iteration=model.best_iteration)

        # Deal with float vals in prediction
        pred = []
        for ele in prediction:
            pred.append(ele.tolist().index(max(ele)) + 1)

        accuracy = accuracy_score(te_y+1, pred)
        loss = mean_squared_error(te_y+1, pred)

        print('KFold Iteration: %d' % index)
        print('Accuracy: %.5f' % accuracy)
        print('Loss: %.5f' % loss)

In [7]:
params = {
    'objective': 'multiclass',
    'boosting_type': 'gbdt',
    'learning_rate': 0.10,
    'num_class': 6,
    'nthread': 8
}

tr_x = trainData.astype(float)
tr_y = trainLabel.astype(float)
te_x = testData.astype(float)

trainDataSet = lgb.Dataset(tr_x, label=tr_y)
testDataSet = lgb.Dataset(tr_x, label=tr_y)

model = lgb.train(params, trainDataSet, num_boost_round=5000,
                      valid_sets=testDataSet, verbose_eval=100, early_stopping_rounds=100)
    
prediction = model.predict(te_x, num_iteration=model.best_iteration)

# Deal with float vals in prediction
pred = []
for ele in prediction:
    pred.append(ele.tolist().index(max(ele)) + 1)

test['age'] = pred

Training until validation scores don't improve for 100 rounds.
[100]	valid_0's multi_logloss: 1.07277
[200]	valid_0's multi_logloss: 1.0107
[300]	valid_0's multi_logloss: 0.982358
[400]	valid_0's multi_logloss: 0.965137
[500]	valid_0's multi_logloss: 0.952621
[600]	valid_0's multi_logloss: 0.942921
[700]	valid_0's multi_logloss: 0.935071
[800]	valid_0's multi_logloss: 0.928637
[900]	valid_0's multi_logloss: 0.92243
[1000]	valid_0's multi_logloss: 0.917197
[1100]	valid_0's multi_logloss: 0.912417
[1200]	valid_0's multi_logloss: 0.907782
[1300]	valid_0's multi_logloss: 0.903474
[1400]	valid_0's multi_logloss: 0.89938
[1500]	valid_0's multi_logloss: 0.895299
[1600]	valid_0's multi_logloss: 0.891497
[1700]	valid_0's multi_logloss: 0.887762
[1800]	valid_0's multi_logloss: 0.88409
[1900]	valid_0's multi_logloss: 0.880554
[2000]	valid_0's multi_logloss: 0.877134
[2100]	valid_0's multi_logloss: 0.873929
[2200]	valid_0's multi_logloss: 0.870567
[2300]	valid_0's multi_logloss: 0.867378
[2400]	va

In [8]:
test.to_csv('../result/submission_tfidf.csv', header=['id', 'label'], index=False)