In [1]:
import threading
import warnings
import time
import gc

import lightgbm as lgb
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

from keras.utils import to_categorical

# Folder (relative to the notebook) that every input file below is read from.
dataPath = '../dataSet/'
# Install a blanket "ignore" filter so no Python warning is shown from here on.
warnings.filterwarnings(action='ignore')

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


In [2]:
# All three files are read as headerless CSVs; column names are supplied here.
train = pd.read_csv(
    dataPath + 'age_train.csv',
    header=None,
    names=['uid', 'age'],
)
test = pd.read_csv(
    dataPath + 'age_test.csv',
    header=None,
    names=['uid'],
)

# One `appid` value per `uid`; later cells tokenise it on '#'.
app_package = pd.read_csv(
    dataPath + 'user_app_actived.csv',
    header=None,
    names=['uid', 'appid'],
)

In [3]:
# Extra per-user feature table, read with the file's own first row as header.
app_number_feat = pd.read_csv(dataPath + 'app_activated_sum.csv')

# Left joins on `uid`: every row of train/test is kept even when a side table
# has no matching user (the joined columns are then NaN).
train = (
    train
    .merge(app_package, how='left', on='uid')
    .merge(app_number_feat, how='left', on='uid')
)
test = (
    test
    .merge(app_package, how='left', on='uid')
    .merge(app_number_feat, how='left', on='uid')
)

In [4]:
from gensim.models import FastText, Word2Vec
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing import text, sequence

# Word2Vec embedding of apps.
#   do_fast_model: True  -> train the model from `app_package` and save it,
#                  False -> load the previously saved model from disk.
#   do_embedding:  True  -> export the model's vectors to a CSV table,
#                  False -> load the previously exported table.
do_embedding = False
do_fast_model = False
embedding_size = 128

if do_fast_model:
    # Word2Vec expects an iterable of token lists. `app_package` only has the
    # columns `uid` and `appid`, so each '#'-separated `appid` string is split
    # into one "sentence" of app tokens (the same separator the Tokenizer uses).
    app_sentences = app_package['appid'].dropna().astype(str).str.split('#').tolist()
    fast_model = Word2Vec(app_sentences, size=embedding_size, window=4, min_count=3, negative=2,
                          sg=1, sample=0.002, hs=1, workers=8)
    fast_model.save(dataPath + 'nn/fastmodel.model')
else:
    fast_model = Word2Vec.load(dataPath + 'nn/fastmodel.model')

if do_embedding:
    # Vectors are looked up through `.wv`; indexing the model object directly
    # is the deprecated spelling of the same lookup.
    vocab_words = list(fast_model.wv.vocab)
    embedding_fast = pd.DataFrame([fast_model.wv[word] for word in vocab_words])
    embedding_fast['app'] = vocab_words
    embedding_fast.columns = ['fast_dim_%s' % str(i) for i in range(embedding_size)] + ['app']
    embedding_fast.to_csv(dataPath + 'embedding_fast.csv')
else:
    # NOTE(review): the export above also writes the DataFrame index, so the
    # reloaded table carries one extra unnamed leading column compared with the
    # freshly built one.
    embedding_fast = pd.read_csv(dataPath + 'embedding_fast.csv')

In [5]:
# Build integer sequences from each user's '#'-separated app string, then a
# matrix that maps every tokenizer index to its Word2Vec vector.
tokenizer = Tokenizer(lower=False, char_level=False, split='#')
tokenizer.fit_on_texts(list(app_package['appid']))

# The left merges can leave `appid` as NaN for users missing from
# `app_package`; an empty string tokenises to an empty sequence instead of
# failing on a float.
X_seq = tokenizer.texts_to_sequences(train['appid'].fillna(''))
X_test_seq = tokenizer.texts_to_sequences(test['appid'].fillna(''))

# Pad / truncate every user to exactly 100 app indices (0 is the padding value).
X = pad_sequences(X_seq, maxlen=100, value=0)
X_test = pad_sequences(X_test_seq, maxlen=100, value=0)

max_features = 30000
embedding_matrix = np.zeros((max_features, embedding_size))
for word, idx in tokenizer.word_index.items():
    # Rows stay all-zero for apps the Word2Vec model does not know. Indices at
    # or beyond `max_features` are skipped so a larger vocabulary cannot index
    # past the end of the matrix.
    if idx >= max_features or word not in fast_model.wv.vocab:
        continue
    embedding_matrix[idx] = fast_model.wv[word]

embedding_pd = pd.DataFrame(embedding_matrix)
embedding_pd.to_csv(dataPath + 'embedding_matrix.csv', index=False)

In [6]:
# Classification target: the `age` column of the merged training frame.
Y_age = train['age']

# Zero-filled 1-D buffers, one slot per test row / per training row.
sub = np.zeros(len(X_test))
oof_pref = np.zeros(len(X))

In [7]:
# Re-selects the `age` column of `train` as the target vector (the same
# assignment already made in the previous cell).
Y_age = train['age']

In [15]:
# 10-fold stratified cross-validation of the LightGBM classifier on the padded
# app-index sequences. Each fold reports accuracy and mean squared error for
# both its training and validation part, and stores the validation predictions
# in `oof_pref`.
kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
for index, (trainIndex, testIndex) in enumerate(kfold.split(X, Y_age)):
    # `split` yields positional indices, so the pandas Series is sliced with
    # `.iloc`; plain `[]` would look the values up by index label.
    tr_x, te_x = X[trainIndex], X[testIndex]
    tr_y, te_y = Y_age.iloc[trainIndex], Y_age.iloc[testIndex]

    model = lgb.LGBMClassifier(nthread=8, feature_fraction=0.4, bagging_fraction=0.632,
                               bagging_freq=10, n_estimators=2000)
    model.fit(tr_x, tr_y)

    # Kept in `pred` only: the name `prediction` is used by the submission
    # cells for the test-set output and must not be overwritten here.
    pred = model.predict(te_x)

    accuracy = accuracy_score(te_y, pred)
    # "Loss" is the mean squared error between predicted and true class labels.
    loss = mean_squared_error(te_y, pred)

    train_prediction = model.predict(tr_x)
    train_accuracy = accuracy_score(tr_y, train_prediction)
    train_loss = mean_squared_error(tr_y, train_prediction)

    oof_pref[testIndex] = pred

    print('KFold Iteration: %d' % index)
    print('Train Accuracy: %.5f, Train Loss: %.5f' % (train_accuracy, train_loss))
    print('Validation Accuracy: %.5f, Validation Loss: %.5f' % (accuracy, loss))

KFold Iteration: 0
Train Accuracy: 0.56085, Train Loss: 0.88736
Validation Accuracy: 0.51703, Validation Loss: 0.94468


KeyboardInterrupt: 

In [12]:
# Fit the classifier on the complete training set and predict the test set.
model = lgb.LGBMClassifier(nthread=8, feature_fraction=0.4, bagging_fraction=0.632,
                           bagging_freq=10, n_estimators=2000)
model.fit(X, Y_age)

prediction = model.predict(X_test)
# Independent copy of the id column: the submission cell adds a column to this
# frame, which must not be a slice that still refers back to `test`.
final_sub = test[['uid']].copy()

ValueError: Length mismatch: Expected axis has 1 elements, new values have 2 elements

In [14]:
# Attach the predicted labels. `assign` builds a new frame instead of writing a
# column into a frame that was sliced out of `test`, and re-running the cell
# simply replaces the `age` column.
final_sub = final_sub.assign(age=prediction)
print(final_sub)

# The submission file uses the column names `id` / `label` instead of `uid` / `age`.
final_sub.to_csv('../result/submission.csv', header=['id', 'label'], index=False)

            uid  age
0       1000002    4
1       1000003    5
2       1000004    3
3       1000005    3
4       1000006    4
5       1000007    3
6       1000008    3
7       1000009    3
8       1000010    5
9       1000012    5
10      1000013    4
11      1000014    5
12      1000016    3
13      1000017    3
14      1000018    1
15      1000020    4
16      1000021    3
17      1000022    3
18      1000024    3
19      1000026    4
20      1000027    4
21      1000028    4
22      1000030    3
23      1000031    3
24      1000032    3
25      1000033    4
26      1000034    3
27      1000036    3
28      1000039    4
29      1000041    4
...         ...  ...
502470  3178805    3
502471  3179045    2
502472  3179064    2
502473  3179100    2
502474  3179128    3
502475  3179165    2
502476  3179213    2
502477  3179270    1
502478  3179327    2
502479  3179417    2
502480  3179418    1
502481  3179611    1
502482  3179789    2
502483  3179836    2
502484  3179942    2
502485  31799