In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [118]:
# Interaction logs: force challenge_sequence to NumPy's 'i' (C int) dtype on load.
sequence_dtype = {'challenge_sequence': 'i'}
train = pd.read_csv('_data/train.csv', dtype=sequence_dtype)
test = pd.read_csv('_data/test.csv', dtype=sequence_dtype)

# Challenge metadata: category_id is read as an object column so it is not
# coerced to a numeric dtype.
challenge_data = pd.read_csv(
    '_data/challenge_data.csv',
    dtype={'category_id': 'O'},
)

### Fill NaN

In [64]:
# Columns of challenge_data that are treated as categorical: they are filled
# with a sentinel when missing and label-encoded later on.
cat_columns = [
    'challenge_ID',
    'programming_language',
    'challenge_series_ID',
    'author_ID',
    'author_gender',
    'author_org_ID',
    'category_id',
]

In [65]:
# Missing categorical values become the sentinel string '-999'.
# The filled column is assigned back explicitly: calling
# `challenge_data[col].fillna(..., inplace=True)` mutates a temporary column
# selection, which is deprecated and leaves the frame unchanged once pandas
# Copy-on-Write is active.
for col in cat_columns:
    challenge_data[col] = challenge_data[col].fillna('-999')

# Missing submission counts are imputed with the column median.
median_submissions = challenge_data['total_submissions'].median()
challenge_data['total_submissions'] = challenge_data['total_submissions'].fillna(median_submissions)

### Get publish_date features

In [63]:
from datetime import datetime

# publish_date is a 'dd-mm-YYYY' string (see the strptime format below); the
# day / month / year features are the corresponding substrings, kept as strings.
date_strings = challenge_data.publish_date
date_slices = (
    ('publish_day', slice(0, 2)),
    ('publish_month', slice(3, 5)),
    ('publish_year', slice(6, None)),
)
for feature, part in date_slices:
    # `part=part` binds the current slice to the lambda at definition time.
    challenge_data[feature] = date_strings.apply(lambda text, part=part: text[part])

# Weekday as an integer, Monday == 0 (datetime.weekday convention).
challenge_data['publish_weekday'] = date_strings.apply(
    lambda text: datetime.strptime(text, '%d-%m-%Y').weekday()
)

# The raw string column is no longer needed once the features exist.
challenge_data = challenge_data.drop(columns='publish_date')

### Clip total_submissions

In [57]:
# Cap total_submissions to the [0, 99th percentile] range to tame the long tail.
# Series.clip is vectorised, so no per-element Python lambda around np.clip is needed.
q99 = challenge_data['total_submissions'].quantile(0.99)
challenge_data['total_submissions'] = challenge_data['total_submissions'].clip(lower=0, upper=q99)

In [62]:
# Preview the first rows of the challenge metadata.
challenge_data.head()

Unnamed: 0,challenge_ID,programming_language,challenge_series_ID,total_submissions,publish_date,author_ID,author_gender,author_org_ID,category_id,publish_day,publish_month,publish_year,publish_weekday
0,CI23478,2,SI2445,37.0,06-05-2006,AI563576,M,AOI100001,-999,6,5,2006,5
1,CI23479,2,SI2435,48.0,17-10-2002,AI563577,M,AOI100002,32,17,10,2002,3
2,CI23480,1,SI2435,15.0,16-10-2002,AI563578,M,AOI100003,-999,16,10,2002,2
3,CI23481,1,SI2710,236.0,19-09-2003,AI563579,M,AOI100004,70,19,9,2003,4
4,CI23482,2,SI2440,137.0,21-03-2002,AI563580,M,AOI100005,-999,21,3,2002,3


### Numericalize

In [67]:
from sklearn.preprocessing import LabelEncoder

# One fitted LabelEncoder per categorical column, keyed by column name, so the
# same string -> integer mapping can be reused wherever that column is encoded.
label_encoders = {}
for col in cat_columns:
    label_encoders[col] = LabelEncoder().fit(challenge_data[col])

In [73]:
# Spot-check a fitted encoder: encode the author_gender column to integer codes.
label_encoders['author_gender'].transform(challenge_data.author_gender)

array([0, 1, 2])

### Train - Dev split

In [79]:
# Number of distinct users in the training log (sizes the train/dev split below).
train.user_id.nunique()

69532

In [120]:
# Split by user, not by row, so all rows of one user land on the same side.
# The first `train_size` users of a seeded shuffle form the training set and
# the remainder form the dev set.
train_size = 60000
np.random.seed(0)

# unique() returns a new array, so shuffling it in place leaves `train` untouched.
user_ids = train.user_id.unique()
np.random.shuffle(user_ids)

train_ids = set(user_ids[:train_size])
dev_ids = set(user_ids[train_size:])

In [121]:
# Select the rows belonging to each side of the user-level split.
# Series.isin performs the membership test in one vectorised call instead of
# invoking a Python lambda for every row.
train_data = train[train.user_id.isin(train_ids)]
dev_data = train[train.user_id.isin(dev_ids)]

### Get batches

In [123]:
# How many distinct challenge_sequence positions does each user have?
# value_counts() summarises that per-user count across all users.
train.groupby('user_id').challenge_sequence.nunique().value_counts()

13    69532
Name: challenge_sequence, dtype: int64

In [133]:
def get_sequences(ids, df, prefix_lengths=range(10, 13)):
    """Build (prefix, next-challenge) training pairs for each user.

    For every user in ``ids`` the user's challenges are ordered by
    ``challenge_sequence``; for each ``i`` in ``prefix_lengths`` the first
    ``i`` challenges form one input sequence and the challenge at position
    ``i`` (0-based) is its target.

    Parameters
    ----------
    ids : iterable
        User ids to process, in the order their pairs should be emitted.
    df : pandas.DataFrame
        Must contain ``user_id``, ``challenge_sequence`` and ``challenge``.
    prefix_lengths : iterable of int, optional
        Prefix lengths to emit per user. Defaults to ``range(10, 13)``.

    Returns
    -------
    (list of list, list)
        ``sequences`` and ``targets``, aligned element by element.

    Raises
    ------
    IndexError
        If a user has no challenge at one of the requested positions
        (including users that do not appear in ``df`` at all).
    """
    # Sort the frame once and bucket challenges per user in a single pass,
    # rather than filtering and sorting the whole frame again for every user.
    # mergesort is stable, so rows with equal challenge_sequence keep their order.
    ordered = df.sort_values('challenge_sequence', kind='mergesort')
    challenges_by_user = {}
    for uid, challenge in zip(ordered['user_id'], ordered['challenge']):
        challenges_by_user.setdefault(uid, []).append(challenge)

    sequences = []
    targets = []
    for user_id in ids:
        # Unknown users map to an empty list, so indexing below raises IndexError.
        challenges = challenges_by_user.get(user_id, [])
        for i in prefix_lengths:
            sequences.append(challenges[:i])
            targets.append(challenges[i])
    return sequences, targets

In [134]:
# Build (prefix, next-challenge) pairs separately for the train and dev users.
train_sequences, train_targets = get_sequences(train_ids, train_data)
dev_sequences, dev_targets = get_sequences(dev_ids, dev_data)

In [135]:
# Sanity check: sequences and targets must have matching lengths on each side.
len(train_sequences), len(train_targets), len(dev_sequences), len(dev_targets)

(180000, 180000, 28596, 28596)

In [165]:
# Distinct sequence lengths across train and dev. Both lists are walked in
# turn, so no concatenated copy of them is built.
sequence_lengths = list(set(
    len(seq)
    for group in (train_sequences, dev_sequences)
    for seq in group
))

In [208]:
def encode_sequences(sequences, targets, encoder=None):
    """Map challenge labels to their integer codes.

    Parameters
    ----------
    sequences : iterable of iterable
        Each inner iterable holds the challenge labels of one input sequence.
    targets : iterable
        One target label per sequence.
    encoder : object with a ``classes_`` attribute, optional
        Fitted encoder whose ``classes_`` order defines the integer codes.
        Defaults to ``label_encoders['challenge_ID']``.

    Returns
    -------
    (list of numpy.ndarray, numpy.ndarray)
        The encoded sequences (one array each) and the encoded targets.

    Raises
    ------
    ValueError
        If a label is not among ``encoder.classes_``.
    """
    if encoder is None:
        encoder = label_encoders['challenge_ID']

    # Build the label -> code table once. Calling encoder.transform() for every
    # sequence repeats the encoder's per-call setup for each one.
    code_of = {label: code for code, label in enumerate(encoder.classes_)}

    def encode(labels):
        try:
            return np.array([code_of[label] for label in labels])
        except KeyError as err:
            # Keep the exception type LabelEncoder.transform uses for unseen labels.
            raise ValueError('y contains previously unseen labels: %r' % (err.args[0],)) from None

    return [encode(seq) for seq in sequences], encode(targets)

In [None]:
# Encode challenge labels to integer codes for both splits.
# NOTE(review): this cell carries no execution count (`In [None]`), so it may
# never have been run to completion -- confirm before relying on *_ids.
train_sequences_ids, train_targets_ids = encode_sequences(train_sequences, train_targets)
dev_sequences_ids, dev_targets_ids = encode_sequences(dev_sequences, dev_targets)

In [177]:
def get_random_batch(sequences, targets, batch_size, sequence_lengths):
    """Sample a batch whose sequences all share one randomly chosen length.

    A length is drawn from ``sequence_lengths``; ``batch_size`` examples are
    then drawn with replacement from the (sequence, target) pairs of that
    length, using NumPy's global random state.

    Parameters
    ----------
    sequences : sequence
        Input sequences; each must support ``len()``.
    targets : sequence
        One target per sequence, aligned with ``sequences``.
    batch_size : int
        Number of examples to draw.
    sequence_lengths : sequence of int
        Candidate lengths to choose from.

    Returns
    -------
    zip
        Unpacks to ``(sequence_batch, target_batch)``, two tuples of length
        ``batch_size``.

    Raises
    ------
    ValueError
        If no sequence has the drawn length.
    """
    length = np.random.choice(sequence_lengths)

    # Keep the pairs in a plain list. Wrapping ragged (list, target) tuples in
    # np.array raises "inhomogeneous shape" on NumPy >= 1.24.
    candidates = [(seq, target) for seq, target in zip(sequences, targets) if len(seq) == length]
    if not candidates:
        raise ValueError('no sequences of length %d to sample from' % length)

    picks = np.random.choice(len(candidates), batch_size)
    return zip(*[candidates[pick] for pick in picks])

In [199]:
# Draw one batch of 128 examples from the training pairs.
# NOTE(review): this passes the raw label sequences, not the encoded
# train_sequences_ids / train_targets_ids -- confirm that is intended.
seq_batch, target_batch = get_random_batch(train_sequences, train_targets, 128, sequence_lengths)

## Model

In [110]:
# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,user_sequence,user_id,challenge_sequence,challenge
0,4576_1,4576,1,CI23714
1,4576_2,4576,2,CI23855
2,4576_3,4576,3,CI24917
3,4576_4,4576,4,CI23663
4,4576_5,4576,5,CI23933
