In [None]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, normalize
from sklearn.feature_selection import SelectKBest

from scipy.stats import entropy
from scipy.sparse import hstack, csr_matrix

from xgboost import XGBClassifier

from __future__ import division

### Reading train and test data

In [None]:
# Read the identifier-like columns as strings so they act as categorical keys
# instead of numbers. The fine-line column is spelled 'FinelineNumber' (the
# spelling used when it is aggregated later in this notebook); a dtype entry
# under any other spelling does not apply to it.
id_dtypes = {'Upc': 'str', 'FinelineNumber': 'str'}
train_df = pd.read_csv('../data/train.csv', dtype=id_dtypes)
test_df = pd.read_csv('../data/test.csv', dtype=id_dtypes)

# Missing values become their own explicit 'others' category.
train_df = train_df.fillna('others')
test_df = test_df.fillna('others')

# Turn the label into a prefixed string, e.g. 'TripType_<value>'.
train_df.TripType = 'TripType_' + train_df.TripType.astype('str')

### ScanCount aggregation per visit for a feature 

In [None]:
def get_feature_count(data, col):
    """Build one {value-of-col: total ScanCount} dict per visit.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'VisitNumber', 'ScanCount' and `col`.
    col : str
        Name of the column whose values become the dictionary keys.

    Returns
    -------
    list of dict
        One dictionary per distinct VisitNumber, in groupby order (ascending
        VisitNumber). Each maps a value of `col` to the sum of ScanCount for
        that value within the visit.
    """
    # Total ScanCount for every (visit, value) pair.
    totals = data.groupby(['VisitNumber', col])['ScanCount'].sum().reset_index()

    # Materialise the pairs with list(): on Python 3 zip() is a lazy iterator
    # without a length and cannot be assigned to a DataFrame column directly.
    totals['Col_ScanCount'] = list(zip(totals[col], totals['ScanCount']))

    # Collect the (value, count) pairs of each visit into a single list.
    pairs_per_visit = totals.groupby('VisitNumber')['Col_ScanCount'].apply(list)

    return [dict(pairs) for pairs in pairs_per_visit]

### Adding the nonzero count, the sum, and the entropy of each observation

In [None]:
def add_sample_stat(X, add_sum=False):
    """L1-normalise the rows of X and append per-row summary columns.

    Parameters
    ----------
    X : sparse matrix or array-like
        Per-sample feature values; anything accepted by `csr_matrix`.
        The caller's object is left unmodified.
    add_sum : bool, default False
        When True, also append the raw (pre-normalisation) row sum.

    Returns
    -------
    scipy.sparse.csr_matrix
        The normalised features followed by, in order: the row sum (only if
        `add_sum`), the number of nonzero entries in the row, and the row
        entropy.
    """
    # Work on a private copy: csr_matrix(X) can share its buffers with an
    # input that is already CSR, and the in-place normalisation below would
    # then rescale the caller's data as a side effect.
    X = csr_matrix(X, copy=True)

    # Both statistics are taken from the raw values, before normalisation.
    sample_count = (X != 0).sum(axis=1)
    sample_sum = X.sum(axis=1)

    # Entropy of each row's raw values, shaped as a column vector.
    sample_entropy = np.array([entropy(x.toarray()[0]) for x in X]).reshape((-1,1))
    # Results that compare below zero are reset to 0. NaN never compares
    # below zero, so any NaN result is left as it is.
    sample_entropy[sample_entropy<0] = 0

    # In place is safe here because X is our own copy.
    X = normalize(X, norm='l1', copy=False)

    if add_sum:
        X = hstack((X,sample_sum,sample_count,sample_entropy))
    else:
        X = hstack((X,sample_count,sample_entropy))
    return csr_matrix(X)

### DepartmentDescription features

In [None]:
# Per-visit {department: total ScanCount} dictionaries for both data sets.
dep_counts_train = get_feature_count(train_df, 'DepartmentDescription')
dep_counts_test = get_feature_count(test_df, 'DepartmentDescription')

# The vectorizer is fitted on the training dictionaries only and then
# applied unchanged to the test dictionaries.
dep_vectorizer = DictVectorizer()
X_dep = add_sample_stat(
    dep_vectorizer.fit_transform(dep_counts_train),
    add_sum=True,
)
X_dep_test = add_sample_stat(
    dep_vectorizer.transform(dep_counts_test),
    add_sum=True,
)

### FinelineNumber features

In [None]:
# Per-visit {fineline number: total ScanCount} dictionaries.
fine_counts_train = get_feature_count(train_df, 'FinelineNumber')
fine_counts_test = get_feature_count(test_df, 'FinelineNumber')

# Fit on train, reuse for test; the row sum is not appended here
# (add_sum keeps its default).
fine_vectorizer = DictVectorizer()
X_fine = add_sample_stat(fine_vectorizer.fit_transform(fine_counts_train))
X_fine_test = add_sample_stat(fine_vectorizer.transform(fine_counts_test))

### Upc features

In [None]:
# Per-visit {Upc: total ScanCount} dictionaries.
upc_counts_train = get_feature_count(train_df, 'Upc')
upc_counts_test = get_feature_count(test_df, 'Upc')

# Raw counts only: no normalisation or summary columns for Upc.
upc_vectorizer = DictVectorizer()
X_upc = upc_vectorizer.fit_transform(upc_counts_train)
X_upc_test = upc_vectorizer.transform(upc_counts_test)

### Target variable

In [None]:
# One label per visit, in ascending VisitNumber order. The per-visit feature
# rows come out of a groupby on VisitNumber, so taking the labels through the
# same grouping keeps row i of y aligned with row i of the feature matrices
# even if the CSV is not sorted by VisitNumber.
y = np.array(train_df.groupby('VisitNumber')['TripType'].first())

### Dimensionality Reduction for Upc features

In [None]:
# Keep the 500 highest-scoring Upc columns (SelectKBest's default scoring
# function); the selection is learned from the training data only and then
# applied to both matrices.
selection = SelectKBest(k=500)
selection.fit(X_upc, y)

X_upc = selection.transform(X_upc)
X_upc_test = selection.transform(X_upc_test)

### Weekday feature

In [None]:
# One weekday per visit, in ascending VisitNumber order. Going through a
# groupby on VisitNumber gives the same row order as the other per-visit
# feature matrices, independent of how the CSV rows are ordered.
X_weekday = np.asarray(train_df.groupby('VisitNumber')['Weekday'].first())
X_weekday_test = np.asarray(test_df.groupby('VisitNumber')['Weekday'].first())

lbl_enc = LabelEncoder()
one_enc = OneHotEncoder()

# Weekday names -> integer codes, shaped as a single column for the one-hot
# encoder. Both encoders are fitted on train and reused for test.
X_weekday = lbl_enc.fit_transform(X_weekday).reshape((-1,1))
X_weekday_test = lbl_enc.transform(X_weekday_test).reshape((-1,1))

# Integer codes -> one-hot indicator columns.
X_weekday = one_enc.fit_transform(X_weekday)
X_weekday_test = one_enc.transform(X_weekday_test)

### Stacking features

In [None]:
# Column-wise concatenation of every feature group; the train and test
# lists must stay in the same order so the columns line up.
train_blocks = [X_dep, X_fine, X_upc, X_weekday]
test_blocks = [X_dep_test, X_fine_test, X_upc_test, X_weekday_test]

X = hstack(train_blocks)
X_test = hstack(test_blocks)

### xgboost model

In [None]:
# Hyper-parameters gathered in one place so they are easy to read and tune.
xgb_params = {
    'max_depth': 15,
    'learning_rate': 0.08,
    'n_estimators': 959,
    'objective': 'multi:softprob',
    'nthread': -1,
    'min_child_weight': 1,
    'max_delta_step': 0,
    'subsample': 0.83,
    'colsample_bytree': 1,
    'colsample_bylevel': 0.77,
    'seed': 2345,
}
xgb_model = XGBClassifier(**xgb_params)

# Train on the stacked features, then get one probability column per class
# for every test visit.
xgb_model.fit(X, y)
predictions = xgb_model.predict_proba(X_test)

### Submission

In [None]:
# One probability column per class, named after the class labels.
submission_df = pd.DataFrame(predictions, columns=xgb_model.classes_)

# The feature rows are produced by a groupby on VisitNumber, i.e. in
# ascending VisitNumber order, so the ids are sorted to match the prediction
# rows rather than relying on the order in which they appear in the CSV.
submission_df['VisitNumber'] = np.sort(test_df.VisitNumber.unique())

# classes_ may be a numpy array, in which case `list + array` is an
# elementwise operation instead of a concatenation; convert it to a list
# first so this really builds the column order.
submission_df = submission_df[['VisitNumber'] + list(xgb_model.classes_)]
submission_df.to_csv('../submission/xgb_submission.csv', index=False)