In [26]:
import pandas as pd
import numpy as np
import calendar
from time import time
from datetime import datetime
import re
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, cross_validate, RandomizedSearchCV, GridSearchCV
from sklearn.metrics import log_loss, accuracy_score, classification_report
from mlxtend.classifier import StackingClassifier
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the raw training rows. DataFrame.replace is applied to the whole frame, so any
# cell equal to "MENS WEAR" (in any column) becomes "MENSWEAR" -- presumably to merge
# two spellings of one department label; confirm against the data.
data_train = pd.read_csv("train.csv").replace("MENS WEAR","MENSWEAR")

In [3]:
# Every distinct, non-missing department label, in sorted order.
dept_list = sorted(data_train["DepartmentDescription"].dropna().unique())

In [4]:
# Column name -> groupby reduction used when collapsing the row-level data into one
# row per visit. The string aliases select the same built-in groupby reductions that
# np.sum / np.max were mapped to, without the FutureWarning recent pandas releases
# emit when NumPy callables are passed to aggregate().
weekdays = list(calendar.day_name)  # day names follow the active locale
dept_list_sum = dict.fromkeys(dept_list, "sum")   # items per department: summed
weekday_dict = dict.fromkeys(weekdays, "max")     # weekday indicator: 1 if any row has it
feature_dict = {
    "TripType": "max",
    "NumItems": "sum",
    "Return": "max",
    **weekday_dict,
    **dept_list_sum,
}

In [5]:
def transform_data(data, weekday_names=None, departments=None, aggregations=None):
    """Collapse row-level purchase data into one feature row per visit.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level frame with ``VisitNumber``, ``Weekday``, ``ScanCount`` and
        ``DepartmentDescription`` columns, and optionally ``TripType``.
        It is not modified.
    weekday_names : list of str, optional
        Weekday indicator columns to produce. Defaults to the notebook-level
        ``weekdays`` list.
    departments : list of str, optional
        Department columns to produce. Defaults to the notebook-level
        ``dept_list``.
    aggregations : dict, optional
        Column name -> groupby reduction. Defaults to the notebook-level
        ``feature_dict``. Entries for columns that are not present (for
        example ``TripType`` on unlabeled data) are skipped.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``VisitNumber`` with columns ``TripType`` (when available),
        ``NumItems``, ``Return``, the weekday indicators and the per-department
        item counts, in that order.
    """
    # Only the selected branch is evaluated, so the notebook globals are needed
    # only when the corresponding argument is omitted.
    weekday_names = list(weekdays if weekday_names is None else weekday_names)
    departments = list(dept_list if departments is None else departments)
    aggregations = feature_dict if aggregations is None else aggregations

    # reindex guarantees one column per expected label even when a weekday or
    # department never occurs in `data`; labels outside the lists are dropped.
    weekday_flags = pd.get_dummies(data["Weekday"]).reindex(
        columns=weekday_names, fill_value=0
    )
    # Indicator * ScanCount = items scanned in that department on that row
    # (negative for returned items).
    dept_items = (
        pd.get_dummies(data["DepartmentDescription"])
        .reindex(columns=departments, fill_value=0)
        .mul(data["ScanCount"], axis=0)
    )

    # Assemble a fresh frame instead of adding columns to the caller's `data`.
    key_cols = [col for col in ("VisitNumber", "TripType") if col in data.columns]
    line_items = pd.concat(
        [
            data[key_cols],
            data["ScanCount"].rename("NumItems"),
            # 1.0 on rows with a negative scan count, 0.0 otherwise.
            (data["ScanCount"] < 0).astype(float).rename("Return"),
            weekday_flags,
            dept_items,
        ],
        axis=1,
    )

    spec = {
        col: func for col, func in aggregations.items() if col in line_items.columns
    }
    per_visit = line_items.groupby("VisitNumber").aggregate(spec)

    ordered = ["TripType", "NumItems", "Return"] + weekday_names + departments
    return per_visit[[col for col in ordered if col in per_visit.columns]]

In [6]:
# Aggregate the raw rows into the per-visit feature frame (indexed by VisitNumber).
data_new = transform_data(data_train)

In [7]:
def add_category_counts(data, dept_columns=None):
    """Add a ``CategoryCounts`` column: departments with a positive item count.

    Parameters
    ----------
    data : pandas.DataFrame
        Per-visit frame containing one numeric column per department.
        It is not modified.
    dept_columns : list of str, optional
        Department columns to inspect. Defaults to the notebook-level
        ``dept_list``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with ``CategoryCounts`` inserted as the fourth
        column (or appended when ``data`` has fewer than three columns).

    Raises
    ------
    KeyError
        If any of the department columns is missing from ``data``.
    """
    if dept_columns is None:
        dept_columns = dept_list

    # Select the department columns by name rather than "everything from the first
    # department onwards", so columns joined later are never counted by accident.
    counts = data[list(dept_columns)].gt(0).sum(axis=1)

    # drop() returns a new frame, which keeps the input untouched and lets the
    # function be re-applied to its own output without a duplicate-column error.
    result = data.drop(columns="CategoryCounts", errors="ignore")
    result.insert(min(3, result.shape[1]), "CategoryCounts", counts)
    return result

In [8]:
# Add the CategoryCounts column to the per-visit frame.
data_new = add_category_counts(data_new)

In [9]:
# Check the (rows, columns) size of the feature frame at this stage.
data_new.shape

(95674, 78)

In [10]:
def fineline_dummies(data, min_count=500):
    """Count, per visit, how often each frequent ``FinelineNumber`` occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level frame with ``FinelineNumber`` and ``VisitNumber`` columns.
        It is not modified.
    min_count : int, default 500
        A fineline value gets its own column only when it appears on strictly
        more than ``min_count`` rows of ``data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``VisitNumber`` with one ``fln_<value>`` column per frequent
        value, holding the number of rows of that visit carrying the value.
    """
    finelines = data["FinelineNumber"]
    counts = finelines.value_counts()
    frequent = counts[counts > min_count].index

    # Series.where builds a new Series, so the caller's column is never written
    # to. Rare and missing values all collapse into the "-" placeholder.
    bucketed = finelines.where(finelines.isin(frequent), "-")

    # errors="ignore": there is no "-" column when every row holds a frequent value.
    dummies = pd.get_dummies(bucketed).drop(columns="-", errors="ignore")
    dummies.columns = ["fln_" + str(col) for col in dummies.columns]

    per_row = pd.concat([dummies, data["VisitNumber"]], axis=1)
    return per_row.groupby("VisitNumber").sum()

In [11]:
# Build the per-visit fineline features and time the call; the final expression
# displays the elapsed wall-clock time as a string.
start = datetime.now()
fln_dummies = fineline_dummies(data_train)
end = datetime.now()
str(end-start)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


'0:00:13.157992'

In [12]:
# Append the fineline columns; DataFrame.join aligns the two frames on their index.
data_new = data_new.join(fln_dummies)

In [13]:
# Check the frame size after joining the fineline features.
data_new.shape

(95674, 351)

In [14]:
def Upc_dummies(data, min_count=300):
    """Count, per visit, how often each frequent ``Upc`` value occurs.

    Parameters
    ----------
    data : pandas.DataFrame
        Row-level frame with ``Upc`` and ``VisitNumber`` columns.
        It is not modified.
    min_count : int, default 300
        A UPC value gets its own column only when it appears on strictly more
        than ``min_count`` rows of ``data``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``VisitNumber`` with one ``upc_<value>`` column per frequent
        value, holding the number of rows of that visit carrying the value.
    """
    upcs = data["Upc"]
    counts = upcs.value_counts()
    frequent = counts[counts > min_count].index

    # Series.where builds a new Series, so the caller's column is never written
    # to. Rare and missing values all collapse into the "-" placeholder.
    bucketed = upcs.where(upcs.isin(frequent), "-")

    # errors="ignore": there is no "-" column when every row holds a frequent value.
    dummies = pd.get_dummies(bucketed).drop(columns="-", errors="ignore")
    dummies.columns = ["upc_" + str(col) for col in dummies.columns]

    per_row = pd.concat([dummies, data["VisitNumber"]], axis=1)
    return per_row.groupby("VisitNumber").sum()

In [15]:
# Build the per-visit UPC features and time the call; the final expression
# displays the elapsed wall-clock time as a string.
start = datetime.now()
upc_dummies = Upc_dummies(data_train)
end = datetime.now()
str(end-start)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


'0:00:05.555006'

In [16]:
# Append the UPC columns; DataFrame.join aligns the two frames on their index.
data_new = data_new.join(upc_dummies)

In [17]:
# Check the frame size after joining the UPC features.
data_new.shape

(95674, 451)

In [18]:
# Features: every column of the per-visit frame except the label.
X = data_new.drop(columns='TripType')

# Labels: map each raw TripType code to a consecutive class id 0..n-1,
# following the sorted order of the raw codes.
trip_types = sorted(data_train['TripType'].unique())
trip_types_map = dict(zip(trip_types, np.arange(len(trip_types))))
y = data_new['TripType'].map(trip_types_map)

# 75/25 split that keeps the class proportions of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

In [19]:
# Number of feature columns (the TripType label has been dropped from X).
len(X.columns)

450

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from mlxtend.classifier import StackingClassifier

In [21]:
from sklearn.decomposition import PCA, SparsePCA

In [22]:
# Fit a 100-component PCA on the training split and apply the same projection to
# the test split, so the test rows do not influence the fitted components.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell applies PCA
# to data that is already projected. The searches further down are fitted on X and y,
# not on these arrays -- confirm whether the projection is meant to be used.
pca = PCA(n_components=100)
X_train=pca.fit_transform(X_train)
X_test=pca.transform(X_test)

In [23]:
# Metric names passed as `scoring` to the hyper-parameter searches below.
scoring = ['neg_log_loss', 'accuracy']

In [24]:
# Base random forest handed to the grid search; seeded so repeated runs match.
rfc = RandomForestClassifier(n_estimators=50, random_state=42)

In [31]:
# Random-forest search space: 2 * 4 * 3 * 2 * 1 * 1 = 48 candidates.
rfc_params = dict(
    bootstrap=[True, False],
    max_depth=np.arange(50, 90, 10),
    min_samples_split=[3, 5, 7],
    min_samples_leaf=[2, 4],
    max_features=['sqrt'],
    n_estimators=[50],
)

# Exhaustive 4-fold search scored on every metric in `scoring`; the best
# candidate by accuracy is refitted at the end.
rfc_GS = GridSearchCV(
    estimator=rfc,
    param_grid=rfc_params,
    scoring=scoring,
    cv=4,
    refit='accuracy',
    verbose=1,
    n_jobs=-1,
)

In [32]:
# Run the grid search and time it; the final expression displays the elapsed time.
# NOTE(review): the search is fitted on the full X / y, not on the train split or the
# PCA-projected arrays, so X_test is not a held-out set for this model -- confirm intent.
start = datetime.now()
rfc_GS.fit(X, y)
end = datetime.now()
str(end-start)

Fitting 4 folds for each of 48 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 10.3min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed: 62.8min finished


'1:03:48.991848'

In [33]:
# List the columns recorded in the grid search's cv_results_.
pd.DataFrame(rfc_GS.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_bootstrap', 'param_max_depth', 'param_max_features',
       'param_min_samples_leaf', 'param_min_samples_split',
       'param_n_estimators', 'params', 'split0_test_neg_log_loss',
       'split1_test_neg_log_loss', 'split2_test_neg_log_loss',
       'split3_test_neg_log_loss', 'mean_test_neg_log_loss',
       'std_test_neg_log_loss', 'rank_test_neg_log_loss',
       'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy',
       'split3_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy',
       'rank_test_accuracy'],
      dtype='object')

In [34]:
# Best parameter combination and its mean cross-validated score for the refit
# metric (accuracy, per refit='accuracy' in the search definition).
rfc_GS.best_params_, rfc_GS.best_score_

({'bootstrap': False,
  'max_depth': 80,
  'max_features': 'sqrt',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 50},
 0.6957689654451575)

In [35]:
# Full cross-validation results as a frame, plus the subset of columns
# (ranks, mean scores, tuned parameters) worth showing in a summary.
df_rfc = pd.DataFrame(rfc_GS.cv_results_)
cols = [
    'rank_test_neg_log_loss',
    'mean_test_neg_log_loss',
    'rank_test_accuracy',
    'mean_test_accuracy',
    'param_bootstrap',
    'param_min_samples_split',
    'param_min_samples_leaf',
    'param_max_features',
    'param_max_depth',
]

In [36]:
# Show the candidates ranked 15th or better by mean test accuracy, limited to the
# summary columns in `cols`.
df_rfc.loc[df_rfc.rank_test_accuracy <= 15, cols]

Unnamed: 0,rank_test_neg_log_loss,mean_test_neg_log_loss,rank_test_accuracy,mean_test_accuracy,param_bootstrap,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth
12,13,-1.057193,15,0.68979,True,3,2,sqrt,70
18,10,-1.048834,13,0.690188,True,3,2,sqrt,80
19,11,-1.051243,12,0.690386,True,5,2,sqrt,80
24,27,-1.080362,11,0.690741,False,3,2,sqrt,50
25,26,-1.079359,13,0.690188,False,5,2,sqrt,50
26,22,-1.074975,10,0.690898,False,7,2,sqrt,50
30,9,-1.046993,8,0.69232,False,3,2,sqrt,60
31,7,-1.042066,7,0.69302,False,5,2,sqrt,60
32,8,-1.045168,9,0.692288,False,7,2,sqrt,60
36,5,-1.028702,3,0.695267,False,3,2,sqrt,70


In [37]:
# Base gradient-boosting classifier handed to the randomized search; seeded so
# repeated runs match.
xgbc = XGBClassifier(n_estimators=50, random_state=42)

In [40]:
# XGBoost search space; every entry is a discrete list of values to sample from.
xgb_params = dict(
    objective=['multi:softmax'],
    min_child_weight=[3, 5, 7],
    gamma=[2, 4, 6],
    subsample=[0.6, 0.8],
    colsample_bytree=[0.6, 0.8],
    max_depth=[8, 10, 12],
)

# Randomized 4-fold search over 50 sampled candidates, scored on every metric in
# `scoring`; the best candidate by accuracy is refitted at the end.
xgbc_RS = RandomizedSearchCV(
    estimator=xgbc,
    param_distributions=xgb_params,
    scoring=scoring,
    cv=4,
    n_iter=50,
    refit='accuracy',
    return_train_score=True,
    random_state=42,
    n_jobs=-1,
)

In [41]:
# Run the randomized search and time it; the final expression displays the elapsed time.
# NOTE(review): the search is fitted on the full X / y, not on the train split or the
# PCA-projected arrays, so X_test is not a held-out set for this model -- confirm intent.
start = datetime.now()
xgbc_RS.fit(X, y)
end = datetime.now()
str(end-start)

KeyboardInterrupt: 

In [None]:
# List the columns recorded in the randomized search's cv_results_
# (available only once the fit above has run to completion).
pd.DataFrame(xgbc_RS.cv_results_).columns

In [None]:
# Best parameter combination and its mean cross-validated score for the refit
# metric (accuracy, per refit='accuracy' in the search definition).
xgbc_RS.best_params_, xgbc_RS.best_score_

In [None]:
# Summary columns for the XGBoost search. cv_results_ exposes one 'param_<name>'
# column per searched parameter, so the names must match the keys of xgb_params
# (min_child_weight, gamma, subsample, colsample_bytree, max_depth). The
# random-forest names (min_samples_split, min_samples_leaf, max_features) do not
# exist in these results and would raise a KeyError when selected.
cols = [
    'rank_test_neg_log_loss',
    'mean_test_neg_log_loss',
    'rank_test_accuracy',
    'mean_test_accuracy',
    'param_min_child_weight',
    'param_gamma',
    'param_subsample',
    'param_colsample_bytree',
    'param_max_depth',
]
df_xgbc = pd.DataFrame(xgbc_RS.cv_results_)

In [None]:
# Show the candidates ranked 12th or better by mean test accuracy, limited to the
# summary columns in `cols`.
df_xgbc.loc[df_xgbc.rank_test_accuracy <= 12, cols]