In [1]:
import pandas as pd
import numpy as np
import calendar
import time
import re 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, classification_report
%matplotlib inline

In [2]:
data_train = pd.read_csv("train.csv").replace("MENS WEAR","MENSWEAR")
data_train_orig = pd.read_csv("train.csv").replace("MENS WEAR","MENSWEAR")
data_test = pd.read_csv("test.csv").replace("MENS WEAR","MENSWEAR")

In [3]:
dept_list = sorted(list(data_train.DepartmentDescription.dropna().unique()))

In [4]:
weekdays = list(calendar.day_name)
dept_list_sum = dict.fromkeys(dept_list, np.sum)
weekday_dict = dict.fromkeys(weekdays, np.max)
feature_dict = {"TripType": np.max, 'NumItems': np.sum, 'Return': np.max}
feature_dict = {**feature_dict, **weekday_dict, **dept_list_sum}

In [5]:
def transform_data(data):
    dummies = pd.get_dummies(data.Weekday)
    data[dummies.columns] = dummies
    
    dummies = pd.get_dummies(data.DepartmentDescription)
    dummies = dummies.apply(lambda x: x*data["ScanCount"])
    data[dummies.columns] = dummies 

    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    
    data = data.rename(columns={"ScanCount":"NumItems"})
    
    grouped = data.groupby("VisitNumber")
    grouped = grouped.aggregate(feature_dict)
    data = grouped[["TripType", "NumItems", "Return"] + weekdays + dept_list]

    return data

In [6]:
data_new = transform_data(data_train)

In [7]:
def add_category_counts(data):
    alist = []
    for array in np.asarray(data.loc[:, dept_list[0]:]):
        count = 0
        count = sum(x > 0 for x in array)
        alist.append(count)
    cat_counts = pd.DataFrame(alist)
    cat_counts = cat_counts.rename(columns={0:"CategoryCount"})
    cat_counts = cat_counts.set_index(data.index)
    data.insert(3, 'CategoryCounts', cat_counts)
    return data

In [8]:
data_new_cat = add_category_counts(data_new)

In [9]:
X = data_new_cat.drop('TripType', axis=1)
y = data_new_cat.TripType
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
from xgboost import XGBClassifier

In [16]:
xgbc = XGBClassifier(n_estimators=50, random_state=42)
start = time.time()
xgbc = xgbc.fit(X_train, y_train)
predictions = xgbc.predict(X_test)
end = time.time()
end - start

666.2241320610046

In [17]:
accuracy_score(y_test, predictions)

0.6721597045604989

In [18]:
xgbc = XGBClassifier(n_estimators=200, random_state=42)
start = time.time()
xgbc = xgbc.fit(X_train, y_train)
predictions = xgbc.predict(X_test)
end = time.time()
end - start

2909.4840416908264

In [19]:
accuracy_score(y_test, predictions)

0.6982893774169947

In [20]:
scoring = ['neg_log_loss', 'accuracy']
params = {'objective':['multi:softmax'], 'n_estimators': [100], 'min_child_weight': [1, 5, 10], 'gamma': [0, 2, 4, 6, 8], 
          'subsample': [0.6, 0.8, 1.0], 'colsample_bytree': [0.6, 0.8, 1.0], 'max_depth': [5, 7, 9, 11]}

RS = RandomizedSearchCV(xgbc, param_distributions = params, scoring=scoring, cv = 4, n_iter = 100, 
                        refit='accuracy', return_train_score=True, random_state=42, n_jobs = -1)

In [None]:
start = time.time()
RS.fit(X, y)
end = time.time()
end - start

In [37]:
pd.DataFrame(RS.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_n_estimators', 'param_min_samples_split',
       'param_min_samples_leaf', 'param_max_features', 'param_max_depth',
       'param_bootstrap', 'params', 'split0_test_neg_log_loss',
       'split1_test_neg_log_loss', 'split2_test_neg_log_loss',
       'split3_test_neg_log_loss', 'mean_test_neg_log_loss',
       'std_test_neg_log_loss', 'rank_test_neg_log_loss',
       'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy',
       'split3_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy',
       'rank_test_accuracy'],
      dtype='object')

In [38]:
RS.best_params_

{'n_estimators': 100,
 'min_samples_split': 3,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 50,
 'bootstrap': False}

In [39]:
RS.best_score_

0.6832263728912766

In [46]:
cols = ['rank_test_neg_log_loss', 'mean_test_neg_log_loss', 'rank_test_accuracy', 'mean_test_accuracy', 'param_bootstrap', 'param_min_samples_split', 'param_min_samples_leaf', 'param_max_features', 'param_max_depth', ]
df = pd.DataFrame(RS.cv_results_)

In [49]:
df.loc[df.rank_test_accuracy <= 12, cols]

Unnamed: 0,rank_test_neg_log_loss,mean_test_neg_log_loss,rank_test_accuracy,mean_test_accuracy,param_bootstrap,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth
9,8,-1.019521,5,0.683028,False,3,2,auto,70
13,8,-1.019521,5,0.683028,False,3,2,sqrt,70
36,13,-1.023998,1,0.683226,False,3,2,sqrt,50
41,4,-1.017265,10,0.68262,False,5,2,auto,60
47,10,-1.019561,3,0.683111,False,7,2,auto,60
48,3,-1.016811,8,0.682756,False,7,2,auto,70
49,14,-1.024496,9,0.682735,False,5,2,sqrt,50
52,1,-1.014689,12,0.682495,False,5,2,auto,90
60,10,-1.019561,3,0.683111,False,7,2,sqrt,60
63,2,-1.016636,11,0.682516,False,7,2,auto,80


In [53]:
params = {'bootstrap': [False], 'max_depth': np.arange(30, 90, 10), 'min_samples_split': [3, 5, 7, 9],  
          'min_samples_leaf': [2], 'max_features': ['log2', 'sqrt'], 'n_estimators': [100]}
RS_new = RandomizedSearchCV(rfc, param_distributions = params, scoring=scoring, cv = 4, 
                        n_iter = 30, refit='accuracy', return_train_score=True, random_state=42, n_jobs = -1)