In [1]:
import pandas as pd
import numpy as np
import calendar
import time
import re 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, classification_report
%matplotlib inline

In [2]:
data_train = pd.read_csv("train.csv").replace("MENS WEAR","MENSWEAR")
data_train_orig = pd.read_csv("train.csv").replace("MENS WEAR","MENSWEAR")
data_test = pd.read_csv("test.csv").replace("MENS WEAR","MENSWEAR")

In [3]:
dept_list = sorted(list(data_train.DepartmentDescription.dropna().unique()))
dept_list

['1-HR PHOTO',
 'ACCESSORIES',
 'AUTOMOTIVE',
 'BAKERY',
 'BATH AND SHOWER',
 'BEAUTY',
 'BEDDING',
 'BOOKS AND MAGAZINES',
 'BOYS WEAR',
 'BRAS & SHAPEWEAR',
 'CAMERAS AND SUPPLIES',
 'CANDY, TOBACCO, COOKIES',
 'CELEBRATION',
 'COMM BREAD',
 'CONCEPT STORES',
 'COOK AND DINE',
 'DAIRY',
 'DSD GROCERY',
 'ELECTRONICS',
 'FABRICS AND CRAFTS',
 'FINANCIAL SERVICES',
 'FROZEN FOODS',
 'FURNITURE',
 'GIRLS WEAR, 4-6X  AND 7-14',
 'GROCERY DRY GOODS',
 'HARDWARE',
 'HEALTH AND BEAUTY AIDS',
 'HOME DECOR',
 'HOME MANAGEMENT',
 'HORTICULTURE AND ACCESS',
 'HOUSEHOLD CHEMICALS/SUPP',
 'HOUSEHOLD PAPER GOODS',
 'IMPULSE MERCHANDISE',
 'INFANT APPAREL',
 'INFANT CONSUMABLE HARDLINES',
 'JEWELRY AND SUNGLASSES',
 'LADIES SOCKS',
 'LADIESWEAR',
 'LARGE HOUSEHOLD GOODS',
 'LAWN AND GARDEN',
 'LIQUOR,WINE,BEER',
 'MEAT - FRESH & FROZEN',
 'MEDIA AND GAMING',
 'MENSWEAR',
 'OFFICE SUPPLIES',
 'OPTICAL - FRAMES',
 'OPTICAL - LENSES',
 'OTHER DEPARTMENTS',
 'PAINT AND ACCESSORIES',
 'PERSONAL CARE',
 

In [4]:
weekdays = list(calendar.day_name)
dept_list_sum = dict.fromkeys(dept_list, np.sum)
weekday_dict = dict.fromkeys(weekdays, np.max)
feature_dict = {"TripType": np.max, 'NumItems': np.sum, 'Return': np.max}
feature_dict = {**feature_dict, **weekday_dict, **dept_list_sum}

In [5]:
def transform_data(data):
    dummies = pd.get_dummies(data.Weekday)
    data[dummies.columns] = dummies
    
    dummies = pd.get_dummies(data.DepartmentDescription)
    dummies = dummies.apply(lambda x: x*data["ScanCount"])
    data[dummies.columns] = dummies 

    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0
    
    data = data.rename(columns={"ScanCount":"NumItems"})
    
    grouped = data.groupby("VisitNumber")
    grouped = grouped.aggregate(feature_dict)
    data = grouped[["TripType", "NumItems", "Return"] + weekdays + dept_list]

    return data

In [6]:
data_new = transform_data(data_train)

In [7]:
def add_category_counts(data):
    alist = []
    for array in np.asarray(data.loc[:, dept_list[0]:]):
        count = 0
        count = sum(x > 0 for x in array)
        alist.append(count)
    cat_counts = pd.DataFrame(alist)
    cat_counts = cat_counts.rename(columns={0:"CategoryCount"})
    cat_counts = cat_counts.set_index(data.index)
    data.insert(3, 'CategoryCounts', cat_counts)
    return data

In [8]:
data_new_cat = add_category_counts(data_new)

In [9]:
X = data_new_cat.drop('TripType', axis=1)
y = data_new_cat.TripType
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr = lr.fit(X_train, y_train)
predictions = lr.predict(X_test)



In [11]:
accuracy_score(y_test,predictions )

0.6456816360659164

In [12]:
scoring = ['neg_log_loss', 'accuracy']
lr = LogisticRegression(solver='lbfgs', multi_class='auto', random_state=42)

In [13]:
#start = time.time()
#cv_results = cross_validate(lr, X, y, cv=4, scoring = scoring, return_train_score=True)
#time.time() - start

In [14]:
#-cv_results['train_neg_log_loss'].mean()

In [15]:
#-cv_results['test_neg_log_loss'].mean()

In [16]:
#cv_results['train_accuracy'].mean()

In [17]:
#cv_results['test_accuracy'].mean()

In [18]:
#cv_results['fit_time']

In [19]:
# parameters = {'solver': ('lbfgs', 'saga'), 'multi_class': ('multinomial', 'ovr')}
# parameters = {'C': np.linspace(0.1, 1, 10), 'penalty': ['l1', 'l2'], 'multi_class': ['multinomial', 'ovr']}
# parameters = {'C': np.linspace(0.1, 1, 10), 'multi_class': ['multinomial', 'ovr']}
parameters = {'C': np.linspace(0.1, 1, 10)}
start = time.time()
GS = GridSearchCV(lr, param_grid=parameters, cv=4, scoring=scoring, return_train_score=True, refit='accuracy')
GS.fit(X, y)
time.time() - start



1939.2145881652832

In [20]:
GS.best_params_

{'C': 0.9}

In [21]:
GS.best_score_

0.6629178251144512

In [24]:
pd.DataFrame(GS.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,params,split0_test_neg_log_loss,split1_test_neg_log_loss,split2_test_neg_log_loss,split3_test_neg_log_loss,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,37.532123,1.365362,0.223613,0.05989,0.1,{'C': 0.1},-1.204309,-1.210361,-1.233225,-1.176139,...,0.667615,0.66089,0.004159,8,0.667373,0.659958,0.670778,0.663485,0.665399,0.004065
1,47.212993,8.016617,0.29282,0.019015,0.2,{'C': 0.2},-1.20031,-1.20373,-1.235259,-1.177457,...,0.666987,0.661423,0.004185,7,0.670203,0.663847,0.667531,0.662398,0.665995,0.003066
2,50.721325,0.406508,0.286326,0.03429,0.3,{'C': 0.30000000000000004},-1.201911,-1.205304,-1.235411,-1.177338,...,0.667824,0.661747,0.003612,4,0.667136,0.665254,0.67075,0.664293,0.666858,0.002468
3,37.765058,2.357865,0.209871,0.036766,0.4,{'C': 0.4},-1.203351,-1.199225,-1.235082,-1.173907,...,0.669204,0.662531,0.004619,2,0.667373,0.669519,0.669523,0.665422,0.667959,0.001707
4,48.286261,5.77994,0.290074,0.024847,0.5,{'C': 0.5},-1.203006,-1.206505,-1.233695,-1.172053,...,0.667657,0.661455,0.003794,6,0.668892,0.66145,0.670457,0.665352,0.666538,0.003471
5,49.490021,1.198144,0.270835,0.022701,0.6,{'C': 0.6},-1.202274,-1.204189,-1.233181,-1.176998,...,0.668033,0.661977,0.003717,3,0.668349,0.663958,0.670248,0.663527,0.66652,0.002862
6,42.906258,9.138255,0.280578,0.105964,0.7,{'C': 0.7000000000000001},-1.202378,-1.204696,-1.233428,-1.174774,...,0.66523,0.661496,0.002566,5,0.668516,0.664307,0.670206,0.662064,0.666273,0.003244
7,37.590307,3.411571,0.201627,0.041408,0.8,{'C': 0.8},-1.203656,-1.204036,-1.236713,-1.173657,...,0.669288,0.660817,0.005278,9,0.66577,0.665742,0.66774,0.665854,0.666276,0.000846
8,55.063854,4.378503,0.294819,0.028766,0.9,{'C': 0.9},-1.203159,-1.202269,-1.236531,-1.17397,...,0.669121,0.662918,0.003884,1,0.669464,0.664502,0.671892,0.665882,0.667935,0.002916
9,51.718696,1.533384,0.305313,0.038838,1.0,{'C': 1.0},-1.203374,-1.202712,-1.234135,-1.179123,...,0.663599,0.660786,0.002524,10,0.66977,0.665031,0.669872,0.660935,0.666402,0.003713


In [26]:
pd.DataFrame(GS.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'params', 'split0_test_neg_log_loss',
       'split1_test_neg_log_loss', 'split2_test_neg_log_loss',
       'split3_test_neg_log_loss', 'mean_test_neg_log_loss',
       'std_test_neg_log_loss', 'rank_test_neg_log_loss',
       'split0_train_neg_log_loss', 'split1_train_neg_log_loss',
       'split2_train_neg_log_loss', 'split3_train_neg_log_loss',
       'mean_train_neg_log_loss', 'std_train_neg_log_loss',
       'split0_test_accuracy', 'split1_test_accuracy', 'split2_test_accuracy',
       'split3_test_accuracy', 'mean_test_accuracy', 'std_test_accuracy',
       'rank_test_accuracy', 'split0_train_accuracy', 'split1_train_accuracy',
       'split2_train_accuracy', 'split3_train_accuracy', 'mean_train_accuracy',
       'std_train_accuracy'],
      dtype='object')

In [35]:
cols = ['param_C', 'rank_test_neg_log_loss', 'mean_test_neg_log_loss', 'mean_train_neg_log_loss', 'rank_test_accuracy', 'mean_test_accuracy', 'mean_train_accuracy']
pd.DataFrame(GS.cv_results_)[cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
0,0.1,10,-1.206011,-1.178054,8,0.66089,0.665399
1,0.2,6,-1.20419,-1.174772,7,0.661423,0.665995
2,0.3,9,-1.204993,-1.176241,4,0.661747,0.666858
3,0.4,1,-1.202894,-1.173945,2,0.662531,0.667959
4,0.5,2,-1.203817,-1.174358,6,0.661455,0.666538
5,0.6,5,-1.204162,-1.175089,3,0.661977,0.66652
6,0.7,3,-1.203821,-1.175125,5,0.661496,0.666273
7,0.8,7,-1.204518,-1.175653,9,0.660817,0.666276
8,0.9,4,-1.203984,-1.174717,1,0.662918,0.667935
9,1.0,8,-1.204838,-1.175924,10,0.660786,0.666402


In [37]:
lr = LogisticRegression(solver='sag', multi_class='auto', random_state=42)
parameters = {'C': np.linspace(0.1, 1, 10)}
start = time.time()
GS = GridSearchCV(lr, param_grid=parameters, cv=4, scoring=scoring, return_train_score=True, refit='accuracy')
GS.fit(X, y)
time.time() - start





6139.587750196457

In [38]:
pd.DataFrame(GS.cv_results_)[cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
0,0.1,10,-1.288982,-1.270373,10,0.649644,0.652703
1,0.2,9,-1.286837,-1.268037,9,0.649779,0.652947
2,0.3,8,-1.286134,-1.267269,8,0.649999,0.653009
3,0.4,7,-1.285785,-1.266887,7,0.650093,0.653055
4,0.5,6,-1.285577,-1.266658,5,0.650135,0.6531
5,0.6,5,-1.285438,-1.266507,4,0.650156,0.653159
6,0.7,4,-1.285339,-1.266398,5,0.650135,0.653166
7,0.8,3,-1.285265,-1.266317,3,0.650166,0.653173
8,0.9,2,-1.285208,-1.266254,1,0.650198,0.653166
9,1.0,1,-1.285162,-1.266203,1,0.650198,0.653177


In [39]:
parameters = {'solver': ('lbfgs', 'sag'), 'C': np.linspace(0.1, 1, 10), 'multi_class': ('multinomial', 'ovr')}
lr = LogisticRegression(random_state=42)
start = time.time()
GS = GridSearchCV(lr, param_grid=parameters, cv=4, scoring=scoring, return_train_score=True, refit='accuracy')
GS.fit(X, y)
time.time() - start











































































39571.43832445145

In [40]:
pd.DataFrame(GS.cv_results_).columns

Index(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time',
       'param_C', 'param_multi_class', 'param_solver', 'params',
       'split0_test_neg_log_loss', 'split1_test_neg_log_loss',
       'split2_test_neg_log_loss', 'split3_test_neg_log_loss',
       'mean_test_neg_log_loss', 'std_test_neg_log_loss',
       'rank_test_neg_log_loss', 'split0_train_neg_log_loss',
       'split1_train_neg_log_loss', 'split2_train_neg_log_loss',
       'split3_train_neg_log_loss', 'mean_train_neg_log_loss',
       'std_train_neg_log_loss', 'split0_test_accuracy',
       'split1_test_accuracy', 'split2_test_accuracy', 'split3_test_accuracy',
       'mean_test_accuracy', 'std_test_accuracy', 'rank_test_accuracy',
       'split0_train_accuracy', 'split1_train_accuracy',
       'split2_train_accuracy', 'split3_train_accuracy', 'mean_train_accuracy',
       'std_train_accuracy'],
      dtype='object')

In [41]:
pd.DataFrame(GS.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_multi_class,param_solver,params,split0_test_neg_log_loss,split1_test_neg_log_loss,...,split3_test_accuracy,mean_test_accuracy,std_test_accuracy,rank_test_accuracy,split0_train_accuracy,split1_train_accuracy,split2_train_accuracy,split3_train_accuracy,mean_train_accuracy,std_train_accuracy
0,36.860616,3.65829,0.231109,0.044897,0.1,multinomial,lbfgs,"{'C': 0.1, 'multi_class': 'multinomial', 'solv...",-1.204309,-1.210361,...,0.667615,0.66089,0.004159,8,0.667373,0.659958,0.670778,0.663485,0.665399,0.004065
1,247.346068,27.971883,0.352532,0.102461,0.1,multinomial,sag,"{'C': 0.1, 'multi_class': 'multinomial', 'solv...",-1.317609,-1.201652,...,0.649586,0.649644,0.008161,20,0.648374,0.666648,0.650362,0.645427,0.652703,0.008241
2,103.998948,7.554887,0.322054,0.042824,0.1,ovr,lbfgs,"{'C': 0.1, 'multi_class': 'ovr', 'solver': 'lb...",-1.299485,-1.293434,...,0.643939,0.638,0.004143,30,0.641864,0.642607,0.646739,0.639798,0.642752,0.002522
3,582.348559,141.232788,0.229358,0.03953,0.1,ovr,sag,"{'C': 0.1, 'multi_class': 'ovr', 'solver': 'sag'}",-1.510189,-1.341826,...,0.629591,0.626816,0.005891,40,0.627758,0.635458,0.630839,0.62493,0.629746,0.003904
4,50.514145,9.107729,0.274582,0.058821,0.2,multinomial,lbfgs,"{'C': 0.2, 'multi_class': 'multinomial', 'solv...",-1.20031,-1.20373,...,0.666987,0.661423,0.004185,7,0.670203,0.663847,0.667531,0.662398,0.665995,0.003066
5,176.913669,36.107597,0.304562,0.05894,0.2,multinomial,sag,"{'C': 0.2, 'multi_class': 'multinomial', 'solv...",-1.315798,-1.198595,...,0.649544,0.649779,0.008293,19,0.648569,0.667317,0.65046,0.645441,0.652947,0.008488
6,114.744848,5.695653,0.350535,0.076501,0.2,ovr,lbfgs,"{'C': 0.2, 'multi_class': 'ovr', 'solver': 'lb...",-1.291534,-1.284402,...,0.645152,0.63986,0.003555,29,0.643718,0.644363,0.649373,0.641665,0.64478,0.002833
7,570.287824,117.29441,0.247598,0.077083,0.2,ovr,sag,"{'C': 0.2, 'multi_class': 'ovr', 'solver': 'sag'}",-1.50787,-1.3369,...,0.629758,0.62714,0.005885,39,0.627786,0.636141,0.630853,0.625042,0.629955,0.00412
8,44.114618,6.591143,0.246349,0.041499,0.3,multinomial,lbfgs,"{'C': 0.30000000000000004, 'multi_class': 'mul...",-1.201911,-1.205304,...,0.667824,0.661747,0.003612,4,0.667136,0.665254,0.67075,0.664293,0.666858,0.002468
9,138.472776,29.028206,0.213867,0.046318,0.3,multinomial,sag,"{'C': 0.30000000000000004, 'multi_class': 'mul...",-1.315197,-1.197616,...,0.649544,0.649999,0.008395,18,0.648541,0.667512,0.650585,0.645399,0.653009,0.008574


In [44]:
df = pd.DataFrame(GS.cv_results_)
df.loc[(df.param_multi_class == 'multinomial') & (df.param_solver == 'lbfgs'), cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
0,0.1,10,-1.206011,-1.178054,8,0.66089,0.665399
4,0.2,6,-1.20419,-1.174772,7,0.661423,0.665995
8,0.3,9,-1.204993,-1.176241,4,0.661747,0.666858
12,0.4,1,-1.202894,-1.173945,2,0.662531,0.667959
16,0.5,2,-1.203817,-1.174358,6,0.661455,0.666538
20,0.6,5,-1.204162,-1.175089,3,0.661977,0.66652
24,0.7,3,-1.203821,-1.175125,5,0.661496,0.666273
28,0.8,7,-1.204518,-1.175653,9,0.660817,0.666276
32,0.9,4,-1.203984,-1.174717,1,0.662918,0.667935
36,1.0,8,-1.204838,-1.175924,10,0.660786,0.666402


In [46]:
df.loc[(df.param_multi_class == 'multinomial') & (df.param_solver == 'sag'), cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
1,0.1,28,-1.288982,-1.270373,20,0.649644,0.652703
5,0.2,25,-1.286837,-1.268037,19,0.649779,0.652947
9,0.3,22,-1.286134,-1.267269,18,0.649999,0.653009
13,0.4,20,-1.285785,-1.266887,17,0.650093,0.653055
17,0.5,16,-1.285577,-1.266658,15,0.650135,0.6531
21,0.6,15,-1.285438,-1.266507,14,0.650156,0.653159
25,0.7,14,-1.285339,-1.266398,15,0.650135,0.653166
29,0.8,13,-1.285265,-1.266317,13,0.650166,0.653173
33,0.9,12,-1.285208,-1.266254,11,0.650198,0.653166
37,1.0,11,-1.285162,-1.266203,11,0.650198,0.653177


In [47]:
df.loc[(df.param_multi_class == 'ovr') & (df.param_solver == 'lbfgs'), cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
2,0.1,30,-1.299092,-1.270433,30,0.638,0.642752
6,0.2,29,-1.29098,-1.259083,29,0.63986,0.64478
10,0.3,27,-1.288124,-1.254735,28,0.640414,0.645337
14,0.4,26,-1.287191,-1.252756,27,0.640592,0.645473
18,0.5,24,-1.286557,-1.251378,26,0.640686,0.645853
22,0.6,23,-1.286172,-1.250503,25,0.64101,0.646198
26,0.7,21,-1.285933,-1.249689,22,0.641376,0.646337
30,0.8,19,-1.285745,-1.249212,24,0.641094,0.64617
34,0.9,17,-1.285621,-1.248781,21,0.641585,0.646626
38,1.0,18,-1.285684,-1.248556,23,0.641209,0.646341


In [48]:
df.loc[(df.param_multi_class == 'ovr') & (df.param_solver == 'sag'), cols]

Unnamed: 0,param_C,rank_test_neg_log_loss,mean_test_neg_log_loss,mean_train_neg_log_loss,rank_test_accuracy,mean_test_accuracy,mean_train_accuracy
3,0.1,40,-1.466899,-1.453996,40,0.626816,0.629746
7,0.2,39,-1.463899,-1.450883,39,0.62714,0.629955
11,0.3,38,-1.462901,-1.449846,38,0.627151,0.630039
15,0.4,37,-1.462402,-1.449327,35,0.627224,0.630074
19,0.5,36,-1.462103,-1.449017,34,0.627234,0.630116
23,0.6,35,-1.461904,-1.448809,37,0.627192,0.630133
27,0.7,34,-1.461762,-1.448661,35,0.627224,0.630119
31,0.8,33,-1.461655,-1.44855,33,0.627245,0.630119
35,0.9,32,-1.461572,-1.448464,31,0.627255,0.63013
39,1.0,31,-1.461506,-1.448395,31,0.627255,0.63013
