In [1]:
import pandas as pd
import numpy as np
import calendar
import time
import re 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import log_loss, accuracy_score, classification_report
%matplotlib inline

In [2]:
# Load the raw CSVs. `.replace` rewrites every cell whose value is exactly
# "MENS WEAR" to "MENSWEAR" (presumably two spellings of the same department
# -- confirm against the data).
data_train = pd.read_csv("train.csv").replace("MENS WEAR", "MENSWEAR")
# Independent snapshot of the cleaned training frame. Copying the frame that
# was just loaded avoids parsing and cleaning train.csv a second time.
data_train_orig = data_train.copy()
data_test = pd.read_csv("test.csv").replace("MENS WEAR", "MENSWEAR")

In [3]:
# Alphabetically sorted list of the distinct, non-missing department names
# found in the training data.
dept_list = sorted(data_train["DepartmentDescription"].dropna().unique())

In [4]:
# Aggregation spec used when collapsing line items into one row per visit.
# Reductions are given as the string names "sum" / "max" rather than the NumPy
# callables: pandas maps np.sum / np.max to these same built-in reductions,
# but recent pandas versions emit a deprecation warning for the callable form.
weekdays = list(calendar.day_name)
dept_list_sum = dict.fromkeys(dept_list, "sum")   # per-department columns are summed
weekday_dict = dict.fromkeys(weekdays, "max")     # weekday indicator columns take the max
feature_dict = {
    "TripType": "max",
    'NumItems': "sum",
    'Return': "max",
    **weekday_dict,
    **dept_list_sum,
}

In [5]:
def transform_data(data):
    """Collapse line-item rows into one feature row per ``VisitNumber``.

    On a private copy of ``data`` the function adds:

    * one indicator column per name in the notebook-level ``weekdays`` list,
      derived from ``Weekday``;
    * one column per name in the notebook-level ``dept_list`` holding the
      row's ``ScanCount`` when ``DepartmentDescription`` matches that name and
      0 otherwise;
    * ``Return``: 1 where ``ScanCount`` is negative, 0 elsewhere;
    * ``NumItems``: ``ScanCount`` renamed.

    Rows are then grouped by ``VisitNumber`` and reduced with the functions in
    the notebook-level ``feature_dict``.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns ``VisitNumber``, ``Weekday``,
        ``DepartmentDescription`` and ``ScanCount``. ``TripType`` is used when
        present and skipped when absent.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``VisitNumber``; columns ordered as ``TripType`` (if
        available), ``NumItems``, ``Return``, the weekday columns, then the
        department columns.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    # Reindexing to the full name lists guarantees every expected column
    # exists (zero-filled) even if this frame lacks some weekday/department.
    weekday_dummies = pd.get_dummies(data.Weekday).reindex(columns=weekdays, fill_value=0)
    data[weekday_dummies.columns] = weekday_dummies

    dept_dummies = pd.get_dummies(data.DepartmentDescription).reindex(columns=dept_list, fill_value=0)
    # Row-wise scaling: each indicator becomes the row's ScanCount (or 0).
    dept_dummies = dept_dummies.mul(data["ScanCount"], axis=0)
    data[dept_dummies.columns] = dept_dummies

    # Flag rows with a negative scan count, then fill every other row with 0.
    data.loc[data.ScanCount < 0, 'Return'] = 1
    data.loc[data.Return != 1, 'Return'] = 0

    data = data.rename(columns={"ScanCount": "NumItems"})

    # Only aggregate columns that actually exist in this frame.
    aggregations = {col: func for col, func in feature_dict.items() if col in data.columns}
    grouped = data.groupby("VisitNumber").aggregate(aggregations)

    ordered_columns = ["TripType", "NumItems", "Return"] + weekdays + dept_list
    return grouped[[col for col in ordered_columns if col in grouped.columns]]

In [6]:
# Per-visit feature table built from the training line items.
data_new = transform_data(data=data_train)

In [7]:
def add_category_counts(data, first_dept_col=None):
    """Return a copy of ``data`` with a ``CategoryCounts`` column at position 3.

    ``CategoryCounts`` is, for each row, the number of columns from
    ``first_dept_col`` through the last column whose value is greater than 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with at least three columns ahead of the insertion point.
        It is not modified.
    first_dept_col : label, optional
        First column of the block to count over. Defaults to the first entry
        of the notebook-level ``dept_list``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index as ``data`` and the extra column.
    """
    if first_dept_col is None:
        first_dept_col = dept_list[0]

    # Vectorised count of strictly positive entries per row.
    category_counts = data.loc[:, first_dept_col:].gt(0).sum(axis=1)

    # Insert into a copy: the caller's frame stays untouched, so calling this
    # again on the same input does not fail with "already exists".
    result = data.copy()
    result.insert(3, 'CategoryCounts', category_counts)
    return result

In [8]:
# Feature table extended with the per-visit CategoryCounts column.
data_new_cat = add_category_counts(data=data_new)

In [9]:
# Features are every column except the target; the target is TripType.
X = data_new_cat.drop(columns='TripType')
y = data_new_cat['TripType']

# 75 % / 25 % train / hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with the library's default solver, timed.
lr = LogisticRegression(
    multi_class='auto',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



80.38047170639038

In [11]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.64601362933233

In [12]:
# Same model with the 'saga' solver, timed.
lr = LogisticRegression(
    solver='saga',
    multi_class='auto',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



505.65342354774475

In [13]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6517831013002215

In [15]:
# Same model with the 'sag' solver, timed.
lr = LogisticRegression(
    solver='sag',
    multi_class='auto',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



145.31299686431885

In [16]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6641582005936703

In [17]:
# Same model with the 'newton-cg' solver, timed.
lr = LogisticRegression(
    solver='newton-cg',
    multi_class='auto',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output

1669.1244790554047

In [18]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.673314101760107

In [20]:
# Default solver with an explicit one-vs-rest ('ovr') multiclass scheme, timed.
lr = LogisticRegression(
    multi_class='ovr',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



59.87126278877258

In [21]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.64601362933233

In [24]:
# 'lbfgs' solver with an explicit multinomial multiclass scheme, timed.
lr = LogisticRegression(
    solver='lbfgs',
    multi_class='multinomial',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



34.84452986717224

In [25]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6685480162214139

In [28]:
# 'sag' solver with an explicit multinomial multiclass scheme, timed.
lr = LogisticRegression(
    solver='sag',
    multi_class='multinomial',
    random_state=42,
)
start = time.time()
lr.fit(X_train, y_train)  # fit() hands back the estimator itself, so `lr` needs no rebinding
time.time() - start  # elapsed wall-clock seconds, shown as the cell output



145.40098690986633

In [29]:
# Hold-out accuracy of the current `lr`.
predictions = lr.predict(X_test)
accuracy_score(y_true=y_test, y_pred=predictions)

0.6641582005936703

In [14]:
# Metric names collected in `scoring`, plus a fresh, unfitted 'lbfgs' model.
scoring = [
    'neg_log_loss',
    'accuracy',
]
lr = LogisticRegression(
    solver='lbfgs',
    multi_class='auto',
    random_state=42,
)