In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model as LM
from sklearn import preprocessing as prp
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

# Make NumPy raise FloatingPointError on division by zero instead of only warning.
np.seterr(divide = 'raise')
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## идея работы такая:
1) подгружаем данные из hdf файла

2) обучаем SGD classifier на имеющемся объёме данных

3) тестируем полученную модель:
    - оформляем всю процедуру от исходных сырых данных до конечного набора признаков (features) в удобный набор функций
    - случайно выбираем пользователя
    - получаем предсказания модели
    - оцениваем качество модели

In [2]:
# Load the product catalogue and its aisle / department lookup tables,
# each indexed by its own id column.
products = pd.read_csv('./data/products.csv', index_col='product_id')
aisles = pd.read_csv('./data/aisles.csv', index_col='aisle_id')
departments = pd.read_csv('./data/departments.csv', index_col='department_id')

# Aisle ids with an extra leading 0 entry (float array); preprocessData uses
# this list to size and label the one-hot aisle columns.
aislesList = np.concatenate([np.zeros(1), aisles.index.values])
departmentsList = departments.index.values

In [39]:
# Load the pre-computed feature table from HDF and renumber its rows 0..n-1.
# Raw string: the previous literal relied on unknown escapes such as "\F"
# being passed through unchanged; the resulting path is exactly the same.
# NOTE(review): absolute, machine-specific path - consider a configurable data dir.
trainFile = r"w:\Fedor\Kaggel\instacart\newFeatures2.hdf"
trainDf = pd.read_hdf(trainFile).reset_index(drop=True)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(trainDf.shape)

(788656, 6)


In [28]:
# Add relCount: each row's absCount as a fraction of the summed absCount of
# all rows that share the same userId.
trainDf = trainDf.assign(
    relCount=trainDf.groupby('userId')['absCount'].transform(
        lambda counts: counts / float(counts.sum())
    )
)

In [22]:
def preprocessData(inp):
    """Build the model feature frame from raw per-row product statistics.

    Joins `inp` with the module-level `products` table on `product_id`,
    adds a `decay` column (lastOrder / periodicity), replaces infinite and
    missing values anywhere in the frame with 0, one-hot encodes `aisle_id`
    into `aisle_<n>` columns (the `aisle_0` column is dropped) and removes
    the product descriptor columns.

    Parameters
    ----------
    inp : pandas.DataFrame
        Must provide `product_id`, `lastOrder` and `periodicity` columns.

    Returns
    -------
    pandas.DataFrame
        Feature frame with a fresh 0..n-1 index.
    """
    merged = inp.merge(products, left_on='product_id', right_index=True)
    withDecay = merged.assign(decay = merged.lastOrder/merged.periodicity)
    # replace() and fillna() act on the whole frame, not only on `decay`.
    cleaned = withDecay.replace([np.inf, -np.inf], 0).fillna(0).reset_index(drop=True)

    # One output column per entry of aislesList (including the leading 0).
    encoder = prp.OneHotEncoder(sparse = False,n_values=aislesList.shape[0])
    aisleIds = cleaned.loc[:,'aisle_id'].values.reshape(-1, 1)
    aisleColumns = ['aisle_'+str(int(x)) for x in aislesList]
    oneHot = pd.DataFrame(encoder.fit_transform(aisleIds), columns=aisleColumns)
    oneHot = oneHot.drop('aisle_0', axis = 1)

    # Both frames carry a 0..n-1 index here, so the index join is row-by-row.
    joined = cleaned.merge(oneHot, left_index=True, right_index=True)
    return joined.drop(['product_id','product_name','aisle_id','department_id'],axis = 1)

In [23]:
def calcf1Score(uGr):
    """Return the F1 score of the predictions within one group of rows.

    Parameters
    ----------
    uGr : pandas.DataFrame
        Rows with a `wasOrdered` column (truth) and a `wasPredicted` column
        (model output), both usable as 0/1 or boolean flags.

    Returns
    -------
    float
        2*P*R/(P+R), where P = hits/predicted and R = hits/ordered.
        P (resp. R) is taken as 0.0 when nothing was predicted (resp.
        ordered); the result is 0.0 when both P and R are 0.
    """
    hits = float((uGr.wasOrdered & uGr.wasPredicted).sum())
    nPredicted = uGr.wasPredicted.sum()
    nOrdered = uGr.wasOrdered.sum()

    precision = hits/nPredicted if nPredicted else 0.0
    recall = hits/nOrdered if nOrdered else 0.0

    # Guard the harmonic mean against a zero denominator.
    if not (precision or recall):
        return 0.0
    return 2*precision*recall/(precision+recall)

In [29]:
# Feature matrix (target column removed first) and the 0/1 target vector.
X = preprocessData(trainDf.drop('wasOrdered', axis=1))
y = trainDf['wasOrdered'].apply(lambda flag: int(bool(flag)))

In [31]:
def testModel(estimator,X,y,n_splits = 3):
    """Cross-validate `estimator` and print the mean per-user F1 of each fold.

    For every stratified fold the estimator is re-fitted on the training
    part (all columns of `X` except `userId`), predictions are made for the
    test part, and `calcf1Score` is averaged over the `userId` groups of the
    test part.

    Parameters
    ----------
    estimator : object
        Anything with `fit(X, y)` and `predict(X)`.
    X : pandas.DataFrame
        Feature frame; must contain a `userId` column.
    y : pandas.Series or array-like
        Targets, row-aligned with `X` by position.
    n_splits : int, optional
        Number of stratified folds (default 3).

    Returns
    -------
    pandas.DataFrame
        The test rows of all folds stacked together, with `wasOrdered`
        (truth) and `wasPredicted` (boolean prediction) columns appended.
    """
    skf = StratifiedKFold(n_splits = n_splits,random_state=None, shuffle=False)

    # skf.split() yields *positional* indices. Indexing a Series with them
    # (y[idx]) is label-based and only happens to work for a default
    # 0..n-1 index, so the targets are taken by position instead.
    yValues = np.asarray(y)

    # Collect the folds and concatenate once instead of growing a frame in the loop.
    foldFrames = []

    for train_index, test_index in skf.split(X, y):
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print('Fold')
        # split folds
        X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
        y_train, y_test = yValues[train_index], yValues[test_index]
        # fit and predict
        estimator.fit(X_train.drop('userId',axis = 1),y_train)
        y_pr = estimator.predict(X_test.drop('userId',axis = 1))

        # estimate quality; plain arrays are assigned row-by-row, so no index
        # alignment can shuffle or blank out the truth column
        fullSet = X_test.assign(wasOrdered = y_test,
                                wasPredicted = y_pr.astype(bool))
        foldFrames.append(fullSet)
        f1 = fullSet.groupby('userId').apply(calcf1Score)
        print(f1.mean())

    return pd.concat(foldFrames) if foldFrames else pd.DataFrame()

## SGD classifier

In [32]:
# Linear baseline. random_state is pinned (as for the tree models in this
# notebook) so that repeated runs produce the same fold scores.
# NOTE(review): the features are passed unscaled; SGD is sensitive to feature
# scale, which may be why an all-zero F1 was recorded - confirm.
classifier = LM.SGDClassifier(random_state = 42)
testModel(classifier,X,y)

Fold
0.0
Fold
0.0
Fold
0.0


Unnamed: 0,periodicity,absCount,lastOrder,userId,relCount,decay,aisle_1,aisle_2,aisle_3,aisle_4,...,aisle_127,aisle_128,aisle_129,aisle_130,aisle_131,aisle_132,aisle_133,aisle_134,wasOrdered,wasPredicted
0,19.555556,10,14.0,1,0.169492,0.715909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,False
1,14.000000,5,7.0,15,0.069444,0.500000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,False
2,27.500000,3,23.0,19,0.014706,0.836364,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
3,0.000000,1,252.0,21,0.004878,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
4,49.000000,2,22.0,31,0.006689,0.448980,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,False
5,28.000000,2,50.0,43,0.013605,1.785714,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
6,13.076923,14,69.0,52,0.082840,5.276471,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
7,9.611111,19,6.0,67,0.234568,0.624277,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,False
8,60.000000,2,67.0,81,0.074074,1.116667,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,False
9,25.125000,9,13.0,82,0.068702,0.517413,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,False


## Decision tree

In [70]:
# Baseline tree: default settings with a fixed seed.
tree = DecisionTreeClassifier(random_state=42)
testModel(estimator=tree, X=X, y=y)

Fold
0.0169252125303
Fold
0.0149870609112
Fold
0.0157426706127


In [71]:
# Tree with class_weight='balanced' and a fixed seed.
tree = DecisionTreeClassifier(random_state=42, class_weight='balanced')
testModel(estimator=tree, X=X, y=y)

Fold
0.0778106345289
Fold
0.0783755986599
Fold
0.0741709654152


In [41]:
# Balanced tree with the random splitter; the stacked fold frame returned by
# testModel is kept in `answer`.
tree = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    splitter='random',
)
answer = testModel(estimator=tree, X=X, y=y)

Fold
0.0746847820522
Fold
0.0769538476957
Fold
0.0723048071368


In [34]:
# Balanced tree with the random splitter; result kept in `answer`.
# NOTE(review): this cell repeats an earlier one verbatim, yet different
# scores were recorded - presumably X was rebuilt in between; confirm.
tree = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    splitter='random',
)
answer = testModel(estimator=tree, X=X, y=y)

Fold
0.0404933095165
Fold
0.0421682060264
Fold
0.0394856931628


In [40]:
# Rebuild the feature matrix and the 0/1 target from the current trainDf.
# Disabled experiment, kept for reference:
# trainDf.drop('absCount',axis = 1, inplace=True)
X = preprocessData(trainDf.drop('wasOrdered', axis=1))
y = trainDf['wasOrdered'].apply(lambda flag: int(bool(flag)))

In [38]:
# Balanced tree with the random splitter, evaluated on the current X / y;
# the stacked fold frame is kept in `answer`.
tree = DecisionTreeClassifier(
    random_state=42,
    class_weight='balanced',
    splitter='random',
)
answer = testModel(estimator=tree, X=X, y=y)

Fold
0.0410968878996
Fold
0.0428959887673
Fold
0.0417151184755


In [None]:
def save2File(fName,df):
    """Write predictions to `fName` as a two-column ``order_id,products`` CSV.

    Parameters
    ----------
    fName : str
        Path of the file to create (an existing file is overwritten).
    df : pandas.Series, pandas.DataFrame or any object with ``.items()``
        The keys yielded by ``df.items()`` are ignored. Each value is indexed
        as ``value[0]`` (written via ``str``) and ``value[1]`` (an array that
        is rendered with ``np.array2string`` and stripped of its brackets).

    Returns
    -------
    None
    """
    # `with` closes the file even if formatting one of the rows raises.
    with open(fName,'w') as f:
        f.write('order_id,products\n')
        # .items() is available on old and new pandas; .iteritems() was
        # removed in pandas 2.0.
        for _, val in df.items():
            # The width is passed per call, so NumPy's global print options
            # are neither changed nor reset to a hard-coded value afterwards.
            productStr = np.array2string(val[1], max_line_width = 500)[1:-1]
            f.write(str(val[0])+','+productStr+'\n')

In [None]:
# Load the order log (indexed by order_id), restricted to the columns used below.
orders = pd.read_csv(
    './data/orders.csv',
    index_col = 'order_id',
    usecols = ['order_id','user_id','eval_set','order_number','days_since_prior_order'])
# Assign the filled column back: an in-place fillna on the result of
# .loc[:, col] may act on a temporary copy and leave `orders` unchanged.
orders['days_since_prior_order'] = orders['days_since_prior_order'].fillna(0)
# date = running total of days_since_prior_order within each user_id, in row order.
orders = orders.assign(date = orders.groupby('user_id')['days_since_prior_order'].cumsum())

In [None]:
# Features and 0/1 targets built from newDf.
# NOTE(review): `newDf` is not defined in any cell visible here - confirm its origin.
features_test = preprocessData(newDf.drop('wasOrdered', axis=1))
ans_test = newDf['wasOrdered'].apply(lambda flag: int(bool(flag)))