In [41]:
import numpy as np 
import pandas as pd 
import os
import random
import seaborn as sns
sns.set_style('darkgrid')
import matplotlib.pyplot as plt
from timeit import default_timer as timer

# Print the full path of every file available under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

/kaggle/input/santander-product-recommendation/test_ver2.csv.zip
/kaggle/input/santander-product-recommendation/sample_submission.csv.zip
/kaggle/input/santander-product-recommendation/train_ver2.csv.zip


In [42]:
# Column -> dtype mapping handed to pd.read_csv to keep the training frame small.
# Product flags are read as uint8.
dtype_list = dict.fromkeys(
    [
        'ind_cco_fin_ult1',
        'ind_deme_fin_ult1',
        'ind_aval_fin_ult1',
        'ind_valo_fin_ult1',
        'ind_reca_fin_ult1',
        'ind_ctju_fin_ult1',
        'ind_cder_fin_ult1',
        'ind_plan_fin_ult1',
        'ind_fond_fin_ult1',
        'ind_hip_fin_ult1',
        'ind_pres_fin_ult1',
        'ind_cno_fin_ult1',
        'ind_ctpp_fin_ult1',
        'ind_ahor_fin_ult1',
        'ind_dela_fin_ult1',
        'ind_ecue_fin_ult1',
        'ind_recibo_ult1',
        'ind_deco_fin_ult1',
        'ind_tjcr_fin_ult1',
        'ind_ctop_fin_ult1',
        'ind_viv_fin_ult1',
        'ind_ctma_fin_ult1',
    ],
    'uint8',
)
# These two flags use the nullable Int64 dtype instead
# (presumably because they contain missing values -- TODO confirm).
dtype_list['ind_nomina_ult1'] = 'Int64'
dtype_list['ind_nom_pens_ult1'] = 'Int64'
# Customer id.
dtype_list['ncodpers'] = 'uint32'

In [43]:
# alternative way to read big csv file
# df_train = pd.read_csv('../input/santander-product-recommendation/train_ver2.csv.zip', nrows=2e6,
#                      dtype=dtype_list, 
#                        usecols=name_col
#                     )

In [44]:
# Columns loaded from the training CSV: customer id, snapshot date, then the 24 product flags.
name_col = ['ncodpers', 'fecha_dato'] + [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [45]:
# Read the zipped training CSV in chunks of one million rows, keeping only the
# columns in name_col with the compact dtypes from dtype_list, then stitch the
# chunks back together into a single frame.
reader = pd.read_csv(
    '../input/santander-product-recommendation/train_ver2.csv.zip',
    usecols=name_col,
    dtype=dtype_list,
    chunksize=1e6,
)
df_train = pd.concat(list(reader))

df_train.shape

(13647309, 26)

In [46]:
# df_train.to_csv(df_trian_small, compression='zip')

In [47]:
# Snapshot dated 2015-05-28: keep those rows, drop the date column and
# treat missing product flags as "not held" (0).
df_train1505 = (
    df_train[df_train.fecha_dato == '2015-05-28']
    .drop(columns=['fecha_dato'])
    .fillna(0)
)
df_train1505.shape

(631957, 25)

In [48]:
#TOP POPULAR

In [49]:
# The 24 product-flag columns, in the order used by the recommenders below.
product_col2 = [
    'ind_ahor_fin_ult1',
    'ind_aval_fin_ult1',
    'ind_cco_fin_ult1',
    'ind_cder_fin_ult1',
    'ind_cno_fin_ult1',
    'ind_ctju_fin_ult1',
    'ind_ctma_fin_ult1',
    'ind_ctop_fin_ult1',
    'ind_ctpp_fin_ult1',
    'ind_deco_fin_ult1',
    'ind_deme_fin_ult1',
    'ind_dela_fin_ult1',
    'ind_ecue_fin_ult1',
    'ind_fond_fin_ult1',
    'ind_hip_fin_ult1',
    'ind_plan_fin_ult1',
    'ind_pres_fin_ult1',
    'ind_reca_fin_ult1',
    'ind_tjcr_fin_ult1',
    'ind_valo_fin_ult1',
    'ind_viv_fin_ult1',
    'ind_nomina_ult1',
    'ind_nom_pens_ult1',
    'ind_recibo_ult1',
]

In [50]:
def popularity_based(df, product_col = product_col2):
    """Return, for every product, the share of rows in ``df`` that hold it.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per customer, with a 0/1 flag column for each name in
        ``product_col``.
    product_col : list of str, optional
        Product columns to score. Defaults to ``product_col2``.

    Returns
    -------
    dict
        ``{column: fraction}`` in ``product_col`` order, rounded to 4
        decimals. A product nobody holds scores 0.0, and every product
        scores 0.0 when ``df`` has no rows.
    """
    n_rows = df.shape[0]

    top_col = {}
    for col in product_col:
        # Count the 1-flags directly: ``value_counts()[1]`` raised KeyError
        # whenever no row in ``df`` held the product.
        holders = df[col].eq(1).sum()
        # Guard the division so an empty frame yields 0.0 instead of NaN.
        top_col[col] = np.around(holders / n_rows, decimals=4) if n_rows else 0.0

    # The dict keeps ``product_col`` order; sort its items by value (descending)
    # to get a most-popular-first ranking.
    return top_col

In [51]:
# Share of customers in the 2015-05-28 snapshot holding each product.
popularity_based(df_train1505)

{'ind_ahor_fin_ult1': 0.0001,
 'ind_aval_fin_ult1': 0.0,
 'ind_cco_fin_ult1': 0.775,
 'ind_cder_fin_ult1': 0.0005,
 'ind_cno_fin_ult1': 0.1003,
 'ind_ctju_fin_ult1': 0.0121,
 'ind_ctma_fin_ult1': 0.0112,
 'ind_ctop_fin_ult1': 0.1661,
 'ind_ctpp_fin_ult1': 0.0562,
 'ind_deco_fin_ult1': 0.0033,
 'ind_deme_fin_ult1': 0.0023,
 'ind_dela_fin_ult1': 0.0556,
 'ind_ecue_fin_ult1': 0.1006,
 'ind_fond_fin_ult1': 0.0239,
 'ind_hip_fin_ult1': 0.0076,
 'ind_plan_fin_ult1': 0.0117,
 'ind_pres_fin_ult1': 0.0037,
 'ind_reca_fin_ult1': 0.0632,
 'ind_tjcr_fin_ult1': 0.0568,
 'ind_valo_fin_ult1': 0.0316,
 'ind_viv_fin_ult1': 0.005,
 'ind_nomina_ult1': 0.0655,
 'ind_nom_pens_ult1': 0.0672,
 'ind_recibo_ult1': 0.1527}

In [52]:
# USER ITEM

In [53]:
# User-item matrix for the neighbourhood recommender: customers (ncodpers) as the
# index, product flags as columns. Only the first 10,000 rows are kept (RAM limit).
df_ui = df_train1505.copy().set_index('ncodpers')[:10000]
# df_ui = df_ui.sample(10000) # alternative way
# df_ui = pd.concat([df_ui, df_train1505[df_train1505.ncodpers == 1061608].set_index('ncodpers')]) #add new user
df_ui.shape

(10000, 24)

In [54]:
from sklearn.metrics.pairwise import pairwise_distances 

# Customer-to-customer cosine similarity (1 - cosine distance) over the rows of df_ui.
# Per the original author's note, the result no longer carries df_ui's index labels,
# which is why useritem() addresses it by row position rather than by customer id.
cosine_sim = 1 - pairwise_distances(df_ui, metric="cosine")

In [55]:
def useritem(user_id, df, sim_matrix=None):
    """Score products for one customer from the holdings of similar customers.

    Neighbours are the rows whose similarity to ``user_id`` lies strictly
    between a lower threshold and 0.99 (the upper bound excludes identical
    customers, including the customer itself). The lower threshold starts at
    0.79 and is relaxed in steps of 0.025 until at least 20 distinct
    neighbours are found or the threshold would drop below 0.65.

    Parameters
    ----------
    user_id : hashable
        Label of the customer in ``df.index``.
    df : pandas.DataFrame
        User-item matrix (customers x product flags). Its row order must match
        the row/column order of ``sim_matrix``.
    sim_matrix : array-like, optional
        Square similarity matrix addressed by row position. When omitted, the
        module-level ``cosine_sim`` is looked up at call time.

    Returns
    -------
    dict
        ``{product column: mean flag over the neighbours}``. Every product
        scores 0.0 when no neighbour qualifies.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``df.index``.
    IndexError
        If ``sim_matrix`` has fewer columns than ``df`` has rows.
    """
    if sim_matrix is None:
        # The original body ignored this argument and always read the global;
        # resolving it here keeps that default while honouring an explicit matrix.
        sim_matrix = cosine_sim

    # Row position of the customer: the matrix is addressed by position, not by id.
    cos_id = list(df.index).index(user_id)
    similarities = np.asarray(sim_matrix)[cos_id]
    if similarities.shape[0] < len(df):
        raise IndexError("sim_matrix has fewer columns than df has rows")
    similarities = similarities[:len(df)]

    min_neighbours = 20
    sim_min = 0.79    # starting lower bound on similarity
    sim_floor = 0.65  # never relax the lower bound below this
    sim_step = 0.025
    sim_max = 0.99    # exclude (near-)identical customers

    while True:
        neighbour_pos = np.flatnonzero((similarities > sim_min) & (similarities < sim_max))
        # Count DISTINCT neighbours. The old running counter re-counted the same
        # customers on every pass and could stop with far fewer than 20 of them.
        if len(neighbour_pos) >= min_neighbours:
            break
        sim_min -= sim_step
        if sim_min < sim_floor:
            break

    if len(neighbour_pos) == 0:
        # No usable neighbours: contribute a zero weight instead of NaN.
        return {col: 0.0 for col in df.columns}

    # Mean of each 0/1 flag = share of the neighbours that hold the product.
    means = df.iloc[neighbour_pos].mean(axis=0)
    return {col: float(val) for col, val in means.items()}

In [56]:
# Neighbour-based scores for customer 1061608; the id must be one of the rows kept in df_ui.
useritem(1061608, df_ui)

{'ind_ahor_fin_ult1': 0.0,
 'ind_aval_fin_ult1': 0.0,
 'ind_cco_fin_ult1': 1.0,
 'ind_cder_fin_ult1': 0.0,
 'ind_cno_fin_ult1': 0.0,
 'ind_ctju_fin_ult1': 0.0,
 'ind_ctma_fin_ult1': 0.00430416068866571,
 'ind_ctop_fin_ult1': 0.0,
 'ind_ctpp_fin_ult1': 0.0,
 'ind_deco_fin_ult1': 0.0,
 'ind_deme_fin_ult1': 0.0,
 'ind_dela_fin_ult1': 0.03156384505021521,
 'ind_ecue_fin_ult1': 0.31563845050215206,
 'ind_fond_fin_ult1': 0.00430416068866571,
 'ind_hip_fin_ult1': 0.0,
 'ind_plan_fin_ult1': 0.0,
 'ind_pres_fin_ult1': 0.0014347202295552368,
 'ind_reca_fin_ult1': 0.03443328550932568,
 'ind_tjcr_fin_ult1': 0.011477761836441894,
 'ind_valo_fin_ult1': 0.00860832137733142,
 'ind_viv_fin_ult1': 0.0,
 'ind_nomina_ult1': 0.0,
 'ind_nom_pens_ult1': 0.0,
 'ind_recibo_ult1': 0.5882352941176471}

In [57]:
# MODEL BASED

In [58]:
# Full user-item matrix (all customers of the 2015-05-28 snapshot) for the model-based recommender.
df_uiM = df_train1505.copy().set_index('ncodpers')

In [59]:
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

In [60]:
import warnings
# NOTE(review): this suppresses every warning raised from this point on,
# so deprecation or numerical warnings will not be shown.
warnings.filterwarnings('ignore')

In [61]:
# Snapshot dated 2016-03-28, without the date column (missing flags are left as-is).
df_train1603 = (
    df_train[df_train.fecha_dato == '2016-03-28']
    .drop(columns=['fecha_dato'])
)

In [62]:
def modelbased(user_id, df, model=None):
    """Predict, per product, the probability that one customer holds it.

    For every column ``c`` of ``df`` a classifier is fitted on all rows with
    ``c`` as the target and the remaining columns as features; the customer's
    own row is then scored. The probability is therefore in-sample: the
    customer's row is part of the training data.

    Parameters
    ----------
    user_id : hashable
        Label of the customer in ``df.index``.
    df : pandas.DataFrame
        User-item matrix (customers x product flags).
    model : estimator with ``fit`` / ``predict_proba``, optional
        Refitted in place for every product. When omitted, a fresh
        ``DecisionTreeClassifier(max_depth=9)`` is created for the call, so no
        fitted estimator is shared between calls.

    Returns
    -------
    dict
        ``{product column: probability of class 1}``.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``df.index``.
    """
    if model is None:
        model = DecisionTreeClassifier(max_depth=9)

    user_rows = df[df.index == user_id]
    if user_rows.empty:
        raise ValueError(f"user_id {user_id!r} is not present in df.index")
    user_row = user_rows.iloc[[0]]  # first match, kept as a one-row frame

    mdbs = {}

    for c in df.columns:
        y_train = df[c].astype('int')    # target: the current product flag
        x_train = df.drop([c], axis=1)   # features: every other product flag
        clf = model
        clf.fit(x_train, y_train)
        proba = clf.predict_proba(user_row.drop([c], axis=1))

        classes = getattr(clf, 'classes_', None)
        if classes is None:
            # Estimator does not expose classes_: assume column 1 is class 1.
            mdbs[c] = float(proba[0, 1])
            continue

        # predict_proba has one column per class seen during fit. A product that
        # nobody (or everybody) holds yields a single column, so ``[:, 1]`` would
        # raise IndexError; look the positive class up instead.
        positive = np.flatnonzero(np.asarray(classes) == 1)
        mdbs[c] = float(proba[0, positive[0]]) if positive.size else 0.0

    return mdbs

In [63]:
# Time one model-based scoring run for customer 1061608 with a depth-9 decision tree.
start = timer()
mb_1061608_dt4 = modelbased(1061608, df_uiM, DecisionTreeClassifier(max_depth=9))
end = timer()

elapsed = end - start
print(elapsed,' s = ', elapsed/60,' min')
mb_1061608_dt4

21.788474341999972  s =  0.3631412390333329  min


{'ind_ahor_fin_ult1': 2.0143710989545413e-05,
 'ind_aval_fin_ult1': 0.0,
 'ind_cco_fin_ult1': 0.9354946617944848,
 'ind_cder_fin_ult1': 0.00019134466093726082,
 'ind_cno_fin_ult1': 0.00026302086100808724,
 'ind_ctju_fin_ult1': 3.0062710814759588e-06,
 'ind_ctma_fin_ult1': 0.006624955043928504,
 'ind_ctop_fin_ult1': 0.11178726596000622,
 'ind_ctpp_fin_ult1': 0.02026298551515136,
 'ind_deco_fin_ult1': 0.0026920204617169306,
 'ind_deme_fin_ult1': 0.0006220816686659199,
 'ind_dela_fin_ult1': 0.01911984941538546,
 'ind_ecue_fin_ult1': 0.03153603373338318,
 'ind_fond_fin_ult1': 0.004927893732809265,
 'ind_hip_fin_ult1': 0.00023275532427804286,
 'ind_plan_fin_ult1': 0.001406974657086115,
 'ind_pres_fin_ult1': 0.0016418788567050227,
 'ind_reca_fin_ult1': 0.009584422251260034,
 'ind_tjcr_fin_ult1': 0.005536443775532144,
 'ind_valo_fin_ult1': 0.008798761904495373,
 'ind_viv_fin_ult1': 0.0013513549041826276,
 'ind_nomina_ult1': 0.0,
 'ind_nom_pens_ult1': 1.1663412574558364e-05,
 'ind_recibo_ult1'

In [64]:
# param_grid_tree = {
#      'max_depth' : [5,7,8,9]}

# grid_tree = GridSearchCV(DecisionTreeClassifier(), param_grid_tree, cv=4, scoring='average_precision')

# for c in df_uiM.columns:
#     y_train = df_uiM[c].astype('int') 
#     x_train = df_uiM.drop([c], axis = 1)
#     grid_tree.fit(x_train, y_train)
        
# tree_clf_new = grid_tree.best_estimator_
# tree_clf_score = grid_tree.cv_results_
# tree_clf_score

In [65]:
# def modelbased(user_id, df, model):
#     start2 = timer()
#     start = timer()
    
#     id_preds = defaultdict(list) 
#     ids = list(df.index.values) # numery klienta
    
#     for c in df.columns:
#         y_train = df[c].astype('int') # series
#         x_train = df.drop([c], axis = 1) #dataframe
#         clf = model
#         clf.fit(x_train, y_train)
#         p_train = clf.predict_proba(x_train)[:,1]
        
    
#         for id, p in zip(ids, p_train): # zwraca słownik klucz - id usera : wartosc - lista prawdopodobienstwa dla kazedgo itemu
#             id_preds[id].append(p) #dla kazdego usera, można by tylko dla mojego
            
#     end = timer()
#     print('czas po utwrozeniu słownika id_user: lista prawdopod ',end - start,' s = ', (end - start)/60,' min')

#     start = timer()
    
#     mdit = {}
#     for k, v in id_preds.items():
#         idpr = {}
#         for i in range(len(df.columns)): # tworze słownik klucz - id usera : wartosc - słownik nazwa col : prawdopodobienstwo
#             idpr[df.columns[i]] = v[i] #dla kazdego usera, można by tylko dla mojego

#         mdit[k] = idpr
#     end = timer()
#     print('czas po utwrozeniu słownika id_user: słownik prawdop ',end - start,' s = ', (end - start)/60,' min')
#     end2 = timer()      
#     print('czas calkowity',end2 - start2,' s = ', (end2 - start2)/60,' min')
#     return mdit[user_id]

In [66]:
# HYBRID

In [67]:
def hybrid(user_id, df_p, df_u, df_m, f1, f2, f3):
    """Blend the three recommenders into one weighted score per product.

    Parameters
    ----------
    user_id : customer to score.
    df_p : frame passed to ``popularity_based``.
    df_u : frame passed to ``useritem`` (presumably the frame the similarity
        matrix was built from -- verify against the caller).
    df_m : frame passed to ``modelbased``.
    f1, f2, f3 : weights of the popularity, user-item and model-based scores.

    Returns
    -------
    dict
        ``{product: popularity * f1 + user-item * f2 + model * f3}``, keyed and
        ordered like the popularity result.
    """
    popularity = popularity_based(df_p)
    neighbours = useritem(user_id, df_u)
    model_scores = modelbased(user_id, df_m)

    return {
        product: (pop_score * f1) + (neighbours[product] * f2) + (model_scores[product] * f3)
        for product, pop_score in popularity.items()
    }

In [68]:
# Blended scores for customer 1061608: 0.5 * popularity + 0.25 * user-item + 0.25 * model-based.
hybrid_rec_1061608 = hybrid(1061608, df_u = df_ui, df_m = df_uiM, df_p = df_train1505, f1 = 0.5, f2 = 0.25, f3 = 0.25)

In [69]:
# Show the blended per-product scores.
hybrid_rec_1061608

{'ind_ahor_fin_ult1': 5.503592774738636e-05,
 'ind_aval_fin_ult1': 0.0,
 'ind_cco_fin_ult1': 0.8713736654486212,
 'ind_cder_fin_ult1': 0.0002978361652343152,
 'ind_cno_fin_ult1': 0.050215755215252025,
 'ind_ctju_fin_ult1': 0.006050751567770369,
 'ind_ctma_fin_ult1': 0.008332278933148553,
 'ind_ctop_fin_ult1': 0.11099681649000155,
 'ind_ctpp_fin_ult1': 0.03316574637878784,
 'ind_deco_fin_ult1': 0.0023230051154292325,
 'ind_deme_fin_ult1': 0.0013055204171664799,
 'ind_dela_fin_ult1': 0.040470923616400166,
 'ind_ecue_fin_ult1': 0.1370936210588838,
 'ind_fond_fin_ult1': 0.014258013605368745,
 'ind_hip_fin_ult1': 0.0038581888310695108,
 'ind_plan_fin_ult1': 0.006201743664271529,
 'ind_pres_fin_ult1': 0.0026191497715650647,
 'ind_reca_fin_ult1': 0.04260442694014643,
 'ind_tjcr_fin_ult1': 0.03265355140299351,
 'ind_valo_fin_ult1': 0.0201517708204567,
 'ind_viv_fin_ult1': 0.002837838726045657,
 'ind_nomina_ult1': 0.03275,
 'ind_nom_pens_ult1': 0.033602915853143636,
 'ind_recibo_ult1': 0.239459

In [70]:
# RECOMMENDATION

In [71]:
# def recommendation(user_id, df_u):#, hybrid_nothave):
#     have = []
# #     df_u_ix = list(df_u.index)
#     ix = 0
#     for i in df_u.iloc[df_u_ix.index(user_id)]:
#         if i == 1:
#             have.append(df_u.columns[ix])
#         ix+=1
#     print(have)
# #     for i in hybrid_nothave.copy():
# #         if i in have:
# #             hybrid_nothave.pop(i)


# #     hybrid_rec = dict(sorted(hybrid_nothave.items(), key=lambda item: item[1], reverse=True))
    
# #     return list(hybrid_rec.keys())

In [72]:
def recommendation(user_id, df_u, hybrid_outcome):
    """Rank the products a customer does not hold yet, best score first.

    Parameters
    ----------
    user_id : hashable
        Label of the customer in ``df_u.index``.
    df_u : pandas.DataFrame
        User-item matrix; a flag equal to 1 means the customer holds the product.
    hybrid_outcome : dict
        ``{product: score}``. It is NOT modified (the previous version popped
        the held products out of the caller's dict).

    Returns
    -------
    list of str
        Products from ``hybrid_outcome`` the customer does not hold, sorted by
        descending score; ties keep their order in ``hybrid_outcome``.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``df_u.index``.
    """
    # Select the customer's row once instead of re-filtering the frame per column.
    user_rows = df_u[df_u.index == user_id]
    if user_rows.empty:
        raise ValueError(f"user_id {user_id!r} is not present in df_u.index")
    user_row = user_rows.iloc[0]

    have = {col for col, flag in user_row.items() if flag == 1}

    # Work on a filtered copy so the caller's scores stay intact.
    candidates = {item: score for item, score in hybrid_outcome.items() if item not in have}

    return sorted(candidates, key=candidates.get, reverse=True)

In [73]:
# Ranked products for customer 1061608, excluding those flagged as held in df_uiM.
recommendation(1061608, df_uiM, hybrid_rec_1061608)

['ind_recibo_ult1',
 'ind_ecue_fin_ult1',
 'ind_ctop_fin_ult1',
 'ind_cno_fin_ult1',
 'ind_reca_fin_ult1',
 'ind_dela_fin_ult1',
 'ind_nom_pens_ult1',
 'ind_ctpp_fin_ult1',
 'ind_nomina_ult1',
 'ind_tjcr_fin_ult1',
 'ind_valo_fin_ult1',
 'ind_fond_fin_ult1',
 'ind_ctma_fin_ult1',
 'ind_plan_fin_ult1',
 'ind_ctju_fin_ult1',
 'ind_hip_fin_ult1',
 'ind_viv_fin_ult1',
 'ind_pres_fin_ult1',
 'ind_deco_fin_ult1',
 'ind_deme_fin_ult1',
 'ind_cder_fin_ult1',
 'ind_ahor_fin_ult1',
 'ind_aval_fin_ult1']

In [74]:
# EVALUATION

In [75]:
# Snapshot dated 2016-05-28, without the date column; used as the "after" state in the evaluation.
df_train1605 = (
    df_train[df_train.fecha_dato == '2016-05-28']
    .drop(columns=['fecha_dato'])
)

In [76]:
def real_purchase(user_id, df1, df2):
    """List the products a customer holds in ``df2`` but did not hold in ``df1``.

    Parameters
    ----------
    user_id : value matched against the ``ncodpers`` column.
    df1 : pandas.DataFrame
        Earlier snapshot, with ``ncodpers`` and every column in ``product_col2``.
    df2 : pandas.DataFrame
        Later snapshot with the same columns.

    Returns
    -------
    dict
        ``{user_id: [product, ...]}`` in ``product_col2`` order. The list is
        empty when the customer is missing from either snapshot.
    """
    # Select the product columns by NAME. The previous positional ``row[ix]``
    # lookup silently depended on the frame's column order matching
    # ``product_col2`` and is deprecated for label-indexed Series.
    # Missing flags count as "not held" so the comparisons never see NA.
    before = df1.loc[df1.ncodpers == user_id, product_col2].fillna(0)
    after = df2.loc[df2.ncodpers == user_id, product_col2].fillna(0)

    add_col = []
    for _, row1 in before.iterrows():
        for _, row2 in after.iterrows():
            for col in product_col2:
                if row1[col] == 0 and row2[col] == 1:
                    add_col.append(col)

    return {user_id: add_col}

In [77]:
# Products customer 1061608 holds in the 2016-05-28 snapshot but not in the 2015-05-28 one.
real_purchase(1061608, df_train1505, df_train1605)[1061608]

['ind_cno_fin_ult1',
 'ind_ecue_fin_ult1',
 'ind_nomina_ult1',
 'ind_nom_pens_ult1',
 'ind_recibo_ult1']

In [78]:
def apk(actual, predicted, k=7):
    """
    Average precision at k for a single ranked list.

    Parameters
    ----------
    actual : list
             The relevant items (their order is irrelevant)
    predicted : list
                The ranked predictions (best first); only the first k are scored
    k : int, optional
        Cut-off for the number of predictions taken into account

    Returns
    -------
    score : double
            Sum of precision@rank over the ranks where a new relevant item
            appears, divided by min(len(actual), k); 0.0 when `actual` is empty
    """
    top_k = predicted[:k] if len(predicted) > k else predicted

    hits = 0.0
    precision_sum = 0.0

    for rank, item in enumerate(top_k, start=1):
        # Skip irrelevant items and repeats of an item already seen earlier in the list.
        if item not in actual or item in top_k[:rank - 1]:
            continue
        hits += 1.0
        precision_sum += hits / rank

    if not actual:
        return 0.0

    return precision_sum / min(len(actual), k)

def mapk(actual, predicted, k=7):
    """
    Mean average precision at k over several ranked lists.

    Parameters
    ----------
    actual : list
             One list of relevant items per case (order within a list is irrelevant)
    predicted : list
                One ranked prediction list per case, paired with `actual` by position
    k : int, optional
        Cut-off passed on to `apk`

    Returns
    -------
    score : double
            Mean of `apk` over the paired lists
    """
    per_case = []
    for truth, ranked in zip(actual, predicted):
        per_case.append(apk(truth, ranked, k))
    return np.mean(per_case)

In [79]:
def evaluation(user_id, f1, f2, f3):
    """Average precision (apk with its default k) of the hybrid ranking for one customer.

    The ground truth is the list returned by ``real_purchase`` for
    ``df_train1505`` -> ``df_train1605``; the prediction is the
    ``recommendation`` ranking built from ``hybrid`` with weights f1/f2/f3.
    Reads the module-level frames df_train1505, df_train1605, df_ui and df_uiM.
    """
    purchased = real_purchase(user_id, df_train1505, df_train1605)[user_id]
    scores = hybrid(user_id, df_p=df_train1505, df_u=df_ui, df_m=df_uiM,
                    f1=f1, f2=f2, f3=f3)
    ranked = recommendation(user_id, df_uiM, hybrid_outcome=scores)
    return apk(purchased, ranked)

In [80]:
# Average precision of the hybrid ranking for customer 1061608 with weights 0.8 / 0.5 / 0.2.
evaluation(1061608, f1 = 0.8, f2=0.5, f3=0.2)

0.6642857142857143

In [81]:
# start = timer()


# chart_val = []

# w = np.linspace(0.1,1,3)
# for i1 in w:
#     for i2 in w:
#         for i3 in w:
#             score = evaluation(1061608, f1 = i1, f2 = i2, f3 = i3)

#             chart_val.append([i1, i2, i3, score])

# chart_val_np = np.array(chart_val)

# end = timer()
# print(end - start,' s = ', (end - start)/60,' min')
# # 0.8, 0.5, 0.3
# # 0.9, 0.3, 0.3
# chart_val_np