# Import Libs

In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import json, requests
import re,sys
#from dataiku import Folder
import pickle
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from scipy.sparse import hstack
from sklearn.metrics.pairwise import pairwise_distances
from scipy.spatial.distance import cosine
from classCustom import ItemSelector, Scalers
from time import time
from nltk.corpus import stopwords
from nltk.stem.snowball import FrenchStemmer
import unicodedata
from Normalizer import *
from EnrichProduct import *
import mysql.connector
import MySQLdb
from sqlalchemy import create_engine
from pandas.io.json import json_normalize
import warnings
warnings.filterwarnings("ignore")
import collections

# Load Datas

In [2]:
models_path='Models/multiNB.p'
model = pickle.load(open(models_path, "rb" ))

# Environment Datas

In [20]:
##############################################################################################
################################## Environnement variables ###################################
##############################################################################################

list_stop_word_french=['alors','au','aucuns','aussi','autre','avant','avec','avoir','bon','car','ce','cela','ces','ceux','chaque',
                       'ci','comme','comment','dans','des','du','dedans','dehors','depuis','devrait','doit','donc','dos','début',
                       'elle','elles','en','encore','essai','est','et','eu','fait','faites','fois','font','hors','ici','il','ils',
                       'je','juste','la','le','les','leur','là','ma','maintenant','mais','mes','mine','moins','mon','mot','même',
                       'ni','nommés','notre','nous','ou','où','par','parce','pas','peut','peu','plupart','pour','pourquoi','quand',
                       'que','quel','quelle','quelles','quels','qui','sa','sans','ses','seulement','si','sien','son','sont','sous',
                       'soyez','sujet','sur','ta','tandis','tellement','tels','tes','ton','tous','tout','trop','très','tu','voient',
                       'vont','votre','vous','vu','ça','étaient','état','étions','été','être']
# Suppress number
reg_numb = re.compile('[^\D]')
# Suppress punctuation
reg_ponct = re.compile('[^a-z 0-9ÀÁÂÃÄÅàáâãäåÒÓÔÕÖØòóôõöøÈÉÊËèéêëÇçÌÍÎÏìíîïÙÚÛÜùúûüÿÑñ²°Ø×ßŠ”�œ…]')
# Suppress stop words
french_stopwords_ini=stopwords.words('french')
french_stopwords_ini.extend(list_stop_word_french)
french_stopwords = set(french_stopwords_ini)
# Stemming of words
stemmer = FrenchStemmer()

list_cat = ['1001','1002','1003','1004','1005','1006','1007','1008','1010','1011',
            '2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012']

# Get ponderation

In [4]:
##############################################################################################
######################## Recuperation des pondérations des methodes ##########################
##############################################################################################

### Dictionnaire de ponderation des modeles
def getDictPond():
    conn= mysql.connector.connect(host='localhost',database='dataiku',user='dkuadmin',password='Dataiku!')
    req = "SELECT * from pondlistecourse order by date desc limit 1"
    pond_df = pd.read_sql(req, conn)
    conn.close()
    pond_df_temp=pond_df.to_dict(orient='list')
    return {k:v[0] for (k,v) in pond_df_temp.iteritems()}

# Get product infos

In [5]:
##############################################################################################
########################### Recuperation des donnees des produits ############################
##############################################################################################

def mapFeatureDF(features_df):
    """ map features to user_info """
    return features_df['features']['listeCourse'],features_df['features']['listeMagasins'],features_df['features']['listeMarchands']

def getCouponsData(rawJsonLC):
    """ Create a dataframe with all the produts informations"""

    # Set empty dataframe with all possible columns
    df_empty_allcol=pd.DataFrame(columns=['bons-plans','description','distance','id','image','lat-lng','libelle','magasin','marque','metadata.prix unitaire','metadata.source','metadata.unité','position','prix','requete'])

    for i in rawJsonLC[0].keys():
        if len(rawJsonLC[0][i])>0:
            df=json_normalize(rawJsonLC[0][i])
            df['requete']=i
            try:
                df['reduc']=df['bons-plans'].map(lambda x:x[0]['libelle'] if len(x)>0 else "")
            except:
                df['reduc']=''
            # Merge to be sure having all the columns
            df_empty_allcol=df_empty_allcol.append(df, ignore_index=True)

    print (df_empty_allcol.index)
    if len(df_empty_allcol.index) >0:
        # Keep only interesting columns
        result=df_empty_allcol[['requete','libelle','description','prix','id','reduc','magasin','image','distance','position','metadata.prix unitaire','metadata.unité','metadata.source']]
        product_info=pd.DataFrame(result.values,columns=['requete','nomProduit','descriptionProduit','prix','id',
                           'reduc',
                           'magId',
                           'urlImage',
                           'distance',
                           'position','prixUnit','unit','source'])
        product_info['prixMin']=0
        product_info['prixMax']=1000
        # Fill na
        product_info.fillna('',inplace=True)
        # Encoding
        product_info.nomProduit=product_info.nomProduit.str.encode('utf-8')
        product_info.descriptionProduit=product_info.descriptionProduit.str.encode('utf-8')
        product_info.requete=product_info.requete.str.encode('utf-8')
        product_info.reduc=product_info.reduc.str.encode('utf-8')
        product_info.unit=product_info.unit.str.encode('utf-8')
        product_info.source=product_info.source.str.encode('utf-8')
    else :
        product_info=pd.DataFrame(columns=['requete','nomProduit','descriptionProduit','prix','id',
                           'reduc',
                           'magId',
                           'urlImage',
                           'distance',
                           'position','prixUnit','unit','source'])

    return product_info

def mapMagasins(rawJsonLC):
    """ Mapping des donnees sur les magasins """
    magasins=pd.DataFrame(columns=['magId','magZipcode','magDepcom',
                                    'magLatLong','magRue',
                                    'magCat','magHor',
                                    'magMarch','magNom','magRmw'])
    count=0
    for i in range(len(rawJsonLC[1])):
        sub_bp=rawJsonLC[1][i]
        codP=''
        comM=''
        latLo=''
        ruE=''
        catE=''
        horA=''
        marC=''
        noM=''
        rmW=''
        try:
            codP=sub_bp['adresse']['code-postal']
        except:
            codP=''
        try:
            comM=sub_bp['adresse']['commune']
        except:
            comM=''
        try:
            latLo=sub_bp['adresse']['lat-lng']
        except:
            latLo=''
        try:
            ruE=sub_bp['adresse']['rue']
        except:
            ruE=''
        try:
            catE=sub_bp['categories']
        except:
            catE=''
        try:
            horA=sub_bp['horaires']
        except:
            horA=''
        try:
            marC=sub_bp['marchand']
        except:
            marC=''
        try:
            noM=sub_bp['nom'].encode('utf-8')
        except:
            noM=''
        try:
            rmW=sub_bp['rmw']
        except:
            rmW=''

        magasins.loc[i]=[sub_bp['id'],codP,comM,
                          latLo,ruE,
                          catE,horA,
                          marC,noM,rmW]

        count+=1
    return magasins

def mapMarchands(rawJsonLC):
    """ Mapping des donnees sur les marchands """
    marchands=pd.DataFrame(columns=['marchId','magMarch'])
    count=0
    for i in range(len(rawJsonLC)):
        sub_bp=rawJsonLC[1][i]
        marchands.loc[i]=[sub_bp['id'],sub_bp['nom'].encode('utf-8')]

        count+=1
    return marchands

def joinBPMag(rawJsonLC, product_info_raw):
    return pd.merge(product_info_raw, mapMagasins(rawJsonLC), on='magId',how='left')

# Empty product matrix

In [6]:
##############################################################################################
###################### Creation de la matrice qualification des produits #####################
############################################################################################## 

def matrixProductColumnsAndMajProducts(product_info,list_user,list_cat):
    list_index=[]
    for i in list_user:
        for ii in list_cat.keys():
            if i=='prixSens':
                list_index.append(str(i)+'_'+str(ii)+'_highPrice')
                list_index.append(str(i)+'_'+str(ii)+'_lowPrice')
                product_info[str(i)+'_'+str(ii)+'_highPrice']=0
                product_info[str(i)+'_'+str(ii)+'_lowPrice']=0
            elif i=='reductionSens':
                list_index.append(str(i)+'_'+str(ii)+'_noreduc')
                list_index.append(str(i)+'_'+str(ii)+'_freeImm')
                list_index.append(str(i)+'_'+str(ii)+'_2iemGrat')
                list_index.append(str(i)+'_'+str(ii)+'_3iemGrat')
                list_index.append(str(i)+'_'+str(ii)+'_carte')
                list_index.append(str(i)+'_'+str(ii)+'_autres')
                product_info[str(i)+'_'+str(ii)+'_noreduc']=0
                product_info[str(i)+'_'+str(ii)+'_freeImm']=0
                product_info[str(i)+'_'+str(ii)+'_2iemGrat']=0
                product_info[str(i)+'_'+str(ii)+'_3iemGrat']=0
                product_info[str(i)+'_'+str(ii)+'_carte']=0
                product_info[str(i)+'_'+str(ii)+'_autres']=0
            else:
                list_index.append(str(i)+'_'+str(ii))
                product_info[str(i)+'_'+str(ii)]=0
    list_index.append('quantity_unit')
    list_index.append('quantity_family')
    list_index.append('travelTime')
    list_index.append('mag_Fnac')
    list_index.append('mag_Carrefour')
    list_index.append('mag_Monoprix')
    list_index.append('mag_Autres')
    product_info['quantity_unit']=0
    product_info['quantity_family']=0
    product_info['travelTime']=0
    product_info['mag_Fnac']=0
    product_info['mag_Carrefour']=0
    product_info['mag_Monoprix']=0
    product_info['mag_Autres']=0

    return list_index

def matrixProductInit(list_index):
    return pd.DataFrame(index=list_index)
    

# Get user data

In [7]:
##############################################################################################
########################## Recuperation des donnees sur lutilisateur #########################
############################################################################################## 
    
def locUser(features_df):
    return features_df['features']['localisationUser']

def getUserId(features_df):
    return features_df['features']['idUser']

def modeUser(user_info):
    return ['walking'] #'bicycling','walking','transit','driving'

# Tester features mapping ou connecteur mysql direct
def getUsersMatrix(user_id):
    time_mysql=time()
    conn= mysql.connector.connect(host='localhost',database='dataiku',user='dkuadmin',password='Dataiku!')
    req = "SELECT * from userinformations where user_id=\'"+user_id+"\'"
    couponscores = pd.read_sql(req, conn)
    conn.close()
    print 'Time mysql :',time()-time_mysql
    return couponscores
    

# Get product categories

In [17]:
##############################################################################################
################################# Get categories des produits ################################
##############################################################################################

    
#### Get pickle and fit new data ####
    
def getCategory(df_topredict):
    """Get Categories of the product resulting from classification multi-label
    Return : prediction in columns category order"""

    # Prepare dataset to predict
    df_topredict=fromDFToInput(df_topredict)

    # Predict result
    predict_time=time()
    result_raw=model.predict(df_topredict)
    print 'PREDICT PICKLE', time()-predict_time
    result = pipeline_output_formatting(result_raw, list_cat)

    # Cleaning df to predict
    del df_topredict['NomDescr']
    df_topredict['analytics_category']=result
    #print 'Categories differentes :', df_topredict['analytics_category'].unique()
    return df_topredict

def fromDFToInput(df):
    df.fillna('', inplace=True)

    time_norm=time()            

    df['nomProduit']=df['nomProduit'].apply(lambda x:normaliz(x,french_stopwords,reg_numb,reg_ponct,stemmer))
    df['descriptionProduit']=df['descriptionProduit'].apply(lambda x:normaliz(x,french_stopwords,reg_numb,reg_ponct,stemmer))

    df['NomDescr']=df['nomProduit']+' '+df['descriptionProduit']

    return df

### Formatting the output of the pipeline
def pipeline_output_formatting(result_raw, list_cat):
    result = []
    for x in result_raw:
        temp = []
        for idy, y in enumerate(x):
            if y>0:
                temp.append(list_cat[idy])
        result.append(temp)
    return result    

# Enrich product information

In [18]:
##############################################################################################
######################### Enrichissement des donnees sur les produits ########################
##############################################################################################


### Add columns to result for transposition into matrix_product ###
def enrichDataProduct(product_info, locUser, modeUser):

    # Add travel time
    product_info=getTpsTrajet(product_info,locUser,modeUser)
    #print 'Temps trajet :', product_info['walking'].unique()
    # Add bio and surgele
    product_info['bio']=(product_info['nomProduit'].str.contains("bio")|product_info['nomProduit'].str.contains("certifie AB"))
    #print 'Bio :', product_info['bio'].unique()
    product_info['surgele']=product_info['nomProduit'].str.contains("surgele")
    # Caculate new columns of quantity and unity
    product_info['unite']=product_info.descriptionProduit.map(lambda x:getUnit(x).replace(' ',''))
    product_info['quantite']=product_info.descriptionProduit.map(lambda x:getQuantity(x))
    product_info['unite_val']=product_info.unite.map(lambda x:dictUnit(x))
    product_info['quantite_unite']=product_info['unite_val']*product_info['quantite']
    product_info['price_unit']=product_info['prix']/product_info['quantite_unite']
    product_info['price_unit']=product_info.apply(bestPriceUnit,axis=1)
    #print 'Price unit :', product_info['price_unit'].unique()
    del product_info['unite_val']

    # Add column analytics category
    product_info=getCategory(product_info)   

    # Quantile
    #product_info['quantile_prixraw']=product_info.groupby(['requete'])['prix'].apply(lambda x: qcut_custom(x,4))
    product_info['quantile_prixraw']=0
    product_info['quantile_prixquantite']=product_info.groupby(['requete'])['price_unit'].apply(lambda x: pct_rank_qcut(x,6))
    product_info['quantile_prixquantite']=product_info['quantile_prixquantite'].to_dict().values()
    # Reduction type
    product_info['reduc_type']=product_info['reduc'].map(lambda x:getReducType(x))

    # Univers de consommation
    regex = re.compile('[\[\] ]')
    product_info['analytics_category_list']=product_info['analytics_category']
    product_info['analytics_category_count']=product_info['analytics_category_list'].map(lambda x:len(x))
    #print 'LISTANALYTICS', type(product_info['analytics_category_list'][0])

    # Enrichissement des donnees des produits pour quils correspondent avec la matrice utilisateur
    listCatProd=['1000','1001','1002','1003','1004','1005','1006','1007','1008','1009','1010','1011',
                 '2000','2001','2002','2003','2004','2005','2006','2007','2008','2009','2010','2011','2012']

    for i in listCatProd:
        # Univers Conso
        product_info['universConso_'+str(i)]=product_info['analytics_category_list'].map(lambda x:1 if i in x else 0)/product_info['analytics_category_count']
        # Price sensitivity
        product_info['prixSens_'+str(i)+'_lowPrice']=product_info['quantile_prixquantite'].map(lambda x:float(abs(x-3))/2 if x<4 else 0)*product_info['universConso_'+str(i)]
        product_info['prixSens_'+str(i)+'_highPrice']=product_info['quantile_prixquantite'].map(lambda x:float((x%4))/2 if x>3 else 0)*product_info['universConso_'+str(i)]
        # Reduction Sensitivity
        list_red=['carte','noreduc','2iemGrat','3iemGrat','freeImm','autres']
        for y in list_red:
            product_info['reductionSens_'+str(i)+'_'+y]=product_info['reduc_type'].map(lambda x:1 if x==y else 0)*product_info['universConso_'+str(i)]

        # Bio Sensitiv
        product_info['bioSens_'+str(i)]=product_info['bio']*product_info['universConso_'+str(i)]
    del product_info['analytics_category_count']
    # Magasin
    product_info['mag_Fnac']=product_info['magNom'].map(lambda x:1 if 'fnac' in x.lower() else 0)
    product_info['mag_Carrefour']=product_info['magNom'].map(lambda x:1 if 'carrefour' in x.lower() else 0)
    product_info['mag_Monoprix']=product_info['magNom'].map(lambda x:1 if 'monop' in x.lower() else 0)
    product_info['mag_Autres']=product_info['magNom'].map(lambda x:1 if 'other' in x.lower() else 0)    


    return product_info

# Calculating similarities with past buy

In [23]:
##############################################################################################
############################ Calculating similarities with past buy ##########################
##############################################################################################

def getHistoricBuy(features_df):
    """ Get all the past buyings of the user. Put them into a database"""
    time_history=time()
    # Connecting to DB
    conn= mysql.connector.connect(host='localhost',database='dataiku',user='dkuadmin',password='Dataiku!')

    req='select * from dataiku.userlistecourse where (clipped=True or burned=True) and userId=\''+features_df['features']['idUser']+'\''
    df_buy = pd.read_sql(req, conn)
    conn.close()
    #print 'Time get History :',time()-time_history
    return df_buy

def asHist(old_df):
    """ Return True if the user has history"""
    result=False
    if old_df.shape[0]>0:
        result=True
    return result

def mergeNewOldDB(bon_plans,features_df,old_plans):
    """ Merge the new bons plans and the ones bought in the pass by the user"""
    # Get weight
    old_plans['weight']=old_plans.apply(getWeights,axis=1)
    #print old_plans['weight']

    op=old_plans[['idProd','nomProduit','descriptionProduit','prix','weight']]
    op['from']='old'
    # Get new bons plans
    bon_plans['weight']=0
    np=bon_plans[['id','nomProduit','descriptionProduit','prix','weight']]
    np['from']='new'
    # Merge data
    return pd.concat([op,np])

def similarityCalculation(df):
    """Calculate the similarity between products"""

    # Creating common index
    df['id2']=range(df.shape[0])
    df.set_index('id2',inplace=True)

    # Create a full description columns
    df.loc[:,'fulldescriptionProduit']=pd.Series(df.loc[:,'nomProduit'].str.cat(df.loc[:,'descriptionProduit'].values, sep=' '),index=df.index)


    # Scale the price with MinMaxScaler : price is in [0,1] range (for the distance)
    # Price is very low for the majority of data => log to recenter the distribution
    scaler = MinMaxScaler()
    df['prix']=df['prix'].map(lambda x:x if x>0 else 0.00001)
    df.loc[:,'scaledlogprix']=scaler.fit_transform(np.log(df.loc[:,'prix'].values).reshape(-1, 1))
    #print df

    # TFIDF
    tfidf_vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.8,ngram_range=(1,2))
    tfidf_productDescr=tfidf_vectorizer.fit_transform(df['fulldescriptionProduit'])

    ## Concatenate index / TFIDF product name / Product price - Dense matrix
    dist_data_dense=np.hstack((df.index.values.reshape(-1,1),tfidf_productDescr.toarray(),df.loc[:,'scaledlogprix'].values.reshape(-1,1)))

    def custom_distance(x, y):
        """ Creating the custom distance function between records"""
        i, j = int(x[0]), int(y[0])  # extract index which is id from pandas df
        # Computes cosine similarity on tf idf features (all features execpt id (0) and price (-1))
        tfidf_dist=cosine(dist_data_dense[i,1:-1],dist_data_dense[j,1:-1])
        # Price distance - absolute values
        price_distance=np.abs(dist_data_dense[i,-1]-dist_data_dense[j,-1])
        return tfidf_dist+price_distance

    ## Distinction between old and new products
    nb_old=df[df['from']=='old'].shape[0]
    nb_new=df.shape[0]-nb_old
    print 'NB_OLD and NB_NEW:',nb_old, nb_new


    ## Distance pairwise
    dist_df=pd.DataFrame(dist_data_dense)
    return pd.DataFrame(pairwise_distances(dist_df.iloc[0:nb_old],dist_df.iloc[nb_old:],metric=custom_distance).transpose(),columns=['Row'+str(i) for i in range(1, nb_old+1)],index=df.id[nb_old:])

def convertNeighborUnit(df):
    """ Convert the 0: close to 2:far unit into a 0:far to 1:close unit"""
    for i in df.columns:
        #for y in range(len(df[i])):
            #df.ix[y,i]=fabs(df.ix[y,i]-2)/2
        df[i]=df[i].apply(lambda x:float(x)/2)
    return df

def getWeights(df):
    result=0
    if df['burned']:
        result=1
    elif df['clipped']:
        result=.5
    return result

def getHistoricalWeight(df_with_weight, df_with_neight):
    # Get weight clipped or burned
    df_weight=df_with_weight.ix[df_with_weight['weight']>0,'weight']
    # multiplication weight with neightbor
    df_res=pd.DataFrame(df_with_neight.values*df_weight.transpose().values, columns=df_with_neight.columns, index=df_with_neight.index)
    df_res['hist_user_dist']=df_res.sum(axis=1)/df_res.shape[1]

    return df_res

def mergeScoringToListeCourse(product_info,df_score):
    """ Merge the scoring values to the bons plans """
    #print product_info.columns
    #print df_score.columns
    df_score2=df_score[['hist_user_dist']]
    return pd.merge(product_info, df_score2, left_on='id',right_on=df_score2.index.values,how='left')
    

# Similarity with request

In [11]:
##############################################################################################
################################### Similarity with request ##################################
##############################################################################################
    
def similarityRequest(df):
    """Calculate the similarity between the request and the responses"""
    result=pd.DataFrame(columns=['requete','nomProduit','id','dist_req'])
    #print df.columns
    for i in df['requete'].unique():
        # df_temp
        df_temp=df[['requete','nomProduit','id']]
        df_temp=df_temp[df_temp['requete']==i]
        df_temp=pd.DataFrame(np.array([[i, normaliz(i,french_stopwords,reg_numb,reg_ponct,stemmer), 'requete']])
                             , columns=['requete','nomProduit','id']).append(df_temp, ignore_index=True)
        # First words of the result
        len_req=len(i.split(' '))
        df_temp['shortNom']=df_temp['nomProduit'].map(lambda x:' '.join(x.split(' ')[0:len_req]))
        # TFIDF
        tfidf_vectorizer = TfidfVectorizer(min_df=0.001,max_df=0.8,ngram_range=(1,1))
        try:
            #print 'GO INTO REQUEST SIMILARITY'
            tfidf_productDescr=tfidf_vectorizer.fit_transform(df_temp['shortNom'])
            ## Distance cosinus
            dist_df=pd.DataFrame(tfidf_productDescr.toarray())
            res_temp=pairwise_distances(dist_df.iloc[0:1],dist_df.iloc[1:],metric='euclidean')
            df_temp=df_temp[df_temp['id']!='requete']
            df_temp.loc[:,'dist_req']=res_temp.transpose()
            df_temp.loc[:,'dist_req']=1-MinMaxScaler().fit_transform(df_temp.loc[:,'dist_req'])
            #print 'RES TEMP :', res_temp
        except:
            df_temp.loc[:,'dist_req']=0
        result=result.append(df_temp,ignore_index=True)
    return result

def addReqDistCol(df):
    result_dist=similarityRequest(df)
    result_dist.set_index('id',inplace=True)
    del result_dist['nomProduit']
    del result_dist['shortNom']
    del result_dist['requete']
    return pd.merge(df,result_dist,left_on='id',right_on=result_dist.index.values,how='left')
        

# Multipliation matrix 

In [12]:
##############################################################################################
################################### Multipliation matrix #####################################
##############################################################################################

def settingMatrix( matrix_user,product_info):
    # Setting the product matrix with only the necessarly data
    del matrix_user['age']
    del matrix_user['sexe']
    del matrix_user['situation']
    del matrix_user['zone']
    cols = [col for col in matrix_user.columns if col not in ['user_id']]
    matrix_product=product_info[cols]
    matrix_product.index=product_info['id']
    #matrix_product.to_csv('test_export_matrix_product.csv',sep=';')
    #product_info.to_csv('test_export_product_info.csv',sep=';',encoding='utf-8')
    return matrix_product

def multiData(matrix_user,matrix_product):
    # Multiply the data
    return pd.DataFrame(matrix_product.values*matrix_user.values, columns=matrix_product.columns, index=matrix_product.index)

def scoring( matrix_result_temp):
    # Summing by columns
    return matrix_result_temp.sum(axis=1).to_frame('score_analytics')

def sortingData(product_info,matrix_result,asHist,dict_pond_mod):
    # Transforming into dataframe for sorting
    product_info.index=product_info['id']
    # Concate simil and user form
    if ~asHist:
        product_info['hist_user_dist']=0

    result_final=pd.merge(product_info,matrix_result, left_on=product_info.index.values,right_on=matrix_result.index.values,how='left')  
    # Combining recommendation systems
    #print result_final['score_analytics'].unique()
    #print result_final['hist_user_dist'].unique()
    # Suppress price=0
    result_final=result_final[result_final['prix']>0]
    #print result_final['dist_req'].unique()
    result_final['order_reco']=result_final['score_analytics']*dict_pond_mod['form_user']+result_final['hist_user_dist']*dict_pond_mod['hist_user']+result_final['dist_req']*dict_pond_mod['req_sim']
    return result_final.groupby(['walking','magId','requete']).apply(lambda x:x.sort_values(by=['walking','order_reco','position','price_unit'], ascending=[True,False,True,True]))

# Mapping result to json

In [13]:
##############################################################################################
################################# Mapping result to json #####################################
##############################################################################################
    
# Reverse mapping : DataFrame to json
def mapToJson(select_final):
    """Conversion du dataframe de resultats en json pour etre envoye a apps

    Args:
        result: resultats avec le score des produits pour lutilisateur

    Returns:
        json de resultat
    """
    res=collections.OrderedDict()
    for ids,id_mag in enumerate(select_final.magId.unique()):
        #print id_mag
        #data={}
        df_req=select_final[select_final['magId']==id_mag]
        firsts=df_req.groupby('requete').first()
        data2={}
        for i in range(len(firsts)):
            #print 'i', i
            #data2[firsts.index[i]] = firsts['nomProduit'].values[i]
            data2[firsts.index[i]] = firsts['id'].values[i]
        res[id_mag]=data2
        #res.append(data)
    return res
    

## Saving results

In [25]:
##############################################################################################
################################# Saving the results #########################################
##############################################################################################

def saveResults(features_df,result_final):
    """ Save the recommandation into a database """
    # Shaping the dataframe
    df_to_save=pd.DataFrame(result_final)
    df_to_save['userId']=features_df['features']['idUser'][0]
    df_to_save['date']=time()
    df_to_save['clipped']=''
    df_to_save['burned']=''
    del df_to_save['analytics_category_list']
    del df_to_save['key_0']
    df_to_save.rename(columns={'id': 'idProd'}, inplace=True)

    # First way : dataiku
    #time_dtk=time()
    #output_ds = dtk.Dataset('userListeCourse_prepared',project_key='BONSPLANS')
    #output_ds.write_from_dataframe(df_to_save)
    #print 'Time Saving Dataiku :',time()-time_dtk

    # Second way : sql

    # Writing the data
    del df_to_save['magCat']
    del df_to_save['magHor']

    time_msql=time()
    # Connecting to DB
    #time_connect_db=time()
    engine = create_engine('mysql+mysqlconnector://dkuadmin:Dataiku!@localhost:3306/dataiku', echo=False)
    #print 'Time to connect to DB :', time()-time_connect_db

    #sql.write_frame(df_to_save,name='userlistecourse',con=conn, if_exists='append', flavor='mysql',index=False)
    df_to_save.to_sql(name='userlistecourse',con=engine, if_exists='append',index=False)
    conn.close()
    #req='INSERT INTO userlistecourse 
    #pd.read_sql(req, conn)


    #print 'Time mysql :',time()-time_msql

# Predict

In [27]:
##############################################################################################
##################################### Prediction ou calcul ###################################
##############################################################################################          
        
def predict( features_df):

    # Note: this sample uses the second form (decision_series, proba_df)

    # Note: this sample "cheats" and always returns 5 predictions.
    # You should actually return 1 prediction per row in the features_df

    ##### Liste des variables #####
    #list_prod=['pain','oeuf','champagne','eau']
    list_cat={'1000':'Alimentation','1001':'Puericulture et Enfants','1002':'Equipements de la maison et High-Tech',
      '1003':'Sports, Loisirs et Culture','1004':'Bricolage, Decoration, Jardinerie et Animalerie',
      '1005':'Mode et Accessoires','1006':'Auto et Moto','1007':'Beaute Sante et Bien-etre',
      '1008':'Hotel Restaurant et Cafes','1009':'Banques et Assurances','1010':'Voyages et transports',
      '1011':'Services',
      '2000':'Bebe','2001':'Boissons','2002':'Boucherie','2003':'Boulangerie','2004':'Charcuterie et Traiteur',
      '2005':'Cremerie','2006':'Epicerie salee','2007':'Epicerie sucree','2008':'Fruits frais','2009':'Legumes frais',
      '2010':'Produits de la mer','2011':'Produits dietetiques','2012':'Surgeles'}




    #print "Features DataFrame %s" % features_df

    # Get ponderation models
    dict_pond_mod=getDictPond()

    # Get raw data
    get_raw_time=time()
    rawJsonLC=mapFeatureDF(features_df)
    get_r_d=time()-get_raw_time
    print 'Temps recuperation raw data : ',get_r_d

    # Get initial product infos
    get_coupon_time=time()
    product_info_raw=getCouponsData(rawJsonLC)
    if len(product_info_raw.index)>0:
        product_info=joinBPMag(rawJsonLC,product_info_raw)
        get_i=time()-get_coupon_time
        print 'Temps recuperation coupons : ',get_i

        # List columns to add into product_info
        time_mat_prod=time()
        list_user=['universConso','bioSens','prixSens','reductionSens']
        list_index=matrixProductColumnsAndMajProducts(product_info,list_user,list_cat)

        # Creation of matrix product
        matrix_product=matrixProductInit(list_index)
        get_p=time()-time_mat_prod
        print 'Temps creation matrix product empty', get_p

        # Get user  infos
        get_user_info_time=time()
        loc_user=locUser(features_df)
        user_id=getUserId(features_df)
        matrix_user=getUsersMatrix(user_id)
        mode_user=modeUser(matrix_user)
        get_us=time()-get_user_info_time
        print 'Temps recuperation infos user : ',get_us

        # Enrich product infos
        get_product_infos=time()
        product_info=enrichDataProduct(product_info,loc_user, mode_user)
        #product_info.to_csv('n_enrich_product.csv',sep=';')
        get_produ=time()-get_product_infos
        print 'Temps enrichissement produit: ',get_produ

        # Mutlipy the matrix
        multi_matrix_time=time()
        matrix_product=settingMatrix(matrix_user,product_info)
        del matrix_user['user_id']
        matrix_result_temp=multiData(matrix_user,matrix_product)
        matrix_result=scoring(matrix_result_temp)
        #matrix_result.to_csv('n_result.csv',sep=';')
        get_mutl=time()-multi_matrix_time
        print 'Temps multiplication matrice: ',get_mutl

        # Get historic buyings
        time_simil=time()
        # Get bought bons plans
        old_plans=getHistoricBuy(features_df)
        asH=asHist(old_plans)
        if asH:
            old_new_bp=mergeNewOldDB(product_info,features_df,old_plans)
            res=similarityCalculation(old_new_bp)
            res_scale=convertNeighborUnit(res)
            result=getHistoricalWeight(old_new_bp, res_scale)
            product_info=mergeScoringToListeCourse(product_info,result)
            del product_info['weight']
        time_similarity=time()-time_simil
        print 'Time calculate similarity : ', time_similarity

        # Request similarity
        time_reqsim=time()
        product_info=addReqDistCol(product_info)
        time_reqsimil=time()-time_reqsim
        print 'Time Request Similarity : ', time_reqsimil

        # Sort data
        sort_data_time=time()
        result_final=sortingData(product_info,matrix_result,asH,dict_pond_mod)
        #result_final.to_csv('test_export_result_final.csv',sep=';',encoding='utf-8')
        #result_final.to_csv('n_result_final.csv',sep=';',encoding='utf-8',decimal=',')
        sort_time=time()-sort_data_time
        print 'Temps sorting data: ',sort_time

        # Mapping the result
        time_map_result=time()
        res=mapToJson(result_final)
        json_data=json.dumps(res,indent=3)
        time_map_js=time()-time_map_result
        print 'map results: ',time_map_js

        # Saving the results
        time_resultsave=time()
        #saveResults(features_df,result_final)
        save_t=time()-time_resultsave
        print 'save results: ',save_t

        # predictions, one per record (features_df row)
        predictions = pd.Series(json_data)

        # Printing execution time
        print 100*'*'
        print 'Temps recuperation raw data : ',get_r_d
        print 'Temps recuperation coupons : ',get_i
        print 'Temps creation matrix product empty', get_p
        print 'Temps recuperation infos user : ',get_us
        print 'Temps enrichissement produit: ',get_produ
        print 'Temps multiplication matrice: ',get_mutl
        print 'Temps calculate similarity : ', time_similarity
        print 'Temps request similarity : ', time_reqsimil
        print 'Temps sorting data: ',sort_time
        print 'Temps map results: ',time_map_js
        print 'Temps save results: ',save_t
        print 50*'*'
        print 'Total time: ',time()-get_raw_time
        print 100*'*'
    else :
        predictions = pd.Series([json.dumps([''],indent=3)])

    return (predictions)

In [28]:
features_df = json.loads(open('../inputs_bouchons/liste_course.json').read())
predict(features_df)

Temps recuperation raw data :  6.91413879395e-06
RangeIndex(start=0, stop=124, step=1)
Temps recuperation coupons :  0.0541110038757
Temps creation matrix product empty 0.080983877182
Time mysql : 0.276537895203
Temps recuperation infos user :  0.276617050171
PREDICT PICKLE 0.0174038410187
Temps enrichissement produit:  0.40919303894
Temps multiplication matrice:  0.00867199897766
Time calculate similarity :  0.858129024506
Time Request Similarity :  0.154882907867
Temps sorting data:  0.0323619842529
map results:  0.00736594200134
save results:  9.53674316406e-07
****************************************************************************************************
Temps recuperation raw data :  6.91413879395e-06
Temps recuperation coupons :  0.0541110038757
Temps creation matrix product empty 0.080983877182
Temps recuperation infos user :  0.276617050171
Temps enrichissement produit:  0.40919303894
Temps multiplication matrice:  0.00867199897766
Temps calculate similarity :  0.858129024

0    {\n   "aHR0cHM6Ly93d3cubW9ub3ByaXguZnIvbW9ub3A...
dtype: object