# XGBoost model building

In [1]:
import numpy as np
import pandas as pd
import time

In [2]:
data = pd.read_csv('train2.txt',sep='|')

In [3]:
data.head()

Unnamed: 0,ID_Customer,Cod_Prod,Cod_Fecha,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05
0,A0000001,601,2007-05,5,4,3,1,0
1,A0000001,704,2013-04,5,4,3,1,0
2,A0000001,2501,2006-03,5,4,3,1,0
3,A0000001,2503,2006-03,5,4,3,1,0
4,A0000001,1011,2011-04,5,4,3,1,0


In [6]:
# Split the 'YYYY-MM' purchase date (Cod_Fecha) into separate Year / Month string columns.
# The vectorised .str accessor replaces a per-row Python lambda and propagates
# missing values instead of raising on them.
data2 = data.copy(deep=True)
data2['Year'] = data2.Cod_Fecha.str[:4]
data2['Month'] = data2.Cod_Fecha.str[-2:]
data2.head()

Unnamed: 0,ID_Customer,Cod_Prod,Cod_Fecha,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05,Year,Month
0,A0000001,601,2007-05,5,4,3,1,0,2007,5
1,A0000001,704,2013-04,5,4,3,1,0,2013,4
2,A0000001,2501,2006-03,5,4,3,1,0,2006,3
3,A0000001,2503,2006-03,5,4,3,1,0,2006,3
4,A0000001,1011,2011-04,5,4,3,1,0,2011,4


# Transformed data considering only the last N years (when most products are available)

In [7]:
N_YEARS=2017-1990

In [8]:
# Work on a new frame whose Year column is an integer, then keep only the
# purchases made within the last N_YEARS years.
dataLastN = data2.assign(Year=data2.Year.astype(int))
first_year_kept = 2017-N_YEARS
dataLastN = dataLastN[dataLastN.Year >= first_year_kept]

# Select the clients which are useful for a supervised classifier, that is, those with a recent product purchased

In [9]:
#Sort data by date
# A stable sort (mergesort) keeps the incoming row order among purchases that share
# the same Cod_Fecha, so the "last product" picked by the later groupby(...).last()
# calls is reproducible. Reassigning instead of inplace=True avoids mutating the
# filtered slice produced by the previous cell.
dataLastN = dataLastN.sort_values(by='Cod_Fecha', kind='mergesort')

In [10]:
#Last product of every customer
idAndLastProd = dataLastN.groupby('ID_Customer').last()
idAndLastProd.head()

Unnamed: 0_level_0,Cod_Prod,Cod_Fecha,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05,Year,Month
ID_Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A0000001,704,2013-04,5,4,3,1,0,2013,4
A0000002,9992,2015-02,5,5,1,1,0,2015,2
A0000003,201,2016-02,5,5,5,2,0,2016,2
A0000004,2301,2016-05,5,5,3,1,0,2016,5
A0000006,2302,2015-05,5,5,3,1,0,2015,5


In [11]:
idAndLastProd.shape

(670160, 9)

In [12]:
#Years from today in which last product is considered
YEARS_LAST_PRODUCT = 5

In [13]:
print 'Clients with older products (they fall out of the classifier): ', idAndLastProd[idAndLastProd.Year < 2017 - YEARS_LAST_PRODUCT].shape[0], idAndLastProd[idAndLastProd.Year < 2017 - YEARS_LAST_PRODUCT].shape[0]/float(idAndLastProd.shape[0])*100, '%'
print 'Clients with recent products: ', idAndLastProd[idAndLastProd.Year >= 2017 - YEARS_LAST_PRODUCT].shape[0], idAndLastProd[idAndLastProd.Year >= 2017 - YEARS_LAST_PRODUCT].shape[0]/float(idAndLastProd.shape[0])*100, '%'

Clients with older products (they fall out of the classifier):  148053 22.0921869404 %
Clients with recent products:  522107 77.9078130596 %


In [14]:
# Customer ids split by whether their last purchase falls within the recent window.
# Year is an integer column, so the negated mask is exactly "Year < cutoff".
has_recent_product = idAndLastProd.Year >= 2017 - YEARS_LAST_PRODUCT
clients_sup = idAndLastProd[has_recent_product].index.values
clients_unsup = idAndLastProd[~has_recent_product].index.values

In [15]:
#Returns the dataframe with the corresponding clients
def selectClients(data,clients):
    """Return the rows of ``data`` whose ``ID_Customer`` is listed in ``clients``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``ID_Customer`` column.
    clients : array-like
        Customer ids to keep.

    Returns
    -------
    pandas.DataFrame
        The matching rows, in their original order and with every column of ``data``.
    """
    # The empty frame the previous version allocated first was overwritten
    # immediately, so the boolean selection is all that is needed.
    return data[data.ID_Customer.isin(clients)]

In [16]:
# Purchase rows of the customers whose last product is recent (clients_sup)
data_sup = selectClients(dataLastN,clients_sup)
# (rows, columns) of that subset and the number of distinct customers in it
print data_sup.shape, data_sup.ID_Customer.unique().size
# Purchase rows of the remaining customers (clients_unsup)
data_unsup = selectClients(dataLastN,clients_unsup)
print data_unsup.shape, data_unsup.ID_Customer.unique().size

(2935685, 10) 522107
(361939, 10) 148053


## ID vs Products in last N years. Classifier input.

In [17]:
# Empty customer x product matrix: one row per customer id and one column per
# product code, both in ascending order, filled with integer zeros.
colnames = np.sort(data_sup.Cod_Prod.unique())
rownames = np.sort(data_sup.ID_Customer.unique())
matrix_shape = (rownames.size, colnames.size)
idVSProdLastN = pd.DataFrame(data=np.zeros(matrix_shape, dtype=int),index=rownames,columns=colnames,dtype=int)

In [18]:
groupedDataLastN = data_sup.groupby(['ID_Customer','Cod_Prod'])['Cod_Prod'].count()
groupedDataLastN.head(10)

ID_Customer  Cod_Prod
A0000001     601         1
             704         1
             1011        1
             2501        1
             2503        1
A0000002     301         1
             601         1
             801         1
             9992        1
A0000003     201         1
Name: Cod_Prod, dtype: int64

In [19]:
lastProduct = data_sup.groupby('ID_Customer')['Cod_Prod'].last()
lastProduct.head()

ID_Customer
A0000001     704
A0000002    9992
A0000003     201
A0000004    2301
A0000006    2302
Name: Cod_Prod, dtype: int64

### We put the last product to 0 in groupedDataLastN

In [20]:
# Build the (customer, last product) keys and zero those entries, so the product
# stored in lastProduct is not counted among the customer's purchases.
last_purchase_keys = list(zip(lastProduct.index.values, lastProduct.values))
groupedDataLastN.loc[last_purchase_keys] = 0
groupedDataLastN.head(10)

ID_Customer  Cod_Prod
A0000001     601         1
             704         0
             1011        1
             2501        1
             2503        1
A0000002     301         1
             601         1
             801         1
             9992        0
A0000003     201         0
Name: Cod_Prod, dtype: int64

In [21]:
#Save data on matrix
# For every product code, take its per-customer counts (cross-section of the second
# index level) and write them into that product's column.
for product in groupedDataLastN.index.levels[1].values:
    counts_per_customer = groupedDataLastN.xs(product, level=1)
    idVSProdLastN.loc[counts_per_customer.index, product] = counts_per_customer.values
idVSProdLastN.head()

Unnamed: 0,101,102,103,104,201,301,502,503,504,506,...,2706,2707,2801,2901,3001,3101,3401,9991,9992,9993
A0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0000002,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0000003,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0000004,0,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0000006,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


In [22]:
#Customer features: first non-null value of each socio-demographic column per customer
socio_demo_columns = ['Socio_Demo_01','Socio_Demo_02','Socio_Demo_03','Socio_Demo_04','Socio_Demo_05']
# Several columns must be selected with a list; indexing a GroupBy with a bare
# tuple of names is deprecated and rejected by newer pandas releases.
customers = data_sup.groupby('ID_Customer')[socio_demo_columns].first()
customers[:5]

Unnamed: 0_level_0,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05
ID_Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0000001,5,4,3,1,0
A0000002,5,5,1,1,0
A0000003,5,5,5,2,0
A0000004,5,5,3,1,0
A0000006,5,5,3,1,0


In [23]:
#New columns: one zero-initialised integer indicator per socio-demographic level
# (variable suffix, number of levels) in the order the columns must be appended
for variable, n_levels in [('01', 5), ('02', 5), ('03', 5), ('04', 2), ('05', 4)]:
    for level in range(1, n_levels + 1):
        idVSProdLastN['Socio_Demo_%s_%d' % (variable, level)] = np.zeros(idVSProdLastN.shape[0],dtype=int)

In [24]:
# Names of the one-hot indicator columns: Socio_Demo_<variable>_<level>, with
# 5, 5, 5, 2 and 4 levels for variables 01..05 respectively.
socioDemoCol = ['Socio_Demo_%02d_%d' % (variable, level)
                for variable, n_levels in enumerate([5, 5, 5, 2, 4], 1)
                for level in range(1, n_levels + 1)]

### One-hot encoding

In [25]:
# Add 1 to Socio_Demo_05 before computing the one-hot encoding.
# NOTE(review): the column is shifted in place, so re-running this cell adds 1 again;
# run it exactly once per freshly built `customers` frame.
customers.Socio_Demo_05 += 1
customers.values

array([[5, 4, 3, 1, 1],
       [5, 5, 1, 1, 1],
       [5, 5, 5, 2, 1],
       ..., 
       [1, 1, 1, 2, 1],
       [3, 2, 3, 1, 1],
       [3, 2, 2, 1, 1]])

In [26]:
#Introduce one-hot encoding to matrix.
# Locate the first indicator column by name instead of hard-coding its position (94),
# so the cell keeps working when the number of product columns changes.
start = idVSProdLastN.columns.get_loc('Socio_Demo_01_1')
# Number of levels of Socio_Demo_01 .. Socio_Demo_05, in the column order of `customers`
for i, ncols in enumerate([5,5,5,2,4]):
    # get_dummies yields one indicator column per distinct value. Rows are matched by
    # position, which relies on `customers` and `idVSProdLastN` listing customers in
    # the same order. The cast keeps the indicators as integers whatever dtype
    # get_dummies returns.
    idVSProdLastN.iloc[:,start:start+ncols] = pd.get_dummies(customers.values[:,i]).values.astype(int)
    start += ncols

In [27]:
pd.get_dummies(customers.values[:,0]).values

array([[0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1],
       ..., 
       [1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0],
       [0, 0, 1, 0, 0]], dtype=uint8)

In [28]:
idVSProdLastN.head()

Unnamed: 0,101,102,103,104,201,301,502,503,504,506,...,Socio_Demo_03_2,Socio_Demo_03_3,Socio_Demo_03_4,Socio_Demo_03_5,Socio_Demo_04_1,Socio_Demo_04_2,Socio_Demo_05_1,Socio_Demo_05_2,Socio_Demo_05_3,Socio_Demo_05_4
A0000001,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
A0000002,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
A0000003,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
A0000004,0,0,0,0,1,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
A0000006,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0


In [29]:
idVSProdLastN.shape

(522107, 115)

In [30]:
#Prediction
lastProduct.shape

(522107,)

# Save database to arff file

import csv

# Write the ARFF header (relation name, attribute declarations, @DATA marker) to
# wekaDataHeader.arff and the data rows to wekaData.arff.
# NOTE(review): attributes are declared from the products of dataLastN while the data
# columns come from idVSProdLastN (built from data_sup) — confirm both product sets match.
# NOTE(review): the header declares a `class` attribute but the data file written below
# holds only idVSProdLastN, without the lastProduct column — confirm how it is appended.
products = dataLastN.Cod_Prod.unique()
products.sort()

# (variable suffix, number of one-hot levels) in column order
socio_demo_levels = [('01', 5), ('02', 5), ('03', 5), ('04', 2), ('05', 4)]

# `with` closes the file even if one of the writes fails.
with open('wekaDataHeader.arff','w') as header_file:
    header_file.write('@RELATION cajamar\n\n')
    for product in products:
        header_file.write('@ATTRIBUTE ' + str(product) + ' NUMERIC\n')
    for variable, n_levels in socio_demo_levels:
        for level in range(1, n_levels + 1):
            header_file.write('@ATTRIBUTE socio_demo_%s_%d NUMERIC\n' % (variable, level))
    # Nominal class attribute: comma-separated list of every product code.
    header_file.write('@ATTRIBUTE class {' + ','.join(str(product) for product in products) + '}\n\n')
    header_file.write('@DATA\n')

# Data rows only: no column header and no index.
idVSProdLastN.to_csv('wekaData.arff',header=False,index=False)

# Model construction

In [2]:
import xgboost
from sklearn import model_selection
from sklearn.metrics import accuracy_score

ImportError: No module named xgboost

# Logistic Regression
### We create a logistic regression for every product

In [31]:
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
# Create an estimator instance: binding the bare class would make a later
# logist_reg.fit(X, y) fail, because fit has to be called on an instance.
logist_reg = LogisticRegression()

In [32]:
print 'N. of products: ', lastProduct.unique().size
print lastProduct.value_counts().tail(10)

N. of products:  87
1008    4
804     3
1014    2
1004    2
1305    2
1015    2
803     1
1308    1
2901    1
2703    1
Name: Cod_Prod, dtype: int64


log_reg_vect = []
train_size = idVSProdLastN.shape[0]/10
start_time = time.time()
for product in lastProduct.unique():
    x_train = idVSProdLastN[train_size:]
    y_train = lastProduct[train_size:]==product
    x_test = idVSProdLastN[:train_size]
    y_test = lastProduct[:train_size]==product
    logist_reg = LogisticRegression
    logist_reg.fit(x_train,y_train)
    log_reg_vect.append((product,logist_reg))
print 'Total time: ', time.time()-start_time, 'seconds'

In [43]:
# Time the whole fit of a single multinomial model over all products.
start_time = time.time()
# Size of each split: one thousandth of the customers
train_size = idVSProdLastN.shape[0]/1000
# Train on the last `train_size` rows ...
x_train = idVSProdLastN[-train_size:]
y_train = lastProduct[-train_size:]
# ... and evaluate on the first `train_size` rows
x_test = idVSProdLastN[:train_size]
y_test = lastProduct[:train_size]
# C is the inverse regularisation strength, so 1e5 means very weak regularisation
logist_reg = LogisticRegression(C=1e5,multi_class='multinomial',solver='newton-cg')
logist_reg.fit(x_train,y_train)
print 'Time:', time.time()-start_time
print logist_reg.predict(x_test)

Time: 13.5929961205
[ 704 2106  601 2302 1002  301 3001  201 2302  201  601  601 2601 2102  601
  301 2302 9991 2601  706 2201 2302 2302 3001  301  301 3001  301  601  301
  601 2601 2302 2301 2201 2301 2106 1022 2601  601 2102 9992  201 2102  201
  601 2302 2201  301  301  704 2302  201  601 3001 2302  201  201  601  301
 2302  706 1804  201 2106 2302 9993  601  601  706  601 1011  201  601  601
 2102 2102 2601 3001 2302  201 2301  201  201  601  704  301  201  601  201
 2302  201  601 2302 2601 2201 2301 2301 1022 2205  201 2301 2102 1802 2205
 2102 2301 9993  201  601  706 2102 2201  601 2106  301 2601  201  201  704
 2201  601 2302  601 3001  301  601 9992  601  601  601 1001 2102  301 1002
  601 2302 2301  201 2201  601 2102 9992  201 2102 2301 1802 2301 9991 9992
 2601 1002  201 2301  601 9992  301  601  601 2201  201  601  601  601  201
  706 9992  201 2205 2102  704 2301 2102  201 2102  706 2102  601  301  201
  704 2301 9993 2302 2205 2102  704 3001  301 2302  601 3001  601 22

In [152]:
y_test[logist_reg.predict(x_test)==601]

ID_Customer
A0000003     201
A0000016     704
A0000021    2301
A0000026     301
A0000041    2705
A0000043    1011
A0000052     201
A0000062     706
A0000072     704
A0000078     301
A0000092     704
A0000094    2705
A0000097    9992
A0000101     704
A0000104    2205
A0000117    1011
A0000124    9992
A0000129     706
A0000153     601
A0000159    2302
A0000167     301
A0000169     706
A0000175     301
A0000179     706
A0000181     704
A0000183    2302
A0000190    9992
A0000201     201
A0000217    2705
A0000221     707
            ... 
A0000479    2202
A0000490    2705
A0000498    2302
A0000503    2501
A0000516     201
A0000519    1022
A0000523    2205
A0000525    2302
A0000574    2205
A0000583     706
A0000589    2302
A0000601    2302
A0000606    1020
A0000617     301
A0000641    2205
A0000643    2301
A0000645    2205
A0000661     707
A0000663     704
A0000671     704
A0000676    2301
A0000687    9992
A0000693     201
A0000700    2705
A0000705    9991
A0000707    2302
A0000709    2302
A0

In [65]:
a= y_train.unique()
a.sort()
a

array([ 201,  301,  601,  704,  706,  707,  708,  801, 1001, 1002, 1011,
       1020, 1022, 1309, 1310, 1801, 1802, 1804, 2102, 2106, 2201, 2205,
       2206, 2301, 2302, 2401, 2501, 2601, 2602, 2701, 2704, 2705, 2706,
       2707, 3001, 9991, 9992, 9993])

In [80]:
# Predicted probability of every product for the first test customer.
probabilities = logist_reg.predict_proba(x_test[:1])[0]
# predict_proba orders its columns like logist_reg.classes_, so label them with that
# attribute instead of re-deriving the order from y_train.
ab = pd.Series(data=probabilities,index=logist_reg.classes_)
# Three most likely products
ab.nlargest(3)

704     1.000000e+00
301     1.392967e-09
2302    5.669144e-10
dtype: float64

In [81]:
print 'Score (R square):', logist_reg.score(x_test, y_test) 
print 'Training MSE: ', np.mean((logist_reg.predict(x_train) - y_train)**2)
print 'Test MSE: ', np.mean((logist_reg.predict(x_test) - y_test)**2)

Score (R square): 0.199233716475
Training MSE:  480295.783525
Test MSE:  10251743.1398


In [82]:
# Accuracy on every row except the last `train_size` ones (the rows used for training).
# NOTE(review): the training cell sets train_size = idVSProdLastN.shape[0]/1000, so this
# slice covers far more than the 9/10 the label mentions — confirm the intended split.
print 'Score with 9/10 of data:', logist_reg.score(idVSProdLastN[:-train_size],lastProduct[:-train_size])

Score with 9/10 of data: 0.303960044863


# Recommender based on user similarity (customer features and product purchases)

In [52]:
idVSProdLastN.head()

Unnamed: 0,101,102,103,104,201,301,502,503,504,506,...,Socio_Demo_03_2,Socio_Demo_03_3,Socio_Demo_03_4,Socio_Demo_03_5,Socio_Demo_04_1,Socio_Demo_04_2,Socio_Demo_05_1,Socio_Demo_05_2,Socio_Demo_05_3,Socio_Demo_05_4
A0000001,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
A0000002,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,1,0,1,0,0,0
A0000003,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,1,1,0,0,0
A0000004,0,0,0,0,1,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
A0000006,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0


In [66]:
customers.head()

Unnamed: 0_level_0,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05
ID_Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0000001,5,4,3,1,1
A0000002,5,5,1,1,1
A0000003,5,5,5,2,1
A0000004,5,5,3,1,1
A0000006,5,5,3,1,1


In [125]:
def simProdFreq(dataFrame,user1,user2,min_common_items=1,n_products=94,freq=None):
    """Naive similarity of two users based on the products they have in common.

    Every shared product contributes a weight ``1 - share``, where ``share`` is the
    product's fraction of all purchases in ``dataFrame``; rarer products therefore
    count more. The score is the mean weight of the shared products.

    Parameters
    ----------
    dataFrame : pandas.DataFrame
        Users as index; the first ``n_products`` columns hold product counts.
    user1, user2 : index labels of the two users to compare.
    min_common_items : int
        Minimum number of shared products required for a non-zero score.
    n_products : int
        Number of leading product columns (defaults to 94, the value that was
        previously hard-coded).
    freq : array-like, optional
        Precomputed weights, one per product column. When omitted they are
        computed from ``dataFrame``; passing them avoids repeating that column
        sum on every call.

    Returns
    -------
    0 when the users share no product or fewer than ``min_common_items``,
    otherwise a float: the mean weight of the shared products.
    """
    prod_user1 = dataFrame.loc[user1].iloc[:n_products]
    prod_user2 = dataFrame.loc[user2].iloc[:n_products]

    # A product is shared when BOTH users hold it. Testing `sum == 2` would also
    # match one user holding the product twice and the other not at all.
    shared = ((prod_user1 > 0) & (prod_user2 > 0)).values
    n_shared = int(shared.sum())
    if n_shared == 0 or n_shared < min_common_items:
        return 0

    if freq is None:
        # Weights come from the frame that was passed in (not from a notebook global).
        product_totals = dataFrame.iloc[:, :n_products].sum(axis=0)
        freq = 1 - product_totals/float(product_totals.sum())

    # Mean weight over the shared products; symmetric in user1 / user2.
    return np.asarray(freq, dtype=float)[shared].sum()/float(n_shared)

def simDist(feat_user1,feat_user2):
    """Similarity of two 5-element feature vectors; 1 means identical.

    The distance is the sum of squared differences of the first three features
    plus one for each of the last two features that differs. It is divided by the
    larger of the two feature sums and subtracted from 1, so the result can be
    negative for very different users.

    Parameters
    ----------
    feat_user1, feat_user2 : sequence, numpy array or pandas Series of 5 numbers.

    Returns
    -------
    float
    """
    # Plain arrays make the integer positions below unambiguous; on a Series with
    # string labels, integer keys depend on a deprecated positional fallback.
    feat_user1 = np.asarray(feat_user1)
    feat_user2 = np.asarray(feat_user2)
    squared_gap = np.sum(np.square(feat_user1[:3] - feat_user2[:3]))
    mismatches = int(feat_user1[3] != feat_user2[3]) + int(feat_user1[4] != feat_user2[4])
    scale = float(max(np.sum(feat_user1), np.sum(feat_user2)))
    return 1 - (squared_gap + mismatches)/scale

def simUserFeat(dataFrame,user1,user2):
    """Naive feature-based similarity of two users, never below 0.

    ``dataFrame`` holds the raw (not one-hot encoded) features, one row per user.
    The rows of ``user1`` and ``user2`` are compared with ``simDist`` and negative
    results are clamped to 0.
    """
    similarity = simDist(dataFrame.loc[user1], dataFrame.loc[user2])
    return max(0, similarity)

In [126]:
print simProdFreq(idVSProdLastN,'A0000001','A0000002')
print simUserFeat(customers,'A0000001','A0000002')

0.812256326499
0.642857142857


## Matrix of user similarities (10x10 for the moment)

In [84]:
users = customers.index.values[:10]
users

array(['A0000001', 'A0000002', 'A0000003', 'A0000004', 'A0000006',
       'A0000007', 'A0000008', 'A0000011', 'A0000013', 'A0000015'], dtype=object)

In [115]:
def computeSimMatrix(users):
    """Build the symmetric user-by-user similarity matrix for ``users``.

    Each off-diagonal entry is the average of the product-based similarity
    (``simProdFreq`` on ``idVSProdLastN``) and the feature-based similarity
    (``simUserFeat`` on ``customers``); the diagonal is left at 1.

    ``users`` is an array of user ids; the result is a float array of shape
    ``(users.size, users.size)``.
    """
    n_users = users.size
    matrix = np.ones((n_users, n_users), dtype=float)
    for row, user1 in enumerate(users):
        # Only the upper triangle is computed; each value is mirrored below it.
        for col in range(row + 1, n_users):
            user2 = users[col]
            score = 0.5*(simProdFreq(idVSProdLastN,user1,user2)+simUserFeat(customers,user1,user2))
            matrix[row, col] = score
            matrix[col, row] = score
    return matrix

In [138]:
start_time = time.time()
testMatrix = computeSimMatrix(users)
print 'Time:', time.time()-start_time

Time: 62.4714381695


In [139]:
print simProdFreq(idVSProdLastN,users[0],users[2])
print simUserFeat(customers,users[0],users[2])

0.983262194137
0.666666666667


In [140]:
testMatrix

array([[ 1.        ,  0.72755673,  0.82496443,  0.87279483,  0.87279483,
         0.80612816,  0.67279483,  0.82209483,  0.69709483,  0.80423769],
       [ 0.72755673,  1.        ,  0.46707848,  0.7893811 ,  0.7893811 ,
         0.8394615 ,  0.7476725 ,  0.57896443,  0.67535893,  0.81557157],
       [ 0.82496443,  0.46707848,  1.        ,  0.80041181,  0.80041181,
         0.25      ,  0.52263403,  0.88374514,  0.13888889,  0.66152292],
       [ 0.87279483,  0.7893811 ,  0.80041181,  1.        ,  0.92271443,
         0.8394615 ,  0.72271443,  0.82896443,  0.72779662,  0.82271443],
       [ 0.87279483,  0.7893811 ,  0.80041181,  0.92271443,  1.        ,
         0.8394615 ,  0.74316225,  0.85648898,  0.7426121 ,  0.82271443],
       [ 0.80612816,  0.8394615 ,  0.25      ,  0.8394615 ,  0.8394615 ,
         1.        ,  0.70612816,  0.74987816,  0.77279483,  0.87279483],
       [ 0.67279483,  0.7476725 ,  0.52263403,  0.72271443,  0.74316225,
         0.70612816,  1.        ,  0.72441225

In [130]:
customers.head(10)

Unnamed: 0_level_0,Socio_Demo_01,Socio_Demo_02,Socio_Demo_03,Socio_Demo_04,Socio_Demo_05
ID_Customer,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0000001,5,4,3,1,1
A0000002,5,5,1,1,1
A0000003,5,5,5,2,1
A0000004,5,5,3,1,1
A0000006,5,5,3,1,1
A0000007,5,5,2,2,1
A0000008,3,5,2,1,4
A0000011,4,5,4,2,1
A0000013,3,5,2,2,1
A0000015,4,5,2,2,1


## Take the 3 most similar users and recommend the most frequent product

In [131]:
recommendation = pd.Series(data=np.zeros(users.size),index=users)

In [None]:
def recommendProd(user,simVector):
    """Unfinished recommender: selects three entries of ``simVector``, returns nothing.

    ``user`` is not used yet and the function has no return statement.
    """
    
    # Positions of the 2nd-, 3rd- and 4th-largest values of simVector (in ascending
    # order of value). The largest value is skipped — presumably the user's own
    # entry, which is 1 on the diagonal of the similarity matrix — TODO confirm.
    top3users = np.argsort(simVector)[-4:-1]
    

In [141]:
vector = testMatrix[0]

In [166]:
vector

array([ 1.        ,  0.72755673,  0.82496443,  0.87279483,  0.87279483,
        0.80612816,  0.67279483,  0.82209483,  0.69709483,  0.80423769])

In [164]:
np.argsort(vector)

array([6, 8, 1, 9, 5, 7, 2, 3, 4, 0])

In [163]:
np.argsort(np.array([0,0,1,0,3]))

array([0, 1, 3, 2, 4])

In [154]:
groupedDataLastN.head(10)

ID_Customer  Cod_Prod
A0000001     601         1
             704         0
             1011        1
             2501        1
             2503        1
A0000002     301         1
             601         1
             801         1
             9992        0
A0000003     201         0
Name: Cod_Prod, dtype: int64

In [161]:
groupedDataLastN['A0000001']+groupedDataLastN['A0000002']

Cod_Prod
301     NaN
601     2.0
704     NaN
801     NaN
1011    NaN
2501    NaN
2503    NaN
9992    NaN
Name: Cod_Prod, dtype: float64