In [1]:
#%load_ext autoreload
#%autoreload 2

import pandas as pd
import numpy as np
import time
import turicreate as tc
from sklearn.model_selection import train_test_split

import sys
sys.path.append("..")
#import scripts.data_layer as data_layer

In [6]:
# Function to Categorise clients
def categorise(data,gender):
    #Filter Gender as Men from dump
    dump_data=data[data["Gender"]==gender]

    #Filter required columns
    data1=dump_data.drop(['Client_Category','Qty','Tax','Retail_Price','Mem_Disc','Oth_Disc'],axis=1)

    #Derive frequency of clients as featured column
    data1['Frequency']=data1.groupby("ClientID")["Created_Date2"].transform('nunique')
    #Derive total lifetime spendings of clients as featured column
    data1['Spendings']=data1.groupby("ClientID")["Total"].transform('sum')

    #Function to categorise clients based on frequency &  spendings
    def f(row):
        if row['Frequency'] >= 9:
            val = 'Loyal'
        elif row['Frequency'] == 1:
            val = 'Walkin'
        elif row['Frequency'] > 4 and  row['Spendings'] >= 50000:
            val = 'Premium'
        elif row['Frequency'] > 4 and  row['Spendings'] >= 35000:
            val = 'VIP'
        elif row['Frequency'] >=2 :
            val='Repeat'
        else:
            val="others"    
        return val

    #Derive category of clients based on frequency &  spendings as featured column
    data1['Client_Category'] = data1.apply(f, axis=1)

    #Save file to csv
    return data1

## Prepare Data
# Function to get freq data
def data_freq(data,client_category):
    #Modify Date column
    data[['Created_Date2']] = data[['Created_Date2']].applymap(str)\
                                                     .applymap(lambda s: "{}/{}/{}".format(s[4:6],s[6:], s[0:4]))
    #Filter Services
    data_1 = data[data['Type']=='S']
    #Filter Cliet category
    data_1 = data_1[data_1['Client_Category']==client_category]
    #Filter required columns
    data_1 = data_1[['ClientID','ProdID','Created_Date2']]

    #Clients unique
    customers = pd.DataFrame(np.unique(data_1['ClientID']))
    customers.columns = ['customerId']

    #Transactions
    transactions = data_1.groupby(['ClientID','Created_Date2']).agg({"ProdID":lambda x : "|".join(x)}).reset_index()
    transactions = transactions[['ClientID','ProdID']]
    transactions.columns = ['customerId','products']
    transactions['products'] = transactions['products'].apply(lambda x: [str(i) for i in x.split('|')])

    print(customers.shape)
    print(transactions.shape)

    s=time.time()

    data = pd.melt(transactions.set_index('customerId')['products'].apply(pd.Series).reset_index(), 
                 id_vars=['customerId'],
                 value_name='products') \
        .dropna().drop(['variable'], axis=1) \
        .groupby(['customerId', 'products']) \
        .agg({'products': 'count'}) \
        .rename(columns={'products': 'purchase_count'}) \
        .reset_index() \
        .rename(columns={'products': 'productId'})
    data['productId'] = data['productId'].astype(str)

    print("Execution time:", round((time.time()-s)/60,2), "minutes")
    print(data.shape)
    return data,customers

# Function to Create data dummy
def create_data_dummy(data):
    data_dummy = data.copy()
    data_dummy['purchase_dummy'] = 1
    return data_dummy

# Function to Create data normalized
def normalize_data(data):
    df_matrix = pd.pivot_table(data, values='purchase_count', index='customerId', columns='productId')
    df_matrix_norm = (df_matrix-df_matrix.min())/(df_matrix.max()-df_matrix.min())
    d = df_matrix_norm.reset_index()
    d.index.names = ['scaled_purchase_freq']
    return pd.melt(d, id_vars=['customerId'], value_name='scaled_purchase_freq').dropna()

# Function to Test train data split
def split_data(data):
    train, test = train_test_split(data, test_size = .2)
    train_data = tc.SFrame(train)
    test_data = tc.SFrame(test)
    return train_data, test_data

# Function to model selection
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    if name == 'popularity':
        model = tc.popularity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target)
    elif name == 'cosine':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target,
                                                    similarity_type='cosine')
    elif name == 'pearson':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='pearson')
    elif name == 'jaccard':
        model = tc.item_similarity_recommender.create(train_data, 
                                                    user_id=user_id, 
                                                    item_id=item_id, 
                                                    target=target, 
                                                    similarity_type='jaccard')
        
    recom = model.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return model

# Function to get product vs description
def product_descr(item_data,gender,client_category):
    item_data = item_data[item_data['Type']=='S']
    item_data = item_data[(item_data['Gender']==gender) & (item_data['Client_Category']==client_category)]
    item_data = item_data[['ProdID','Descr']].drop_duplicates()
    item_data.columns = ['productId','Descr']
    return item_data
    
# Load data
dump_data1=pd.read_csv("datadump_cleaned.csv")

#Load item description cleaned data
item_data = pd.read_csv("datadump_cleaned_v1.csv")
item_data = product_descr(item_data,'Men','Loyal')

# Generate data filtered by gender
data = categorise(dump_data1,"Men")
# Generate transformed data filtered by client category
data,customers = data_freq(data,'Loyal')

# Generate dummy table and scaled/normalized purchase table
data_dummy = create_data_dummy(data)
data_norm = normalize_data(data)

# Split dummy table and scaled/normalized purchase table
train_data_dummy, test_data_dummy = split_data(data_dummy)
train_data_norm, test_data_norm = split_data(data_norm)

# variables to define field names_constant variables include:
user_id = 'customerId'
item_id = 'productId'
users_to_recommend = list(customers[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to print the head / first few rows in a defined dataset

# Model
final_model = tc.item_similarity_recommender.create(tc.SFrame(data_dummy), 
                                            user_id=user_id, 
                                            item_id=item_id, 
                                            target='purchase_dummy', 
                                            similarity_type='jaccard')
#final_model = tc.item_similarity_recommender.create(tc.SFrame(data_norm), 
#target='scaled_purchase_freq', similarity_type='cosine', similarity_type='jaccard'

recom = final_model.recommend(users=users_to_recommend, k=n_rec)
#final_model.save('my_model')
recom.print_rows(n_display)
df_rec = recom.to_dataframe()

rec = df_rec
pd.merge(rec,item_data,on='productId').sort_values(by=['customerId','rank'])

(421, 1)
(5308, 2)
Execution time: 0.04 minutes
(2032, 3)


+--------------------+-----------+----------------------+------+
|     customerId     | productId |        score         | rank |
+--------------------+-----------+----------------------+------+
|  AARTHI9840277327  |  GOTRLLRD | 0.11992592470986503  |  1   |
|  AARTHI9840277327  |  GHCOCADE | 0.09497458594185966  |  2   |
|  AARTHI9840277327  |  GHOLIODY |  0.0903415594782148  |  3   |
|  AARTHI9840277327  |  GHCHSTHA |  0.0895000696182251  |  4   |
|  AARTHI9840277327  |  GHGLNOAM | 0.08482720170702253  |  5   |
|  AARTHI9840277327  |  GHHCUTNR |  0.0770931329045977  |  6   |
|  AARTHI9840277327  |  GHGLNTOP | 0.07513303416115898  |  7   |
|  AARTHI9840277327  |  GHALSTTM | 0.06333546979086739  |  8   |
|  AARTHI9840277327  |  GSFARIAL | 0.06166854075023106  |  9   |
|  AARTHI9840277327  |  GHCHSTOP |  0.0609325681413923  |  10  |
| AATHIVA9626640440  |  GSMOUBET | 0.23674056927363077  |  1   |
| AATHIVA9626640440  |  GSSHAVIN | 0.13111121455828348  |  2   |
| AATHIVA9626640440  |  G

Unnamed: 0,customerId,productId,score,rank,Descr
0,AARTHI9840277327,GHCOCADE,0.094975,2,COCONUT CADENCE H.MSG - (GEN)
338,AARTHI9840277327,GHOLIODY,0.090342,3,OLIVE ODYSSEY H.MSG - (GEN)
666,AARTHI9840277327,GHCHSTHA,0.089500,4,CHANGE OF STYLE HAIRCUT - (GEN)
960,AARTHI9840277327,GHGLNOAM,0.084827,5,GLOBAL -NON AMMONIA COLOR - (GEN)
1235,AARTHI9840277327,GHGLNTOP,0.075133,7,GLOBAL -NON AMMONIA COLOR - (GEN) - SENIOR
...,...,...,...,...,...
665,YUGENDRA9845511172,GHOLIODY,0.101636,5,OLIVE ODYSSEY H.MSG - (GEN)
959,YUGENDRA9845511172,GHCHSTHA,0.100657,6,CHANGE OF STYLE HAIRCUT - (GEN)
337,YUGENDRA9845511172,GHCOCADE,0.098899,7,COCONUT CADENCE H.MSG - (GEN)
1234,YUGENDRA9845511172,GHGLNOAM,0.082383,8,GLOBAL -NON AMMONIA COLOR - (GEN)


In [39]:
#data.groupby('customerId').sum().reset_index().sort_values(by='purchase_count',ascending=False).head(5)

### 8.2. User History

In [11]:
# example 2: organize a given table into a dataframe with customerId, single productId, and purchase count
purch = pd.melt(dump_data1.set_index('ClientID')['Descr'].apply(pd.Series).reset_index(), 
             id_vars=['ClientID'],
             value_name='Descr') \
    .dropna().drop(['variable'], axis=1) \
    .groupby(['ClientID', 'Descr']) \
    .agg({'Descr': 'count'}) \
    .rename(columns={'Descr': 'purchase_count'}) \
    .reset_index() \
    .rename(columns={'Descr': 'productId'})

purch.sort_values(by='purchase_count',ascending=False).head(10)
#purch[(purch.ClientID == 'NEETHU9886523332')]
#dump_data1.groupby('Descr')['ProdID'].count().reset_index().sort_values(by='ProdID',ascending=False).head(50)

Unnamed: 0,Descr,ProdID
560,REGULAR HAIRCUT - (GEN),12939
190,EYEBROWS THREADING - (LAD),9252
497,MOUSTACHE / BEARD TRIM - (GEN),4796
664,UPPER LIP THREADING - (LAD),4377
561,REGULAR HAIRCUT - (GEN) - SENIOR,4320
592,SHAVING - (GEN),1877
65,BEARD DESIGNING - (GEN),1799
85,CHANGE OF STYLE HAIR CUT - (LAD) - SENIOR,1186
86,CHANGE OF STYLE HAIRCUT - (GEN),1162
529,P.WAX- UNDER ARMS - (LAD),1059
