In [63]:
import pandas as pd
import itertools
import numpy as np    
from scipy.spatial.distance import squareform

### Load and clean working data

In [3]:
# Load the semicolon-separated customer file, print its (rows, columns) shape
# and display five randomly chosen rows (the sample is not seeded).
customerData = pd.read_csv('./groceries.csv', sep=";")
print(customerData.shape)
customerData.sample(5)

(10000, 10)


Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries
9927,9928,21,male,single,secondary,21000,good,1,entrepreneur,"hygiene articles,house keeping products"
7664,7665,69,male,married,primary,23000,excellent,3,retired,"dog food,pet care,shopping bags"
939,940,31,female,married,secondary,48000,very_good,5,housemaid,"sparkling wine,dish cleaner"
5804,5805,65,female,married,secondary,38000,good,3,retired,"sausage,beef,citrus fruit,tropical fruit,berri..."
947,948,51,female,single,tertiary,19000,fair,1,housemaid,"pork,tropical fruit,grapes,hard cheese,soda,na..."


In [8]:
# Number of missing values in each column of the raw data.
customerData.isna().sum()

Customer_ID             0
Age                     0
Sex                     0
Marital_Status          0
Education               0
Income                  0
Customer_Rating         0
Persons_in_Household    0
Occupation              0
Groceries               0
dtype: int64

In [14]:
# Show the dtype of every column, then take the builtin min of three columns.
# Income and Age are converted with pd.to_numeric in a later cell; this check
# shows what their smallest raw value looks like before that conversion.
print(customerData.dtypes)
customerData[["Persons_in_Household","Income","Age"]].apply(min)

Customer_ID              int64
Age                     object
Sex                     object
Marital_Status          object
Education               object
Income                  object
Customer_Rating         object
Persons_in_Household     int64
Occupation              object
Groceries               object
dtype: object


Persons_in_Household    1
Income                   
Age                      
dtype: object

In [171]:
# Work on a copy so the frame loaded from disk stays unchanged.
cData = customerData.copy()

In [172]:
# Convert Income and Age to numbers; entries that cannot be parsed become NaN.
for column in ("Income", "Age"):
    cData[column] = pd.to_numeric(cData[column], errors="coerce")

In [173]:
# Count the NaN values per column after the numeric conversion.
cData.isna().sum()

Customer_ID               0
Age                     473
Sex                       0
Marital_Status            0
Education                 0
Income                  477
Customer_Rating           0
Persons_in_Household      0
Occupation                0
Groceries                 0
dtype: int64

In [174]:
# Fill the missing Income / Age values with the mean of the respective column,
# then confirm that no NaN is left.
numericCols = ["Income", "Age"]
columnMeans = cData[numericCols].mean()
cData[numericCols] = cData[numericCols].fillna(columnMeans)
cData.isna().sum()

Customer_ID             0
Age                     0
Sex                     0
Marital_Status          0
Education               0
Income                  0
Customer_Rating         0
Persons_in_Household    0
Occupation              0
Groceries               0
dtype: int64

In [175]:
# Set the new index to be customer ID
# cData.set_index("Customer_ID", inplace=True)
# cData.sample(5)

In [176]:
# List the distinct values of each non-numeric attribute.
for column in ["Marital_Status", "Education", "Customer_Rating", "Occupation"]:
    print(cData[column].unique())

['married' 'single' 'divorced']
['primary' 'secondary' 'tertiary']
['very_good' 'good' 'fair' 'excellent' 'poor']
['retired' 'housemaid' 'blue-collar' 'unemployed' 'entrepreneur'
 'self-employed' 'management' 'services' 'unknown' 'technician']


In [177]:
# Factorize ordinal columns
# Every level is mapped to its position in the list, starting at 0.
eduMapper = {
    level: rank
    for rank, level in enumerate(["primary", "secondary", "tertiary"])
}
rateMapper = {
    level: rank
    for rank, level in enumerate(["poor", "fair", "good", "very_good", "excellent"])
}

In [178]:
# Encode the ordinal columns as integer ranks.
# `map` with a pass-through fallback is used instead of `replace`: swapping
# strings for ints via `replace` relies on an implicit object -> int downcast
# that newer pandas releases deprecate. Values without a mapper entry
# (including ranks that were already encoded) are left as they are, so the
# cell can be re-run safely.
cData["Education"] = cData["Education"].map(lambda level: eduMapper.get(level, level))
cData["Customer_Rating"] = cData["Customer_Rating"].map(lambda level: rateMapper.get(level, level))

### Compute dissimilarity
#### Jaccard similarity is used to compare the grocery sets <br> The overall dissimilarity is the average of the per-attribute dissimilarities


In [179]:
# Display five random rows of the cleaned, encoded frame (the sample is not seeded).
cData.sample(5)

Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries
9946,9947,34.0,female,married,1,15000.0,2,3,blue-collar,soda
1736,1737,53.0,male,married,1,33000.0,0,4,entrepreneur,"frankfurter,white bread,pastry,dog food,soda,s..."
4283,4284,53.381757,female,single,1,41000.0,4,1,blue-collar,frankfurter
4661,4662,43.0,female,married,1,38000.0,1,3,self-employed,"UHT-milk,condensed milk,coffee,cling film/bags"
9820,9821,39.0,female,married,0,43000.0,2,4,entrepreneur,"beef,citrus fruit,cream cheese,frozen fish,flo..."


In [180]:
# Smallest and largest observed value of each ordinal / numeric attribute.
# The dissimilarity functions divide by (max - min) of the respective column.
rateMin, rateMax = min(cData["Customer_Rating"]), max(cData["Customer_Rating"])
eduMin, eduMax = min(cData["Education"]), max(cData["Education"])
ageMin, ageMax = min(cData["Age"]), max(cData["Age"])
incMin, incMax = min(cData["Income"]), max(cData["Income"])

In [181]:
def jaccard_sim(set1, set2):
    """Return the Jaccard similarity of two collections.

    The similarity is the number of distinct shared items divided by the
    number of distinct items in either collection.

    Parameters
    ----------
    set1, set2 : iterable of hashable items
        Lists, sets or any other iterables. Both are converted to sets
        first, so repeated items are counted once and the result always
        lies in [0, 1].

    Returns
    -------
    float
        1.0 when both collections are empty (they are treated as identical
        instead of raising ZeroDivisionError), otherwise
        len(intersection) / len(union).
    """
    first, second = set(set1), set(set2)
    union = first | second
    if not union:
        return 1.0
    return len(first & second) / len(union)

In [182]:
def calcDis(cust1, cust2):
    """Return the mean dissimilarity between two customer records.

    Eight per-attribute dissimilarities are averaged with equal weight:

    * Sex, Marital_Status, Occupation: 0 if the values are equal, else 1.
    * Customer_Rating, Education, Age, Income: absolute difference divided
      by the column range taken from the notebook-level ``*Max`` / ``*Min``
      variables.
    * Groceries: 1 - ``jaccard_sim`` of the comma-separated item lists.

    ``cust1`` and ``cust2`` must support lookup by these column names
    (e.g. rows obtained with ``DataFrame.iloc``).
    """
    def scaledGap(column, low, high):
        # Absolute difference, scaled by the observed range of the column.
        return abs(cust1[column] - cust2[column]) / (high - low)

    # Nominal attributes: a mismatch costs 1, a match costs 0.
    sexDis = 1 if cust1["Sex"] != cust2["Sex"] else 0
    marDis = 1 if cust1["Marital_Status"] != cust2["Marital_Status"] else 0
    occDis = 1 if cust1["Occupation"] != cust2["Occupation"] else 0
    nominalPart = sexDis + marDis + occDis

    # Ordinal attributes.
    rateDis = scaledGap("Customer_Rating", rateMin, rateMax)
    eduDis = scaledGap("Education", eduMin, eduMax)

    # Numerical attributes.
    ageDis = scaledGap("Age", ageMin, ageMax)
    incDis = scaledGap("Income", incMin, incMax)

    # Set-valued attribute: the grocery items are stored as one comma-separated string.
    basket1 = cust1["Groceries"].split(",")
    basket2 = cust2["Groceries"].split(",")
    grocDis = 1 - jaccard_sim(basket1, basket2)

    total = nominalPart + rateDis + eduDis + ageDis + incDis + grocDis
    return total / 8


In [183]:
# The first three rows serve as a tiny data set for trying things out.
dummyStuff = cData.iloc[:3]
dummyStuff

Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries
0,1,75.0,male,married,0,20000.0,3,3,retired,"citrus fruit,semi-finished bread,margarine,rea..."
1,2,61.0,female,single,1,28000.0,2,1,housemaid,"tropical fruit,yogurt,coffee"
2,3,32.0,male,single,1,34000.0,3,1,blue-collar,whole milk


In [202]:
def calcForDf(df, dfToCompare=None):
    """Build the pairwise dissimilarity matrix between two customer frames.

    Parameters
    ----------
    df : pandas.DataFrame
        Customers that form the columns of the result.
    dfToCompare : pandas.DataFrame, optional
        Customers that form the rows of the result. Defaults to ``df``,
        which yields the full matrix of ``df`` against itself.

    Returns
    -------
    pandas.DataFrame
        Entry (i, j) holds ``calcDis(dfToCompare.iloc[i], df.iloc[j])``.
        Rows and columns are labelled by position (0, 1, 2, ...).
    """
    runDataframe = df if dfToCompare is None else dfToCompare

    # Slice every row out once instead of repeating the lookup in the inner loop.
    colCusts = [df.iloc[j] for j in range(len(df))]
    if runDataframe is df:
        rowCusts = colCusts
    else:
        rowCusts = [runDataframe.iloc[i] for i in range(len(runDataframe))]

    # Collect plain lists and build the frame in one go: growing a DataFrame
    # cell by cell through .loc re-allocates on every assignment and is very slow.
    matrix = [
        [calcDis(rowCust, colCust) for colCust in colCusts]
        for rowCust in rowCusts
    ]
    return pd.DataFrame(matrix)

In [261]:
# calcForDf(dummyStuff)
# # Takes too long
# disMatrix = calcForDf(cData)

### Search for the 10 NN of Customers w/ IDs 73, 563, 1603, 2200, 3703, 4263, 5300, 6129, 7800, 8555

In [277]:
def calcSimForOne(id, df, trim):
    """Rank the customers in ``df`` by similarity to the customer with ID ``id``.

    Similarity is ``1 - calcDis(target, other)``. The target itself is
    excluded from the ranking.

    NOTE(review): the target row is looked up in the notebook-level ``cData``
    frame, not in ``df`` -- confirm that this is intended.

    Returns a list of ``[Customer_ID, similarity]`` pairs ordered from most
    to least similar. If ``trim`` is truthy only the first ``trim`` pairs
    are returned, otherwise the complete list.
    """
    target = cData.loc[cData["Customer_ID"]==id].squeeze()
    others = df.loc[df["Customer_ID"]!=target["Customer_ID"]]

    scored = [
        [other["Customer_ID"], 1-calcDis(target, other)]
        for other in (others.iloc[pos] for pos in range(len(others)))
    ]

    # Highest similarity first; ties keep their original order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return scored[0:trim] if trim else scored


[1, 2]

In [189]:
# Rows of the ten customers whose nearest neighbours are requested.
pickedIds = [73, 563, 1603, 2200, 3703, 4263, 5300, 6129, 7800, 8555]
pickedCustomers = cData.loc[cData["Customer_ID"].isin(pickedIds)]
pickedCustomers.head()

Unnamed: 0,Customer_ID,Age,Sex,Marital_Status,Education,Income,Customer_Rating,Persons_in_Household,Occupation,Groceries
72,73,78.0,female,divorced,2,32000.0,3,2,retired,"frankfurter,citrus fruit,whole milk,domestic e..."
562,563,22.0,female,married,2,38000.0,2,2,housemaid,whole milk
1602,1603,24.0,male,single,0,40000.0,1,2,self-employed,"other vegetables,rolls/buns,canned fish"
2199,2200,24.0,male,divorced,0,32000.0,1,1,unknown,"rolls/buns,tea"
3702,3703,70.0,female,single,1,33000.0,4,2,retired,"citrus fruit,frozen vegetables,domestic eggs,b..."


In [288]:
# dfToRun is the pool of candidate neighbours; it is currently the full
# customer set. Point it at a smaller frame for a quicker trial run.
dfToRun = cData
# The loop variable is called custId so that the builtin id() is not
# overwritten in the notebook namespace.
for custId in pickedCustomers["Customer_ID"]:
    # The 10 most similar customers together with their similarity score.
    scores = pd.DataFrame(calcSimForOne(custId, dfToRun, 10), columns = ["Customer_ID", "Similarity Score"])
    print("\tCustomer ID:", custId)
    print(scores,"\n\n")


	Customer ID: 73
   Customer_ID  Similarity Score
0         1846          0.887203
1         1291          0.878961
2         1203          0.863634
3         3953          0.853480
4         6904          0.850184
5         5922          0.849465
6         8881          0.846340
7         3623          0.844805
8         4488          0.839961
9         7933          0.839380 


	Customer ID: 563
   Customer_ID  Similarity Score
0         3634          0.947187
1         6168          0.895594
2         1564          0.873134
3         4290          0.871226
4         2839          0.866726
5         4433          0.866393
6         9234          0.860017
7         6196          0.859902
8         6205          0.858221
9         3391          0.858209 


	Customer ID: 1603
   Customer_ID  Similarity Score
0          568          0.853161
1          109          0.852497
2         7345          0.852440
3         6751          0.844381
4         7335          0.842541
5         6841  

### Customer rating prediction
#### Customer classification to predict customer rating quality 

Re-write the similarity functions so that they leave out the customer rating, which is the value to be predicted.

In [305]:
def calcSimilarityNoRating(cust1, cust2):
    """Return the similarity of two customer records, ignoring Customer_Rating.

    Seven per-attribute dissimilarities are averaged with equal weight and
    the result is subtracted from 1:

    * Sex, Marital_Status, Occupation: 0 if the values are equal, else 1.
    * Education, Age, Income: absolute difference divided by the column
      range taken from the notebook-level ``*Max`` / ``*Min`` variables.
    * Groceries: 1 - ``jaccard_sim`` of the comma-separated item lists.

    ``cust1`` and ``cust2`` must support lookup by these column names.
    """
    def scaledGap(column, low, high):
        # Absolute difference, scaled by the observed range of the column.
        return abs(cust1[column] - cust2[column]) / (high - low)

    # Nominal attributes: a mismatch costs 1, a match costs 0.
    sexDis = 1 if cust1["Sex"] != cust2["Sex"] else 0
    marDis = 1 if cust1["Marital_Status"] != cust2["Marital_Status"] else 0
    occDis = 1 if cust1["Occupation"] != cust2["Occupation"] else 0
    nominalPart = sexDis + marDis + occDis

    # Ordinal attribute.
    eduDis = scaledGap("Education", eduMin, eduMax)

    # Numerical attributes.
    ageDis = scaledGap("Age", ageMin, ageMax)
    incDis = scaledGap("Income", incMin, incMax)

    # Set-valued attribute: the grocery items are stored as one comma-separated string.
    basket1 = cust1["Groceries"].split(",")
    basket2 = cust2["Groceries"].split(",")
    grocDis = 1 - jaccard_sim(basket1, basket2)

    meanDis = (nominalPart + eduDis + ageDis + incDis + grocDis) / 7
    return 1 - meanDis


In [361]:
def calcSim10ForOne(cust, df):
    """Return the ten customers in ``df`` that are most similar to ``cust``.

    Rows with the same Customer_ID as ``cust`` are skipped. Similarity comes
    from ``calcSimilarityNoRating``.

    Returns a list of at most ten ``[Customer_ID, Customer_Rating,
    similarity]`` triples ordered from most to least similar.
    """
    candidates = df.loc[df["Customer_ID"]!=cust["Customer_ID"]]

    triples = [
        [other["Customer_ID"], other["Customer_Rating"], calcSimilarityNoRating(cust, other)]
        for other in (candidates.iloc[pos] for pos in range(len(candidates)))
    ]

    # Highest similarity first; ties keep their original order.
    triples.sort(key=lambda triple: triple[2], reverse=True)
    return triples[:10]

In [365]:
def calcRating(cust, df, weighted=False):
    """Predict the rating of ``cust`` from its nearest neighbours in ``df``.

    The neighbours come from ``calcSim10ForOne``. With ``weighted`` equal to
    False the prediction is the plain mean of their ratings; otherwise each
    rating is weighted by the neighbour's similarity.

    Returns the estimate rounded to the nearest integer.
    """
    neighbours = calcSim10ForOne(cust, df)

    # Kept as an equality test so that non-bool arguments behave as before.
    if weighted == False:
        ratingSum = 0
        for _custId, rating, _sim in neighbours:
            ratingSum += rating
        estimate = ratingSum / len(neighbours)
    else:
        weightedSum = 0
        weightSum = 0
        for _custId, rating, sim in neighbours:
            weightedSum += sim * rating
            weightSum += sim
        estimate = weightedSum / weightSum

    return int(round(estimate))

### Average rating

#### Average rating Error

In [370]:
# Absolute difference between the predicted (unweighted) and the stored rating of the first customer.
abs(calcRating(cData.iloc[0], cData) - cData.iloc[0]["Customer_Rating"])

1

In [375]:
# Mean absolute error of the unweighted prediction over the first 50 customers.
totalAverageRatingError = 0
for i in range(50):
    totalAverageRatingError = totalAverageRatingError + abs(
        calcRating(cData.iloc[i], cData) - cData.iloc[i]["Customer_Rating"]
    )
print("Total AverageRatingAlgo error: ", totalAverageRatingError/50)

Total AverageRatingAlgo error:  0.4


### Weighted average rating

In [367]:
# Similarity-weighted rating prediction for the customer at row position 27.
calcRating(cData.iloc[27], cData, True)

2

#### Weighted Average rating Error

In [None]:
# Mean absolute error of the similarity-weighted prediction over the first 50 customers.
totalAverageRatingError = 0
for i in range(50):
    totalAverageRatingError = totalAverageRatingError + abs(
        calcRating(cData.iloc[i], cData, True) - cData.iloc[i]["Customer_Rating"]
    )
print("Total Weighted AverageRatingAlgo error: ", totalAverageRatingError/50)