In [1]:
import pandas as pd

invoice_df = pd.read_csv('invoice.csv')
item_df = pd.read_csv('item.csv')

In [2]:
invoice_df.head(5)

Unnamed: 0,Invoice_id,Date,Item_id,Vendor_id,Vendor_Name,Store_id,Store_Name,Address,City_Name,Zip_Code,County_id,County_Name,Bottles_Sold
0,INV-00013400001,2016-08-29,35918,297,Katou Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
1,INV-00013400002,2016-08-29,23828,297,Katou Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
2,INV-00013400003,2016-08-29,36908,300,Katsuragi Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
3,INV-00013400004,2016-08-29,34359,35,Archer Inc.,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,8
4,INV-00013400005,2016-08-29,36903,300,Katsuragi Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1


In [None]:
# Produce basic ratings CSV relating customers to invoices to ratings.

import numpy as np

merged_df = invoice_df.merge(item_df,how='left',left_on='Item_id',right_on='Item_id')
num_invoices = len(merged_df['Invoice_id'])

num_customers = 10000
customer_ids = np.random.randint(1000,10000,size=(num_customers,))

customer_series = pd.Series(np.random.choice(customer_ids,size=(num_invoices,), replace=True),name="Customer_id")
invoice_series  = pd.Series(np.random.choice(merged_df['Invoice_id'],size=(num_invoices,), replace=True),name="Invoice_id")
ratings_series  = pd.Series(np.random.randint(1,6,size=(num_invoices,)),name="Rating")

ratings_df = pd.DataFrame(customer_series).join(invoice_series).join(ratings_series)
ratings_df.sort_values(by='Invoice_id').reset_index(drop=True)

ratings_df.to_csv('ratings.csv')

In [None]:
# Break into separate files because that's how the game is played.
from csv import DictReader

entries = []
with open('ratings.csv','r') as ratings_file:
    ratings_data = DictReader(ratings_file)

    for line in ratings_data:
        entries.append(line)

import sys
original_stdout = sys.stdout # Save a reference to the original standard output

for entry in entries[0:20000]:
    with open(f'./reviews/review{str(entry[""]).zfill(5)}.dat','w') as review_file:
        sys.stdout = review_file
        print(f'Customer ID: {entry["Customer_id"]}')
        print(f'Invoice ID: {entry["Invoice_id"]}')
        print(f'Product Rating: {entry["Rating"]}/5',end='')
        sys.stdout = original_stdout

In [3]:
# Load back from disk.

import glob
files = glob.glob('./reviews/*.dat')[:20000]  # start with a subset

fields = ['Review_id','Customer_id','Invoice_id','Rating']
review_df = pd.DataFrame(columns=fields)

for f in files:
    with open(f,'r') as review_file:
        review_data = review_file.readlines()
        tmp_dict = {}
        tmp_dict['Review_id'] = str(f).split('/')[-1][6:-4]
        tmp_dict['Customer_id'] = review_data[0][12:].strip().strip()
        tmp_dict['Invoice_id'] = review_data[1][11:].strip()
        tmp_dict['Rating'] = int(review_data[2][16:17].strip())
        review_df = review_df.append(tmp_dict,ignore_index=True)

In [4]:
review_df.head()

Unnamed: 0,Review_id,Customer_id,Invoice_id,Rating
0,71973,2670,S29449000106,2
1,309240,1649,S14277100031,1
2,732110,5158,S06790100007,2
3,476901,6019,S19411900009,4
4,251573,4655,S25044100034,5


In [5]:
merged_df = invoice_df.merge(review_df,how='left',left_on='Invoice_id',right_on='Invoice_id')
merged_df = merged_df.merge(item_df,how='left',left_on='Item_id',right_on='Item_id')
merged_df = merged_df.dropna().reset_index(drop=True)

In [6]:
merged_df.head()

Unnamed: 0,Invoice_id,Date,Item_id,Vendor_id,Vendor_Name,Store_id,Store_Name,Address,City_Name,Zip_Code,...,Bottles_Sold,Review_id,Customer_id,Rating,Item_Description,Category,Pack,Bottle_Volume_ml,Bottle_Cost,Bottle_Retail_Price
0,INV-00013800009,2016-08-29,20246,65,Ayanami Brands,2554,Hy-Vee Food Store / Davenport,3019 ROCKINGHAM ROAD,DAVENPORT,52802,...,1,293221,9693,1,Hisoka's Cream Soda,Cream Soda,12,750,2.13,3.2
1,INV-00013800046,2016-08-29,43036,35,Archer Inc.,2554,Hy-Vee Food Store / Davenport,3019 ROCKINGHAM ROAD,DAVENPORT,52802,...,1,534710,9336,3,Saber's Energy,Energy Drink,12,750,2.52,3.91
2,INV-00013800069,2016-08-29,77842,260,Inuyasha Brands,2554,Hy-Vee Food Store / Davenport,3019 ROCKINGHAM ROAD,DAVENPORT,52802,...,2,232499,4010,1,Nami's Black Cherry,Cherry Soda,12,750,2.66,3.99
3,INV-00013900006,2016-08-29,36903,300,Katsuragi Company,3805,Washington Street Mini Mart,1601 WASHINGTON ST,DAVENPORT,52084,...,1,880846,3953,3,Hisoka's Wild Cherry,Cherry Soda,48,200,1.66,2.49
4,INV-00014100027,2016-08-29,55084,434,Minami LLC,3917,Smokin' Joe's #2 Food and Durg,1606 W LOCUST ST,DAVENPORT,52804,...,3,14748,6631,1,Kotarou's Orange Soda,Orange Soda,24,375,2.0,3.0


In [7]:
from holden import holden

rec = holden()
rec.initialize(merged_df)

In [8]:
item = "Azusa's Root Beer"

rec.match(item)

["Meiko's Cream Soda",
 "Arataka's Vanilla Cream Soda",
 "Hisoka's Cream Soda",
 "Saber's Energy",
 "Nami's Black Cherry"]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

features = ['Customer_id','Rating','Item_Description']

In [None]:
def combine(entry):
    '''
    Assess closeness based on customer and rating.
    
    Notably, we omit description and classification.  This also
    spuriously associates customers with similar IDs.
    '''
    return f"{entry['Customer_id']} {entry['Rating']}"

In [None]:
for feature in features:
    merged_df[feature] = merged_df[feature].fillna('')

merged_df['Combined'] = merged_df.apply(combine,axis=1)

cv = CountVectorizer()
count_matrix = cv.fit_transform(merged_df['Combined'])

In [None]:
cosine_sim = cosine_similarity(count_matrix)

In [None]:
def get_item_from_id(id):
    return merged_df[merged_df['Item_id'] == id]['Item_Description'].iloc[0]
def get_id_from_item(item):
    return merged_df[merged_df['Item_Description'] == item]['Item_id'].iloc[0]

def get_index_from_id(id):
    return merged_df[merged_df['Item_id'] == id].index[0]
def get_item_from_index(index):
    return merged_df[merged_df.index == index]['Item_Description'].iloc[0]

In [None]:
item = "Azusa's Root Beer"
index = 67540
merged_df[merged_df['Item_id'] == index]['Item_id'].index[0]

In [None]:
item = "Azusa's Root Beer"
id = 67540

merged_df[merged_df['Item_Description'] == item].index[0]

print(get_id_from_item(item))
print(get_id_from_item(get_item_from_id(id)))
print(get_item_from_id(67540))
print(get_item_from_id(get_id_from_item(item)))
print(get_index_from_id(67540))
print(get_index_from_id(get_id_from_item(item)))

merged_df[merged_df['Item_Description']==item]

In [None]:
item = "Azusa's Root Beer"
merged_df[merged_df['Item_Description'] == item].iloc[0]['Item_id']

In [None]:
item_index = get_index_from_id(get_id_from_item("Azusa's Root Beer"))
print(item_index)
similar_soft_drinks = list(enumerate(cosine_sim[item_index]))

In [None]:
similar_sorted = sorted(similar_soft_drinks,key=lambda x:x[1],reverse=True)[1:]
similar_sorted[:10]

In [None]:
print("Top 5 similarly recommended soft drinks are:\n")
for i in range(0,5):
    try:
        print(get_item_from_index(similar_sorted[i][0]))
    except IndexError:
        print('Empty DF')

https://medium.com/@sumanadhikari/building-a-movie-recommendation-engine-using-scikit-learn-8dbb11c5aa4b

In [None]:
merged_df