In [1]:
import pandas as pd

invoice_df = pd.read_csv('invoice.csv')
item_df = pd.read_csv('item.csv')

In [2]:
invoice_df.head(5)

Unnamed: 0,Invoice_id,Date,Item_id,Vendor_id,Vendor_Name,Store_id,Store_Name,Address,City_Name,Zip_Code,County_id,County_Name,Bottles_Sold
0,INV-00013400001,2016-08-29,35918,297,Katou Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
1,INV-00013400002,2016-08-29,23828,297,Katou Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
2,INV-00013400003,2016-08-29,36908,300,Katsuragi Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1
3,INV-00013400004,2016-08-29,34359,35,Archer Inc.,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,8
4,INV-00013400005,2016-08-29,36903,300,Katsuragi Company,3882,Kwik Shop #579 / Davenport,2805 TELEGRAPH RD,DAVENPORT,52802,82,Scott,1


In [3]:
# Produce basic ratings CSV relating customers to invoices to ratings.

import numpy as np

merged_df = invoice_df.merge(item_df,how='left',left_on='Item_id',right_on='Item_id')
num_invoices = len(merged_df['Invoice_id'])

num_customers = 10000
customer_ids = np.random.randint(1000,10000,size=(num_customers,))

customer_series = pd.Series(np.random.choice(customer_ids,size=(num_invoices,), replace=True),name="Customer_id")
invoice_series  = pd.Series(np.random.choice(merged_df['Invoice_id'],size=(num_invoices,), replace=True),name="Invoice_id")
ratings_series  = pd.Series(np.random.randint(1,6,size=(num_invoices,)),name="Rating")

ratings_df = pd.DataFrame(customer_series).join(invoice_series).join(ratings_series)
ratings_df.sort_values(by='Invoice_id').reset_index(drop=True)

ratings_df.to_csv('ratings.csv')

In [4]:
# Break into separate files because that's how the game is played.
from csv import DictReader

entries = []
with open('ratings.csv','r') as ratings_file:
    ratings_data = DictReader(ratings_file)

    for line in ratings_data:
        entries.append(line)

import sys
original_stdout = sys.stdout # Save a reference to the original standard output

for entry in entries:
    with open(f'./reviews/review{str(entry[""]).zfill(5)}.dat','w') as review_file:
        sys.stdout = review_file
        print(f'Customer ID: {entry["Customer_id"]}')
        print(f'Invoice ID: {entry["Invoice_id"]}')
        print(f'Product Rating: {entry["Rating"]}/5',end='')
        sys.stdout = original_stdout

In [4]:
# Load back from disk.

import glob
files = glob.glob('./reviews/*.dat')[:20000]  # start with a subset

fields = ['Review_id','Customer_id','Invoice_id','Rating']
review_df = pd.DataFrame(columns=fields)

for f in files:
    with open(f,'r') as review_file:
        review_data = review_file.readlines()
        tmp_dict = {}
        tmp_dict['Review_id'] = str(f).split('/')[-1][6:-4]
        tmp_dict['Customer_id'] = review_data[0][12:].strip().strip()
        tmp_dict['Invoice_id'] = review_data[1][11:].strip()
        tmp_dict['Rating'] = int(review_data[2][16:17].strip())
        review_df = review_df.append(tmp_dict,ignore_index=True)

In [5]:
review_df.head()

Unnamed: 0,Review_id,Customer_id,Invoice_id,Rating
0,71973,3856,S30601300029,4
1,309240,6419,INV-09494400100,4
2,732110,2260,S25794000126,2
3,476901,5046,S05043500031,5
4,251573,8643,S12595200073,1


In [6]:
merged_df = invoice_df.merge(review_df,how='left',left_on='Invoice_id',right_on='Invoice_id')
merged_df = merged_df.merge(item_df,how='left',left_on='Item_id',right_on='Item_id')
merged_df = merged_df.dropna().reset_index(drop=True)

In [7]:
merged_df.head()

Unnamed: 0,Invoice_id,Date,Item_id,Vendor_id,Vendor_Name,Store_id,Store_Name,Address,City_Name,Zip_Code,...,Bottles_Sold,Review_id,Customer_id,Rating,Item_Description,Category,Pack,Bottle_Volume_ml,Bottle_Cost,Bottle_Retail_Price
0,INV-00013500008,2016-08-29,64858,421,Midoriya Inc.,3918,Smokin' Joe's #1 Food and Durg,3120 ROCKINGHAM RD,DAVENPORT,52802,...,1,322573,6233,1,Ryuuji's Root Beer,Root Beer,1,3000,5.69,8.54
1,INV-00013500019,2016-08-29,64571,260,Inuyasha Brands,3918,Smokin' Joe's #1 Food and Durg,3120 ROCKINGHAM RD,DAVENPORT,52802,...,1,661376,5388,3,Wen-li's Red Pop,Cherry Soda,24,200,2.2,3.3
2,INV-00014500022,2016-08-29,45278,434,Minami LLC,4962,Hilltop Grocery,1312 HARRISON ST,DAVENPORT,52803,...,1,391748,4704,2,Dante's Energy,Energy Drink,6,1750,2.57,3.85
3,INV-00015100018,2016-08-29,43126,35,Archer Inc.,4892,Sara Mini Mart,1026 W RIVER DR,DAVENPORT,52802,...,1,227582,2703,5,Alphonse's Energy Booster,Energy Drink,12,750,2.63,3.94
4,INV-00015100047,2016-08-29,43026,259,Inaba Brands,4892,Sara Mini Mart,1026 W RIVER DR,DAVENPORT,52802,...,1,45381,7941,5,Kotori's Energy Booster,Energy Drink,12,750,2.31,3.47


In [11]:
from holden import holden

rec = holden()
rec.initialize(merged_df)

TypeError: __init__() takes 0 positional arguments but 1 was given

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

features = ['Customer_id','Rating','Item_Description']

In [27]:
def combine(entry):
    '''
    Assess closeness based on customer and rating.
    
    Notably, we omit description and classification.  This also
    spuriously associates customers with similar IDs.
    '''
    return f"{entry['Customer_id']} {entry['Rating']}"

In [28]:
for feature in features:
    merged_df[feature] = merged_df[feature].fillna('')

merged_df['Combined'] = merged_df.apply(combine,axis=1)

cv = CountVectorizer()
count_matrix = cv.fit_transform(merged_df['Combined'])

In [29]:
cosine_sim = cosine_similarity(count_matrix)

In [30]:
def get_item_from_id(id):
    return merged_df[merged_df['Item_id'] == id]['Item_Description'].iloc[0]
def get_id_from_item(item):
    return merged_df[merged_df['Item_Description'] == item]['Item_id'].iloc[0]

def get_index_from_id(id):
    return merged_df[merged_df['Item_id'] == id].index[0]
def get_item_from_index(index):
    return merged_df[merged_df.index == index]['Item_Description'].iloc[0]

In [31]:
item = "Azusa's Root Beer"
index = 67540
merged_df[merged_df['Item_id'] == index]['Item_id'].index[0]

5348

In [32]:
item = "Azusa's Root Beer"
id = 67540

merged_df[merged_df['Item_Description'] == item].index[0]

print(get_id_from_item(item))
print(get_id_from_item(get_item_from_id(id)))
print(get_item_from_id(67540))
print(get_item_from_id(get_id_from_item(item)))
print(get_index_from_id(67540))
print(get_index_from_id(get_id_from_item(item)))

merged_df[merged_df['Item_Description']==item]

67540
67540
Azusa's Root Beer
Azusa's Root Beer
5348
5348


Unnamed: 0,Invoice_id,Date,Item_id,Vendor_id,Vendor_Name,Store_id,Store_Name,Address,City_Name,Zip_Code,...,Review_id,Customer_id,Rating,Item_Description,Category,Pack,Bottle_Volume_ml,Bottle_Cost,Bottle_Retail_Price,Combined
5348,S04679600107,2012-03-21,67540,370,Kyon Inc.,2594,Hy-Vee Food Store / Sioux City,4500 SERGEANT ROAD,SIOUX CITY,51106,...,806168,9350,1,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,9350 1
7530,S09085100139,2012-11-20,67540,370,Kyon Inc.,2635,Hy-Vee #4 / Davenport,4064 E 53RD ST,DAVENPORT,52807,...,437752,6716,3,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,6716 3
7945,S09806400132,2012-12-31,67540,370,Kyon Inc.,2614,Hy-Vee #3 Food and Drugstore,1823 E KIMBERLY RD,DAVENPORT,52807,...,785544,6854,5,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,6854 5
15768,S26132800118,2015-06-10,67540,370,Kyon Inc.,2614,Hy-Vee #3 Food and Drugstore,1823 E KIMBERLY RD,DAVENPORT,52807,...,336254,7563,1,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,7563 1
15854,S26301800112,2015-06-18,67540,370,Kyon Inc.,2635,Hy-Vee #4 / Davenport,4064 E 53RD ST,DAVENPORT,52807,...,643639,3130,2,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,3130 2
18155,S30677800085,2016-02-10,67540,370,Kyon Inc.,2625,Hy-Vee Convenience Store #2,3301 W KIMBERLY RD,DAVENPORT,52804,...,877195,6443,5,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,6443 5
18700,S31771800011,2016-04-13,67540,370,Kyon Inc.,4676,Sam's Mini Mart / Sioux City,711 GORDON DR,SIOUX CITY,51101,...,726984,6689,5,Azusa's Root Beer,Root Beer,12,750,3.19,4.78,6689 5


In [33]:
item = "Azusa's Root Beer"
merged_df[merged_df['Item_Description'] == item].iloc[0]['Item_id']

67540

In [34]:
item_index = get_index_from_id(get_id_from_item("Azusa's Root Beer"))
similar_soft_drinks = list(enumerate(cosine_sim[item_index]))

In [35]:
similar_sorted = sorted(similar_soft_drinks,key=lambda x:x[1],reverse=True)[1:]
similar_sorted[:10]

[(5381, 1.0),
 (6509, 1.0),
 (6813, 1.0),
 (18071, 1.0),
 (18421, 1.0),
 (0, 0.0),
 (1, 0.0),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0)]

In [37]:
print("Top 5 similarly recommended soft drinks are:\n")
for i in range(0,5):
    try:
        print(get_item_from_index(similar_sorted[i][0]))
    except IndexError:
        print('Empty DF')

Top 5 similarly recommended soft drinks are:

Yukiho's Raspberry Soda
Maes's Wild Cherry
Gohan's Orange Soda
Meliodas's Old Fashioned Cream Soda
Meliodas's Old Fashioned Cream Soda


https://medium.com/@sumanadhikari/building-a-movie-recommendation-engine-using-scikit-learn-8dbb11c5aa4b

In [None]:
merged_df