In [1]:
import pandas as pd
import numpy as np

In [2]:
# cf: one row per customer profile, loaded from the derived recency file.
cf = pd.read_csv("./derived_files/customer_profiles_recency.csv")

# Column headers of the profile table, kept so later dataframes can reuse them.
names = cf.columns.tolist()

# Plain numpy copy of the profile table for positional row access.
arr = cf.to_numpy()

# df: the raw orders table.
df = pd.read_csv("./from_skip_files/orders_11mil.csv")

# Per-customer non-null counts of every order column; groupby sorts the
# result by customer_id.
group_counts = df.groupby('customer_id').count()

In [3]:
# Keep only the profiles of customers with MORE than MIN_ORDERS orders.
# The comparison is strict, so customers with exactly MIN_ORDERS orders are
# removed as well.
MIN_ORDERS = 10

# Profiles and order counts are paired by row position, so only the rows
# present in both tables can be compared.
# NOTE(review): this assumes `arr` (the profile table) lists customers in the
# same order as `group_counts` (sorted by customer_id) -- confirm upstream.
n_rows = min(len(group_counts), arr.shape[0])

# The first column of the grouped counts is used as the order count.
# Explicit .iloc avoids the deprecated positional `row[0]` lookup on a
# label-indexed Series, and replaces the slow row-by-row iterrows loop.
order_counts = group_counts.iloc[:n_rows, 0].to_numpy()

# A list of 1-D row arrays (not a 2-D array), so that a DataFrame built from
# it infers a dtype per column.
customer_profile_list = list(arr[:n_rows][order_counts > MIN_ORDERS])

In [4]:
# Rebuild a dataframe from the retained profiles, reusing the original
# column headers, and write it out as a .csv file.
cutoff_df = pd.DataFrame(data=customer_profile_list, columns=list(names))
cutoff_df.to_csv(path_or_buf='./derived_files/customer_profiles_lowest_cutoff.csv')

# Text preview of the filtered table.
print(cutoff_df)

                                 customer_id  African   Alcohol    Bakery  \
0       0000267e-c83e-4a48-9776-8163eab97b6a      0.0  0.000000  0.010832   
1       000118af-5466-4790-be93-f052f355a773      0.0  0.012231  0.010857   
2       00011ffe-f012-42d0-9aab-e371f9cd8547      0.0  0.003054  0.048490   
3       0001bab7-7ea0-4657-88db-7e7c57990a2b      0.0  0.000000  0.000000   
4       000242ae-bd33-48da-ad7c-1cac5086dd05      0.0  0.000000  0.063679   
...                                      ...      ...       ...       ...   
200478  fffd1f5a-09e4-4cac-b275-4676ec841c7a      0.0  0.000000  0.000000   
200479  fffd5519-4a83-489c-a715-2fbcb7c2718a      0.0  0.000000  0.017278   
200480  fffda4bb-f035-409c-ad58-79ba22d34fa1      0.0  0.000000  0.000000   
200481  fffecc08-88f5-414c-b29d-0ddc6177352d      0.0  0.000000  0.000000   
200482  fffed9c1-b66d-419d-a92f-72b69720b7a4      0.0  0.000000  0.018349   

        Barbecue      Beef  Breakfast & Brunch  Bubble Tea   Burgers  \
0  

In [5]:
# Cosine similarity between every pair of cuisine columns, stored as a
# square matrix. Column 0 (customer_id) is skipped; the matrix size follows
# the number of remaining columns instead of a hard-coded count.
cuisine_matrix = cutoff_df.iloc[:, 1:].to_numpy(dtype=float)

# Gram matrix: entry (i, j) is the dot product of cuisine columns i and j.
# Its diagonal holds the squared norms, so each norm is computed once rather
# than once per pair.
gram = cuisine_matrix.T @ cuisine_matrix
norms = np.sqrt(np.diag(gram))
denom = np.outer(norms, norms)

# A column of all zeros has norm 0 and would produce 0/0 = NaN, which would
# then contaminate any mean/std taken over the matrix. Such pairs are given a
# similarity of 0 instead. NaN denominators still propagate as NaN.
cosine_sim = np.divide(gram, denom, out=np.zeros_like(gram), where=denom != 0)

In [6]:
# Mean and standard deviation over all entries of the similarity matrix
mean = cosine_sim.mean()
std = cosine_sim.std()

# Standardise the entries, then squash them with a sigmoid so that an
# average similarity lands at 0.5
z_scores = (cosine_sim - mean) / std
cosine_sim = 1 / (1 + np.exp(-z_scores))

# Wrap the remapped matrix in a dataframe labelled by cuisine on both axes
cuisine_labels = names[1:]
final_df = pd.DataFrame(cosine_sim, index=cuisine_labels, columns=cuisine_labels)

# Hard coded (row, column) pairs whose similarity is forced to 0
# This eliminates any chance of a poor recommendation for these pairs
# Only one direction is zeroed, since the reverse direction can still be recommended
blocked_pairs = [
    ('Halal', 'Alcohol'),
    ('Halal', 'Pork'),
    ('Kosher', 'Pork'),
]
for row_label, col_label in blocked_pairs:
    final_df.at[row_label, col_label] = 0

# Write the final dataframe to a .csv file
final_df.to_csv('./derived_files/similarity_matrix_final_official.csv')

In [7]:
# One final check on some statistics of the stored matrix.
# The values are read from final_df so that the hard-coded zero overrides
# are always included. Reading the source numpy array instead would only see
# them when the dataframe happens to share memory with that array, which is
# not guaranteed (e.g. under pandas Copy-on-Write).
final_values = final_df.to_numpy()

minimum = np.min(final_values)
mean = np.mean(final_values)  # New mean
std = np.std(final_values)  # New std

print(minimum)
print(mean)
print(std)

0.0
0.47633151193313844
0.11369182393277837
