In [1]:
import warnings

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import Parallel, delayed
from scipy.sparse import coo_matrix
from scipy.sparse import csr_matrix
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors

In [2]:
# Read the three CSV exports in a fixed order: the order headers, the
# order/product line items and the product catalogue.
df_user, df, df_products = (
    pd.read_csv(csv_path)
    for csv_path in (
        "public/oders_s.csv",
        "public/oder_products_s.csv",
        "public/products.csv",
    )
)

In [7]:
# Preview the first rows of each of the three tables.
display(df_user.head(), df.head(), df_products.head())

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0


Unnamed: 0,order_id,product_id,add_to_cart_order
0,2,33120,1
1,2,28985,2
2,2,9327,3
3,2,45918,4
4,2,30035,5


Unnamed: 0,product_id,product_name,aisle_id,department_id
0,1,Chocolate Sandwich Cookies,61,19
1,2,All-Seasons Salt,104,13
2,3,Robust Golden Unsweetened Oolong Tea,94,7
3,4,Smart Ones Classic Favorites Mini Rigatoni Wit...,38,1
4,5,Green Chile Anytime Sauce,5,13


### Join

In [3]:
# Enrich every order line with its product name and with the order-level
# columns from df_user (user_id, order_number, ...).
# validate="many_to_one" makes pandas raise a MergeError if product_id or
# order_id is duplicated in the lookup table, instead of silently multiplying
# the order lines.
# NOTE: this cell rebinds `df`. Re-running it on the already merged frame makes
# pandas suffix the overlapping columns (_x/_y), so reload the CSVs first.
product_names = df_products[["product_id", "product_name"]]
df = df.merge(product_names, on="product_id", how="left", validate="many_to_one")
df = df.merge(df_user, on="order_id", how="left", validate="many_to_one")

In [4]:
# Store product_name as a categorical column.
# user_id is deliberately NOT converted to a category: it is cast to int32 in
# the following cell and later used directly as the integer row index of the
# sparse user x product matrix, so a categorical detour would only be undone.
df['product_name'] = df['product_name'].astype('category')

KeyboardInterrupt: 

In [4]:
# Cast the id / counter columns to int32.
int32_columns = [
    'order_id', 'product_id', 'add_to_cart_order',
    "user_id", "order_number", "order_dow", "order_hour_of_day",
]
df[int32_columns] = df[int32_columns].astype('int32')

In [5]:
# Show the dtype of every column of the merged frame.
df.dtypes

order_id                    int32
product_id                  int32
add_to_cart_order           int32
product_name               object
user_id                     int32
order_number                int32
order_dow                   int32
order_hour_of_day           int32
days_since_prior_order    float64
dtype: object

In [11]:
# Render the merged frame; the output is abbreviated to the first and last rows.
display(df)

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,Organic Egg Whites,202279,3,5,9,8.0
1,2,28985,2,Michigan Organic Kale,202279,3,5,9,8.0
2,2,9327,3,Garlic Powder,202279,3,5,9,8.0
3,2,45918,4,Coconut Butter,202279,3,5,9,8.0
4,2,30035,5,Natural Sweetener,202279,3,5,9,8.0
...,...,...,...,...,...,...,...,...,...
16942231,3421058,6244,4,Club Soda Lower Sodium,136952,20,3,18,15.0
16942232,3421058,6858,5,Classic Britannia Crisps,136952,20,3,18,15.0
16942233,3421058,30316,6,Baby Brie,136952,20,3,18,15.0
16942234,3421058,35578,7,Genoa Salame with White Cheddar Cheese & Toast...,136952,20,3,18,15.0


In [6]:
# Randomly hold out 30 % of the order lines as a test set, then draw a 0.8 %
# subsample of the training part. Both steps use a fixed seed.
X_train, X_test = train_test_split(
    df,
    test_size=0.3,
    random_state=42,
)
train_sample = X_train.sample(
    frac=0.008,
    random_state=42,
)

In [13]:
# Report how many distinct users, products and orders the merged data contains.
n_users = df["user_id"].nunique(dropna=False)
n_products = df["product_id"].nunique(dropna=False)
n_orders = df["order_id"].nunique(dropna=False)

print(f"Anzahl user: {n_users}")
print(f"Anzahl Produkte: {n_products}")
print(f"Anzahl Bestellungen: {n_orders}")

Anzahl user: 103104
Anzahl Produkte: 49258
Anzahl Bestellungen: 1673021


In [14]:
# Inspect the subsample drawn from the training split.
train_sample

Unnamed: 0,order_id,product_id,add_to_cart_order,product_name,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order
1796607,378740,18963,12,Apple Cider Vinegar,4507,21,6,23,7.0
10861377,2288160,7559,9,Cinnamon Rolls with Icing,191465,43,6,8,3.0
5863645,1235061,49683,11,Cucumber Kirby,197771,12,0,0,10.0
263419,55215,26209,5,Limes,10528,13,4,1,12.0
11144437,2347243,5161,2,Dried Mango,198410,2,5,8,15.0
...,...,...,...,...,...,...,...,...,...
273675,57300,44075,14,Sesame Whole Wheat Crackers,189028,11,5,10,14.0
1518967,319789,21560,35,Cut Hearts Of Palm,88930,28,0,15,7.0
5441697,1146114,43001,14,Spring Water Infused with Fulvic Acid,83878,10,6,22,5.0
2485883,523619,24891,20,Organic Chocolate 1% Milk with DHA Omega-3,133849,1,1,15,


---

# ALS-Algorithmus
Matrix-Faktorisierung

Empfehlung basierend auf den gekauften Produkten des Users

In [7]:
# Prepare the interaction data: how often each user bought each product
# (one row per user_id / product_id pair with its purchase_count).
user_product_df = df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')

# The raw ids are used directly as row (user) and column (product) positions,
# so row `u` of the matrix holds the purchases of user_id `u`.
user_ids = user_product_df['user_id'].values
product_ids = user_product_df['product_id'].values
purchase_counts = user_product_df['purchase_count'].values

# Build the interactions as a sparse COO matrix (far cheaper than a dense
# matrix) and convert it to CSR, which supports the row slicing used later.
user_product_sparse = coo_matrix((purchase_counts, (user_ids, product_ids)))
user_product_sparse_csr = user_product_sparse.tocsr()

# Initialise the ALS model. random_state pins the random initialisation of the
# factors so that repeated runs yield the same model and recommendations.
# NOTE(review): the model is fitted on the full `df`, not on X_train — confirm
# that this is intended.
model = implicit.als.AlternatingLeastSquares(
    factors=100,
    regularization=0.1,
    iterations=30,
    use_gpu=False,
    alpha=40,
    num_threads=8,
    random_state=42,
)

# Train the model on the sparse user x product matrix.
model.fit(user_product_sparse_csr)

  check_blas_config()


  0%|          | 0/30 [00:00<?, ?it/s]

In [27]:
# Prediction: which products might the given user buy next?
user_id = 42  # example user
purchase_history = user_product_sparse_csr[user_id]
recommended = model.recommend(user_id, purchase_history, N=10)

# Print the recommended product ids together with their scores.
print(recommended)

(array([ 9124, 34217, 42450,  3717, 13966, 33957, 31883, 20670, 24852,
       16953], dtype=int32), array([1.2444499, 1.2337629, 1.1753899, 1.0894485, 1.088693 , 1.0871917,
       1.0867434, 1.0862818, 1.0826225, 1.0698375], dtype=float32))


In [24]:
# Turn the (product ids, scores) pair held in `recommended` into a DataFrame.
recommended_df = pd.DataFrame({
    'product_id': recommended[0],  # the product ids
    'score': recommended[1],       # the scores
})

# Attach the product names through a product_id -> product_name lookup Series.
# This replaces a per-row boolean filter over df_products; an id that is
# missing from the catalogue now yields NaN instead of raising an IndexError.
# drop_duplicates keeps the first name per id, as the original lookup did.
product_name_by_id = (
    df_products
    .drop_duplicates('product_id')
    .set_index('product_id')['product_name']
)
recommended_df['product_name'] = recommended_df['product_id'].map(product_name_by_id)

# Show the resulting table.
display(recommended_df)

Unnamed: 0,product_id,score,product_name
0,9124,1.24445,Broccoli & Cheddar Bake Meal Bowl
1,34217,1.233763,Pesto Tortellini Bowls
2,42450,1.17539,Macaroni & Cheese
3,3717,1.089448,Cheese Enchilada Meal
4,13966,1.088693,Chicken Pot Pie
5,33957,1.087192,Mexican Casserole Bowl
6,31883,1.086743,Vegetable Lasagna
7,20670,1.086282,Organic Lentil Vegetable Soup
8,24852,1.082623,Banana
9,16953,1.069837,Creamy Peanut Butter


---

# User-based Collaborative Filtering
Empfehlung basierend auf Kunden mit ähnlichem Kaufverhalten,
ermittelt mit Nearest Neighbors

In [10]:
# Step 1: build the user x product matrix of purchase counts.
user_product_df = df.groupby(['user_id', 'product_id']).size().reset_index(name='purchase_count')
user_ids = user_product_df['user_id'].values
product_ids = user_product_df['product_id'].values
purchase_counts = user_product_df['purchase_count'].values

# The raw ids serve as row (user) and column (product) positions of a sparse
# COO matrix holding the user/product interactions.
user_product_sparse = coo_matrix((purchase_counts, (user_ids, product_ids)))

# Convert to CSR so that single user rows can be sliced out.
user_product_sparse_csr = user_product_sparse.tocsr()

# Step 2: fit a nearest-neighbours model on the user rows, using cosine
# distance between purchase vectors as the (dis)similarity measure.
model_nn = NearestNeighbors(n_neighbors=6, algorithm='auto', metric='cosine')
model_nn.fit(user_product_sparse_csr)

In [11]:
# Step 3: find the users most similar to the given user.
user_id = 42  # example user to analyse
distances, indices = model_nn.kneighbors(user_product_sparse_csr[user_id], n_neighbors=6)

# The query user is normally returned as its own nearest neighbour, but with
# tied distances it is not guaranteed to come first. Remove it by id rather
# than by position and keep at most five similar users.
neighbor_ids = indices[0]
similar_users = neighbor_ids[neighbor_ids != user_id][:5]

# Step 4: recommend products based on what the similar users bought.
# Collect every order line that belongs to one of the similar users.
similar_users_products = df[df['user_id'].isin(similar_users)]

# Count how often each product was bought by them, most frequent first.
recommended_products = similar_users_products.groupby('product_id').size().reset_index(name='purchase_count')
recommended_products = recommended_products.sort_values('purchase_count', ascending=False)

# Attach the product names through a product_id -> product_name lookup Series.
# This replaces a per-row boolean filter over df_products; an id that is
# missing from the catalogue yields NaN instead of raising an IndexError.
product_name_by_id = (
    df_products
    .drop_duplicates('product_id')
    .set_index('product_id')['product_name']
)
recommended_products['product_name'] = recommended_products['product_id'].map(product_name_by_id)

# Show the ten most frequently bought products.
display(recommended_products.head(10))

Unnamed: 0,product_id,purchase_count,product_name
51,39275,29,Organic Blueberries
6,9290,11,Country Cheddar Bowl
29,23148,11,Veggie Loaf & Mashed Potatoes Entrée
36,28458,8,Black Bean Tamale Verde
16,13966,7,Chicken Pot Pie
15,13176,5,Bag of Organic Bananas
35,28431,5,Light Sodium Mexican Casserole Bowl
18,14966,5,Organic Mandarins
24,21137,4,Organic Strawberries
25,21195,3,Organic Extra Virgin Olive Oil
