In [1]:

from tabulate import tabulate
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics.pairwise import linear_kernel
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import LabelEncoder

# Read the CSV file
df = pd.read_csv('DATA.csv')

# Check for NaN values and handle them
df['UserId'].fillna(0, inplace=True)  # Replace NaN in UserId with a placeholder
df['UserId'] = df['UserId'].astype(str)  # Ensure UserId is of type string
df['ProductName'].fillna('', inplace=True)  # Replace NaN in ProductName with empty string
df['price'].fillna(0, inplace=True)  # Replace NaN in price with 0 or another placeholder value

# Ensure price is of type string for concatenation
df['price'] = df['price'].astype(str)

# Pivot the DataFrame to get a user-product matrix
user_product_matrix = df.pivot_table(index='UserId', columns='nutritional _data', values='price', aggfunc='size', fill_value=0)

# Create a TF-IDF vectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

# Fit and transform the product names to TF-IDF vectors
tfidf_matrix = tfidf_vectorizer.fit_transform(df['nutritional _data'] + ' ' + df['price'])

# Function to get product recommendations for a given user
def get_recommendations(user_id):
    user_products = user_product_matrix.loc[user_id]
    user_tfidf_vector = np.asarray(tfidf_matrix[user_products.to_numpy().nonzero()[0]].mean(axis=0))
    cosine_scores = linear_kernel(user_tfidf_vector, tfidf_matrix).flatten()
    product_indices = cosine_scores.argsort()[::-1]
    recommended_products = [idx for idx in product_indices if idx not in user_products.to_numpy().nonzero()[0]]

    return recommended_products

# Get all unique user IDs from the dataset
all_user_ids = df['UserId'].unique()

# Store recommendations in a DataFrame
recommendations_data = []

for user_id in all_user_ids:
    recommendations = get_recommendations(user_id)

    # Store recommendations
    for idx in recommendations[:1]:  # Display top recommendation
        recommendations_data.append({'UserId': user_id, 'Recommended_Product': df['ProductName'].iloc[idx], 'Price': df['price'].iloc[idx]})

# Create a DataFrame from recommendations
recommendations_df = pd.DataFrame(recommendations_data)

# Merge recommendations DataFrame with existing dataset
df = pd.merge(df, recommendations_df, on='UserId', how='left')

# Save the updated DataFrame to the dataset file
df.to_csv('data_with_recomended.csv', index=False)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['UserId'].fillna(0, inplace=True)  # Replace NaN in UserId with a placeholder
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ProductName'].fillna('', inplace=True)  # Replace NaN in ProductName with empty string
The behavior will change in pandas 3.0. This inplace method 

In [2]:
data=pd.read_csv('data_with_recomended.csv')

In [3]:
data.head(10)

Unnamed: 0,UserId,ProductId,Rating,time,price,ProductName,nutritional _data,Recommended_Product,Price
0,1,1,5,2/19/2024 7:45,10,Tea,"This drink contains 2 calories, 10 grams of fa...",Nescaffee black,15
1,2,2,3,2/20/2024 10:45,15,Tea_Milk,"This drink contains 50 calories, 2 grams of fa...",Nescaffee black,15
2,3,3,2,2/21/2024 10:45,15,Nescaffee black,"This drink contains 80 calories, 3 grams of fa...",Nescafee,20
3,5,4,1,2/22/2024 10:45,20,Nescafee,"This drink contains 100 calories, 3 grams of f...",Tea,10
4,6,5,4,2/23/2024 10:45,15,Cacao,"This drink contains 150 calories, 8 grams of f...",Tea_Milk,15
5,7,6,2,2/24/2024 10:45,20,Cacao_Milk,"This drink contains 200 calories, 10 grams of ...",Nescafee,20
6,8,1,3,2/25/2024 10:45,10,Tea,"This drink contains 2 calories, 10 grams of fa...",Nescaffee black,15
7,9,2,1,2/26/2024 10:45,15,Tea_Milk,"This drink contains 50 calories, 2 grams of fa...",Tea,10
8,10,3,5,2/27/2024 10:45,15,Nescaffee black,"This drink contains 80 calories, 3 grams of fa...",Cacao_Milk,20
9,11,4,4,2/28/2024 10:45,20,Nescafee,"This drink contains 100 calories, 3 grams of f...",Tea,10


In [4]:
y = data['Recommended_Product']
y

0      Nescaffee black
1      Nescaffee black
2             Nescafee
3                  Tea
4             Tea_Milk
            ...       
593           Tea_Milk
594           Nescafee
595    Nescaffee black
596              Cacao
597           Nescafee
Name: Recommended_Product, Length: 598, dtype: object

In [5]:
from sklearn.preprocessing import LabelEncoder



label_encoder_user = LabelEncoder()
df['UserId'] = label_encoder_user.fit_transform(df['UserId'])

label_encoder_product = LabelEncoder()
df['ProductId'] = label_encoder_product.fit_transform(df['ProductName'])  # Assuming ProductId is derived from ProductName


X = data[['UserId', 'ProductId', 'Rating', 'price']]  # Select relevant numerical columns as input features
y = data['Recommended_Product']  # Set the target variable
# Encode the target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)


In [6]:
from sklearn.experimental import enable_hist_gradient_boosting  # noqa
from sklearn.ensemble import RandomForestClassifier

# Split the data (assuming X and y are defined as your dataset)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the RandomForestClassifierClassifier
classifier = RandomForestClassifier()
classifier.fit(X_train, y_train)

# Make predictions on the test set
y_pred = classifier.predict(X_test)

# Evaluate the classifier
accuracy = classifier.score(X_test, y_test)
print("Accuracy:", accuracy)




Accuracy: 0.9083333333333333


In [7]:
y_pred = classifier.predict(X_test)

In [8]:
y_pred

array([5, 0, 5, 2, 4, 5, 5, 1, 3, 2, 0, 3, 4, 4, 5, 1, 3, 5, 3, 1, 3, 0,
       4, 2, 4, 3, 3, 2, 0, 3, 5, 4, 5, 0, 1, 5, 4, 0, 5, 0, 2, 2, 2, 4,
       3, 5, 2, 5, 4, 1, 0, 5, 2, 3, 3, 4, 0, 2, 0, 2, 5, 5, 3, 5, 2, 1,
       3, 4, 4, 1, 1, 3, 5, 3, 5, 2, 0, 3, 5, 0, 1, 4, 5, 1, 0, 4, 0, 5,
       2, 2, 4, 4, 3, 0, 4, 3, 1, 2, 0, 2, 3, 3, 3, 3, 0, 5, 5, 0, 3, 1,
       4, 5, 1, 5, 3, 5, 4, 3, 5, 1])

In [9]:
predicted = label_encoder.inverse_transform(y_pred)
print("Predicted Drink:", predicted)

Predicted Drink: ['Tea_Milk' 'Cacao' 'Tea_Milk' 'Nescafee' 'Tea' 'Tea_Milk' 'Tea_Milk'
 'Cacao_Milk' 'Nescaffee black' 'Nescafee' 'Cacao' 'Nescaffee black' 'Tea'
 'Tea' 'Tea_Milk' 'Cacao_Milk' 'Nescaffee black' 'Tea_Milk'
 'Nescaffee black' 'Cacao_Milk' 'Nescaffee black' 'Cacao' 'Tea' 'Nescafee'
 'Tea' 'Nescaffee black' 'Nescaffee black' 'Nescafee' 'Cacao'
 'Nescaffee black' 'Tea_Milk' 'Tea' 'Tea_Milk' 'Cacao' 'Cacao_Milk'
 'Tea_Milk' 'Tea' 'Cacao' 'Tea_Milk' 'Cacao' 'Nescafee' 'Nescafee'
 'Nescafee' 'Tea' 'Nescaffee black' 'Tea_Milk' 'Nescafee' 'Tea_Milk' 'Tea'
 'Cacao_Milk' 'Cacao' 'Tea_Milk' 'Nescafee' 'Nescaffee black'
 'Nescaffee black' 'Tea' 'Cacao' 'Nescafee' 'Cacao' 'Nescafee' 'Tea_Milk'
 'Tea_Milk' 'Nescaffee black' 'Tea_Milk' 'Nescafee' 'Cacao_Milk'
 'Nescaffee black' 'Tea' 'Tea' 'Cacao_Milk' 'Cacao_Milk' 'Nescaffee black'
 'Tea_Milk' 'Nescaffee black' 'Tea_Milk' 'Nescafee' 'Cacao'
 'Nescaffee black' 'Tea_Milk' 'Cacao' 'Cacao_Milk' 'Tea' 'Tea_Milk'
 'Cacao_Milk' 'Cacao' 'Tea'

In [10]:
### Create a Pickle file using serialization
import pickle
pickle_out = open("classifier.pkl","wb")
pickle.dump(classifier, pickle_out)
pickle_out.close()

In [11]:
predicted_label=classifier.predict([[40,5,5,45]])
predicted_drink = label_encoder.inverse_transform(predicted_label)[0]
print("Predicted Drink:", predicted_label)

Predicted Drink: [4]


