In [3]:
import pandas as pd

# Load the wines dataset
wines_df = pd.read_csv("wines_w2v.csv")
print("Wines DataFrame:")
print(wines_df.info())
print(wines_df.head())

# Load the ratings dataset
ratings_df = pd.read_csv("XWines_Slim_150K_ratings.csv")
print("\nRatings DataFrame:")
print(ratings_df.info())
print(ratings_df.head())

# Merge wines and ratings dataframes
merged_df = pd.merge(ratings_df, wines_df, on='WineID')

Wines DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1007 entries, 0 to 1006
Data columns (total 22 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   WineID          1007 non-null   int64  
 1   WineName        1007 non-null   object 
 2   Type            1007 non-null   object 
 3   Elaborate       1007 non-null   object 
 4   Grapes          1007 non-null   object 
 5   Harmonize       1007 non-null   object 
 6   ABV             1007 non-null   float64
 7   Body            1007 non-null   object 
 8   Acidity         1007 non-null   object 
 9   Country         1007 non-null   object 
 10  RegionID        1007 non-null   int64  
 11  WineryID        1007 non-null   int64  
 12  RegionName      1007 non-null   object 
 13  WineryName      1007 non-null   object 
 14  WineName_w2v    1007 non-null   object 
 15  Type_w2v        1007 non-null   object 
 16  Elaborate_w2v   1007 non-null   object 
 17  Grapes_w2v      

  ratings_df = pd.read_csv("XWines_Slim_150K_ratings.csv")


In [6]:
def ExplainRecommendation(user, wine):
    
    user_ratings = merged_df[merged_df["UserID"] == user] # Get the user's ratings
    wine_ratings = merged_df[merged_df["WineID"] == wine] # Get the wine's ratings
    user_wine_rating = user_ratings[user_ratings["WineID"] == wine] # Get the user's rating for the wine
    wine_details = wines_df[wines_df["WineID"] == wine] # Get the wine's details

    # Get the wine's average rating
    wine_avg_rating = wine_ratings["Rating"].mean()

    # Get the user's average rating
    user_avg_rating = user_ratings["Rating"].mean()

    # Get the user's average rating for wines with the same country
    user_country_avg_rating = user_ratings[user_ratings["Country"] == wine_details["Country"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same type
    user_type_avg_rating = user_ratings[user_ratings["Type"] == wine_details["Type"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same elaborate
    user_elaborate_avg_rating = user_ratings[user_ratings["Elaborate"] == wine_details["Elaborate"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same grapes
    user_grapes_avg_rating = user_ratings[user_ratings["Grapes"] == wine_details["Grapes"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same harmonize
    user_harmonize_avg_rating = user_ratings[user_ratings["Harmonize"] == wine_details["Harmonize"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same ABV interval
    abv_interval = 0.5
    wine_abv = wine_details["ABV"].values[0]
    user_abv_avg_rating = user_ratings[(user_ratings["ABV"] >= wine_abv - abv_interval) & (user_ratings["ABV"] <= wine_abv + abv_interval)]["Rating"].mean()

    # Get the user's average rating for wines with the same body
    user_body_avg_rating = user_ratings[user_ratings["Body"] == wine_details["Body"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same acidity
    user_acidity_avg_rating = user_ratings[user_ratings["Acidity"] == wine_details["Acidity"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same RegionName
    user_regionname_avg_rating = user_ratings[user_ratings["RegionName"] == wine_details["RegionName"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same WineryName
    user_wineryname_avg_rating = user_ratings[user_ratings["WineryName"] == wine_details["WineryName"].values[0]]["Rating"].mean()

    # Get the user's average rating for wines with the same Vintages

    # return the explanation 
    features_scores = [user_country_avg_rating, user_type_avg_rating, user_elaborate_avg_rating, user_grapes_avg_rating, user_harmonize_avg_rating, user_abv_avg_rating, user_body_avg_rating, user_acidity_avg_rating, user_regionname_avg_rating, user_wineryname_avg_rating]
    features_names = ["country", "type", "elaborate", "grapes", "harmonization", "ABV", "body", "acidity", "region", "winery", "vintage year"]

    explanation = ""

    if not user_wine_rating.empty:
        if user_wine_rating["Rating"].values[0] > 4:
            explanation += "You liked this wine! You rated it " + str(user_wine_rating["Rating"].values[0]) + " out of 5. You might also like this wine because "
        elif wine_avg_rating > 4.5:
            explanation += "This is a popular wine! It has an average rating of " + str(wine_avg_rating) + " out of 5. You might also like this wine because "
        else:
            explanation += "You might like this wine because "
    '''
    print("\nUser Average Rating: " + str(user_avg_rating))

    for i, features_score in enumerate(features_scores):
        if features_score > user_avg_rating:
            print(features_names[i] + ": Positive " + str(features_score))
        else:
            print(features_names[i] + ": Negative " + str(features_score))
    '''
    highest_two = sorted(range(len(features_scores)), key=lambda i: features_scores[i], reverse=True)[:2]
    explanation += "you have rated wines with the same " + features_names[highest_two[0]] + " and " + features_names[highest_two[1]] + " highly. \n"

    return explanation

In [11]:
import pandas as pd
import numpy as np
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity

# Load the data
wines = pd.read_csv('wines_w2v.csv')
ratings = pd.read_csv('XWines_Slim_150K_ratings.csv', low_memory=False)

# Create a set of all wine IDs
all_wine_ids = set(ratings['WineID'].unique())

# Select columns with w2v features
vector_columns = [
    'WineName_w2v', 'Type_w2v', 'Elaborate_w2v', 'Grapes_w2v',
    'Harmonize_w2v', 'Country_w2v', 'RegionName_w2v', 'WineryName_w2v'
]

def convert_w2v_string(w2v_string):
    cleaned_string = w2v_string.strip("[]")
    float_list = list(map(float, cleaned_string.split()))
    return np.array(float_list)

# Apply the conversion function to each column
for column in vector_columns:
    wines[column] = wines[column].apply(convert_w2v_string)

# Weights for each feature
feature_weights = {
    'WineName_w2v': 1.0,
    'Type_w2v': 1.2,
    'Elaborate_w2v': 0.8,
    'Grapes_w2v': 1.0,
    'Harmonize_w2v': 1.5,
    'Country_w2v': 1.0,
    'RegionName_w2v': 1.1,
    'WineryName_w2v': 0.9
}

# Create a new column to store weighted wine vectors
wines['Weighted_wine_vector'] = wines.apply(
    lambda row: np.concatenate([row[column] * feature_weights[column] for column in vector_columns]), 
    axis=1
)

# Prepare data for SVD
reader = Reader(rating_scale=(0, 5))
data = Dataset.load_from_df(ratings[['UserID', 'WineID', 'Rating']], reader)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

# Train SVD model
model_svd = SVD(n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02)
model_svd.fit(trainset)

# Prepare KNN model
wine_vectors = np.stack(wines['Weighted_wine_vector'].values)
knn_model = NearestNeighbors(n_neighbors=10, metric='cosine')
knn_model.fit(wine_vectors)

def predict_svd_ratings(user_id, wines_df, model):
    predictions = []
    for wine_id in wines_df['WineID'].unique():
        predicted_rating = model.predict(user_id, wine_id).est
        
        predictions.append((wine_id, predicted_rating))
    return pd.DataFrame(predictions, columns=['WineID', 'PredictedRating'])

def predict_knn_ratings(user_id, knn_model, k=10, rating_threshold=3.5):
    user_ratings = ratings[ratings['UserID'] == user_id]
    
    if user_ratings.empty:
        return pd.DataFrame(columns=['WineID', 'PredictedRating'])

    highly_rated_wines = user_ratings[user_ratings['Rating'] > rating_threshold]
    to_predict = set()

    for _, rated_wine in highly_rated_wines.iterrows():
        rated_wine_id = rated_wine['WineID']
        rated_wine_vector = wines.loc[wines['WineID'] == rated_wine_id, 'Weighted_wine_vector'].values[0]
        distances, indices = knn_model.kneighbors(rated_wine_vector.reshape(1, -1), n_neighbors=k)
        unrated_wines = wines[~wines['WineID'].isin(user_ratings['WineID'])]
        neighbor_wine_ids = wines.iloc[indices.flatten()]['WineID'].values
        to_predict.update(neighbor_wine_id for neighbor_wine_id in neighbor_wine_ids 
                          if neighbor_wine_id in unrated_wines['WineID'].values)

    unrated_wine_predictions = {}
    similarity_explanations = {}

    for unrated_wine_id in to_predict:
        unrated_wine_vector = wines.loc[wines['WineID'] == unrated_wine_id, 'Weighted_wine_vector'].values[0].reshape(1, -1)
        total_weighted_rating = 0
        total_similarity = 0
        similar_wines = []

        for _, rated_wine in highly_rated_wines.iterrows():
            rated_wine_id = rated_wine['WineID']
            rated_wine_rating = rated_wine['Rating']
            rated_wine_vector = wines.loc[wines['WineID'] == rated_wine_id, 'Weighted_wine_vector'].values[0].reshape(1, -1)
            similarity_weight = cosine_similarity(rated_wine_vector, unrated_wine_vector)[0][0]

            if similarity_weight > 0:
                total_weighted_rating += rated_wine_rating * similarity_weight
                total_similarity += similarity_weight
                similar_wines.append((rated_wine_id, similarity_weight, rated_wine_rating))

        if total_similarity > 0:
            predicted_rating = total_weighted_rating / total_similarity
            unrated_wine_predictions[unrated_wine_id] = predicted_rating
            similarity_explanations[unrated_wine_id] = sorted(similar_wines, key=lambda x: x[1], reverse=True)[:3]

    predictions = pd.DataFrame(list(unrated_wine_predictions.items()), columns=['WineID', 'PredictedRating']).sort_values(by='PredictedRating', ascending=False)
    predictions['SimilarityExplanation'] = predictions['WineID'].map(similarity_explanations)
    return predictions

def recommend_wines(user_id, N=10):
    svd_predictions = predict_svd_ratings(user_id, wines, model_svd)
    knn_predictions = predict_knn_ratings(user_id, knn_model)
    
    # Combine predictions (you can adjust the weights as needed)
    combined_predictions = pd.merge(svd_predictions, knn_predictions, on='WineID', suffixes=('_svd', '_knn'))
    combined_predictions['CombinedRating'] = (combined_predictions['PredictedRating_svd'] + combined_predictions['PredictedRating_knn']) / 2
    
    recommended_wines = combined_predictions.sort_values('CombinedRating', ascending=False).head(N)
    
    # Add wine names and wine types to recommendations
    recommended_wines = recommended_wines.merge(wines[['WineID', 'WineName', 'Type', 'Country']], on='WineID')
    
    return recommended_wines[['WineID', 'CombinedRating', 'WineName', 'Type', 'Country', 'SimilarityExplanation']]

def generate_explanation(recommendation, user_id):
    wine_id = recommendation['WineID']
    wine_name = recommendation['WineName']
    wine_type = recommendation['Type']
    wine_country = recommendation['Country']
    combined_rating = recommendation['CombinedRating']
    similar_wines = recommendation['SimilarityExplanation']

    explanation = f"We recommend {wine_name}, a {wine_type} from {wine_country}. "
    explanation += f"Our system predicts you'll rate it {combined_rating:.2f} out of 5 based on your previous ratings. "

    if similar_wines:
        explanation += "This recommendation is based on your high ratings for similar wines:\n"
        for similar_wine_id, similarity, rating in similar_wines:
            similar_wine = wines[wines['WineID'] == similar_wine_id].iloc[0]
            explanation += f"- {similar_wine['WineName']} (which you rated {rating:.1f}/5) - {similarity:.2f} similarity\n"

    user_ratings = ratings[ratings['UserID'] == user_id]
    avg_user_rating = user_ratings['Rating'].mean()
    if combined_rating > avg_user_rating:
        explanation += f"\nThis wine's predicted rating ({combined_rating:.2f}) is higher than your average rating ({avg_user_rating:.2f}), suggesting you might especially enjoy it."

    return explanation

def display_recommendations_with_explanations(user_id):
    print(f"\nTop 10 wine recommendations for user {user_id}:")
    recommendations = recommend_wines(user_id)
    
    for i, (_, row) in enumerate(recommendations.iterrows(), 1):
        print(f"\n{i}. {row['WineName']} (Type: {row['Type']}, Predicted rating: {row['CombinedRating']:.2f})")
        explanation = generate_explanation(row, user_id)
        print(explanation)
        print(ExplainRecommendation(user_id, row['WineID']))

def add_new_user():
    global ratings
    
    new_user_id = ratings['UserID'].max() + 1
    print(f"New user created with ID: {new_user_id}")
    
    sample_wines = wines.sample(5)
    new_ratings = []
    
    for _, wine in sample_wines.iterrows():
        print(f"\nWine: {wine['WineName']}")
        print(f"Type: {wine['Type']}")
        print(f"Country: {wine['Country']}")
        
        while True:
            try:
                rating = float(input("Please rate this wine (1-5): "))
                if 1 <= rating <= 5:
                    break
                else:
                    print("Rating must be between 1 and 5.")
            except ValueError:
                print("Please enter a valid number.")
        
        new_ratings.append({
            'UserID': new_user_id,
            'WineID': wine['WineID'],
            'Rating': rating
        })
    
    ratings = pd.concat([ratings, pd.DataFrame(new_ratings)], ignore_index=True)
    print("\nThank you for rating these wines!")
    return new_user_id

def main():
    while True:
        print("\n1. Add new user")
        print("2. Get recommendations for existing user")
        print("3. Exit")
        
        choice = input("Enter your choice (1-3): ")
        
        if choice == '1':
            new_user_id = add_new_user()
            display_recommendations_with_explanations(new_user_id)
        
        elif choice == '2':
            try:
                user_id = int(input("Enter user ID: "))
                if user_id in ratings['UserID'].unique():
                    display_recommendations_with_explanations(user_id)
                else:
                    print("User not found.")
            except ValueError:
                print("Invalid user ID. Please enter a number.")
        
        elif choice == '3':
            print("Thank you for using the Wine Recommender System. Goodbye!")
            break
        
        else:
            print("Invalid choice. Please try again.")

if __name__ == "__main__":
    main()


1. Add new user
2. Get recommendations for existing user
3. Exit

Top 10 wine recommendations for user 1209683:

1. Brunello di Montalcino (Type: Red, Predicted rating: 4.45)
We recommend Brunello di Montalcino, a Red from Italy. Our system predicts you'll rate it 4.45 out of 5 based on your previous ratings. This recommendation is based on your high ratings for similar wines:
- Brunello di Montalcino (which you rated 4.5/5) - 0.99 similarity
- Kurni Marche (which you rated 4.0/5) - 0.80 similarity
- Priorat (which you rated 4.5/5) - 0.79 similarity

This wine's predicted rating (4.45) is higher than your average rating (3.79), suggesting you might especially enjoy it.
you have rated wines with the same grapes and region highly. 


2. Pauillac  Premier Grand Cru Classé  (Type: Red, Predicted rating: 4.45)
We recommend Pauillac  Premier Grand Cru Classé , a Red from France. Our system predicts you'll rate it 4.45 out of 5 based on your previous ratings. This recommendation is based on 

In [5]:
import tkinter as tk
from tkinter import ttk, messagebox
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

class WineRecommenderGUI:
    def __init__(self, master):
        self.master = master
        master.title("Wine Recommender System")
        master.geometry("600x400")

        # Load data
        self.wines = pd.read_csv('wines.csv')
        self.ratings = pd.read_csv('XWines_Slim_150K_ratings.csv', low_memory=False)
        self.wine_similarities = np.load('wine_similarities_w2v.npy')

        # Create sets of valid wine IDs
        self.valid_wine_ids = set(self.wines['WineID'])
        self.valid_rated_wine_ids = set(self.ratings['WineID'])
        self.all_valid_wine_ids = self.valid_wine_ids.intersection(self.valid_rated_wine_ids)

        self.wine_id_to_idx = {wine_id: idx for idx, wine_id in enumerate(self.wines['WineID'])}

        # Create notebook (tabbed interface)
        self.notebook = ttk.Notebook(master)
        self.notebook.pack(expand=True, fill="both", padx=10, pady=10)

        # Create tabs
        self.create_new_user_tab()
        self.create_existing_user_tab()

    def create_new_user_tab(self):
        new_user_frame = ttk.Frame(self.notebook)
        self.notebook.add(new_user_frame, text="New User")

        # Instructions
        ttk.Label(new_user_frame, text="Rate 5 random wines to get started:").grid(row=0, column=0, columnspan=2, pady=10)

        # Create widgets for rating wines
        self.rating_widgets = []
        self.sample_wines = self.wines[self.wines['WineID'].isin(self.all_valid_wine_ids)].sample(5)
        for i, (_, wine) in enumerate(self.sample_wines.iterrows()):
            ttk.Label(new_user_frame, text=f"{wine['WineName']} ({wine['Type']}, {wine['Country']})").grid(row=i+1, column=0, sticky="w", padx=5)
            rating_var = tk.StringVar()
            rating_combo = ttk.Combobox(new_user_frame, textvariable=rating_var, values=[1, 2, 3, 4, 5], width=5)
            rating_combo.grid(row=i+1, column=1, padx=5)
            self.rating_widgets.append((wine['WineID'], rating_var))

        # Button to submit ratings
        ttk.Button(new_user_frame, text="Get Recommendations", command=self.submit_new_user_ratings).grid(row=6, column=0, columnspan=2, pady=10)

        # Recommendations display
        self.new_user_recommendations = tk.Text(new_user_frame, height=10, width=70)
        self.new_user_recommendations.grid(row=7, column=0, columnspan=2, padx=5, pady=5)

    def create_existing_user_tab(self):
        existing_user_frame = ttk.Frame(self.notebook)
        self.notebook.add(existing_user_frame, text="Existing User")

        ttk.Label(existing_user_frame, text="Enter User ID:").grid(row=0, column=0, padx=5, pady=10)
        self.user_id_entry = ttk.Entry(existing_user_frame, width=10)
        self.user_id_entry.grid(row=0, column=1, padx=5)

        ttk.Button(existing_user_frame, text="Get Recommendations", command=self.get_existing_user_recommendations).grid(row=1, column=0, columnspan=2, pady=10)

        self.existing_user_recommendations = tk.Text(existing_user_frame, height=15, width=70)
        self.existing_user_recommendations.grid(row=2, column=0, columnspan=2, padx=5, pady=5)

    def submit_new_user_ratings(self):
        new_ratings = []
        new_user_id = self.ratings['UserID'].max() + 1

        for wine_id, rating_var in self.rating_widgets:
            try:
                rating = float(rating_var.get())
                if 1 <= rating <= 5:
                    new_ratings.append({'UserID': new_user_id, 'WineID': wine_id, 'Rating': rating})
                else:
                    raise ValueError
            except ValueError:
                messagebox.showerror("Invalid Input", "Please enter valid ratings (1-5) for all wines.")
                return

        self.ratings = pd.concat([self.ratings, pd.DataFrame(new_ratings)], ignore_index=True)
        recommendations = self.recommend_wines(new_user_id)
        self.display_recommendations(recommendations, self.new_user_recommendations)

    def get_existing_user_recommendations(self):
        try:
            user_id = int(self.user_id_entry.get())
            if user_id in self.ratings['UserID'].unique():
                recommendations = self.recommend_wines(user_id)
                self.display_recommendations(recommendations, self.existing_user_recommendations)
            else:
                messagebox.showerror("User Not Found", "The entered User ID does not exist.")
        except ValueError:
            messagebox.showerror("Invalid Input", "Please enter a valid User ID (integer).")

    def recommend_wines(self, user_id, N=10):
        predicted_ratings = self.predict_all_ratings(user_id)
        recommended_wines = sorted(predicted_ratings.items(), key=lambda x: x[1], reverse=True)
        
        unique_recommendations = []
        seen_names = set()
        for wine_id, predicted_rating in recommended_wines:
            if wine_id in self.valid_wine_ids:
                wine_name = self.wines[self.wines['WineID'] == wine_id]['WineName'].values[0]
                if wine_name not in seen_names:
                    unique_recommendations.append((wine_id, predicted_rating, wine_name))
                    seen_names.add(wine_name)
                if len(unique_recommendations) == N:
                    break
        
        return unique_recommendations

    def predict_all_ratings(self, user_id):
        unrated_wines = self.get_unrated_wines(user_id)
        predictions = {}
        
        for wine in unrated_wines:
            predicted_rating = self.predict_rating(user_id, wine)
            predictions[wine] = predicted_rating
        
        return predictions

    def get_unrated_wines(self, user_id):
        rated_wines = set(self.ratings[self.ratings['UserID'] == user_id]['WineID'].unique())
        unrated_wines = self.all_valid_wine_ids - rated_wines
        return list(unrated_wines)

    def predict_rating(self, user_id, unrated_wine):
        user_ratings = self.ratings[self.ratings['UserID'] == user_id][['WineID', 'Rating']]
        rated_wines = user_ratings['WineID'].tolist()
        
        if len(rated_wines) == 0:
            return np.nan
        
        k_similar_wines = self.get_similarities(unrated_wine, rated_wines)
        
        numerator = 0
        denominator = 0
        for wine, similarity in k_similar_wines:
            rating = user_ratings[user_ratings['WineID'] == wine]['Rating'].values[0]
            numerator += similarity * rating
            denominator += abs(similarity)
        
        if denominator == 0:
            user_mean = user_ratings['Rating'].mean()
            return user_mean
        
        return numerator / denominator

    def get_similarities(self, unrated_wine, rated_wines):
        similarities = []
        if unrated_wine not in self.wine_id_to_idx:
            return similarities
        
        unrated_wine_idx = self.wine_id_to_idx[unrated_wine]
        
        for rated_wine in rated_wines:
            if rated_wine not in self.wine_id_to_idx:
                continue
            rated_wine_idx = self.wine_id_to_idx[rated_wine]
            similarity = self.wine_similarities[unrated_wine_idx, rated_wine_idx]
            similarities.append((rated_wine, similarity))
        
        similarities.sort(key=lambda x: x[1], reverse=True)
        return similarities

    def display_recommendations(self, recommendations, text_widget):
        text_widget.delete('1.0', tk.END)
        text_widget.insert(tk.END, "Top 10 Wine Recommendations:\n\n")
        for i, (_, predicted_rating, wine_name) in enumerate(recommendations, 1):
            text_widget.insert(tk.END, f"{i}. {wine_name} (Predicted rating: {predicted_rating:.2f})\n")

if __name__ == "__main__":
    root = tk.Tk()
    app = WineRecommenderGUI(root)
    root.mainloop()