In [None]:
import pandas as pd
import os
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
import re
import string
import random
from nltk.tokenize import RegexpTokenizer
nltk.download('stopwords')
from fuzzywuzzy import fuzz


In [None]:
# Load the product dataset from a hard-coded local CSV path.
# Later cells read the columns product_name, rating, product_tag, brand_tag and brand_name.
df = pd.read_csv("D:\\IPBA\\BYOP_G_L\\data.csv")

In [None]:
# Show the dtype pandas inferred for each column of the loaded frame
df.dtypes

In [None]:
# Combine product name, rating and tags into a single string.
# Missing values are blanked per column *before* joining: with plain `+`, a single NaN tag
# turns the whole concatenation into NaN, so the row would lose its name and rating as well.
content_columns = ['product_name', 'rating', 'product_tag', 'brand_tag']
content_parts = [
    df[column].astype(str).where(df[column].notna(), '')
    for column in content_columns
]
df['content'] = content_parts[0].str.cat(content_parts[1:], sep=' ')

In [None]:
#Utility functions for removing non-ASCII characters, converting to lower case, and removing stop words, HTML tags and punctuation from the content text

def _removeNonAscii(s):
    return "".join(i for i in s if  ord(i)<128)

def make_lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stop_words(text):
    """Drop English stop words from a whitespace-separated string.

    The stop-word list is read from ``stopwords.words("english")`` on the first
    call only and cached on the function object, because this function is
    applied once per row and re-reading the list every time is wasted work.

    Parameters
    ----------
    text : str
        Text to filter. Matching is exact and case-sensitive, so callers
        should lower-case the text first.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    stops = getattr(remove_stop_words, "_english_stops", None)
    if stops is None:
        stops = frozenset(stopwords.words("english"))
        remove_stop_words._english_stops = stops
    return " ".join(word for word in text.split() if word not in stops)

def remove_html(text):
    """Delete every ``<...>`` span (shortest match) from ``text``."""
    return re.sub('<.*?>', '', text)

def remove_punctuation(text):
    r"""Keep only the ``\w+`` runs of ``text`` and join them with single spaces."""
    word_tokens = RegexpTokenizer(r'\w+').tokenize(text)
    return " ".join(word_tokens)

# Clean the combined text. The order matters:
#   * HTML tags are stripped while '<' and '>' are still present (after punctuation
#     removal the tag pattern can no longer match and tag names would remain as words);
#   * stop words are removed last, so tokens such as "the," are matched once their
#     punctuation has been stripped.
df['cleaned_content'] = (
    df['content']
    .apply(_removeNonAscii)
    .apply(remove_html)
    .apply(make_lower_case)
    .apply(remove_punctuation)
    .apply(remove_stop_words)
)

In [None]:
# Preview the first rows, including the new content / cleaned_content columns
df.head()

In [None]:
# NOTE(review): this changes the working directory for the whole kernel, so every
# relative path used after this cell resolves against this hard-coded local folder.
os.chdir("D:\\py virtual environment\\glove files")

In [None]:
# Hard-coded local path of the GloVe embeddings text file
glove_file="D:\\py virtual environment\\glove files\\glove.6B.100d.txt"

In [None]:
# Load the GloVe model
def load_glove_model(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace; the first token is the word
    and the remaining tokens are parsed as floats.

    Parameters
    ----------
    glove_file : str or path-like
        Path of the UTF-8 text file to read.

    Returns
    -------
    dict
        Maps each word to a 1-D ``numpy.ndarray`` of floats.

    Raises
    ------
    OSError
        If the file cannot be opened.
    ValueError
        If a token after the word cannot be parsed as a float.
    """
    print("Loading GloVe Model")
    glove_model = {}
    with open(glove_file, 'r', encoding='utf8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                # Blank or whitespace-only line (e.g. a trailing newline): nothing to parse.
                continue
            word, values = split_line[0], split_line[1:]
            glove_model[word] = np.array(values, dtype=float)
    print("Done.", len(glove_model), "words loaded!")
    return glove_model

# Load from the configured path instead of a bare file name, so this call
# does not depend on the kernel's current working directory.
glove_model = load_glove_model(glove_file)


In [None]:
# Function to convert description to Glove vector
def description_to_glove_vector(description, glove_model, dim=None):
    """Embed a text as the unit-length mean of its words' GloVe vectors.

    Parameters
    ----------
    description : str
        Whitespace-separated text; each token is looked up verbatim.
    glove_model : dict
        Maps words to 1-D numeric vectors of equal length.
    dim : int, optional
        Length of the zero vector returned when nothing can be embedded.
        When omitted it is taken from the first vector in ``glove_model``,
        falling back to 100 if the model is empty.

    Returns
    -------
    numpy.ndarray
        The normalised mean vector, or a zero vector of length ``dim`` when no
        word is in the model or the mean has zero norm.
    """
    if dim is None:
        first_vector = next(iter(glove_model.values()), None)
        dim = 100 if first_vector is None else len(first_vector)

    # Skip out-of-vocabulary words; all-zero vectors carry no direction and are skipped too.
    known_vectors = (glove_model[word] for word in description.split() if word in glove_model)
    valid_vectors = [vec for vec in known_vectors if np.any(vec)]

    if len(valid_vectors) == 0:
        return np.zeros((dim,))

    mean_vector = np.mean(valid_vectors, axis=0)
    norm = np.linalg.norm(mean_vector)
    if norm == 0:
        # Vectors that cancel out exactly: avoid 0/0, which would yield NaNs.
        return np.zeros((dim,))
    return mean_vector / norm




In [None]:
# Embed every cleaned description with the GloVe model
glove_data = df['cleaned_content'].apply(description_to_glove_vector, args=(glove_model,))

# Keep only the first 50 components of each vector before tabulating
glove_data1 = glove_data.apply(lambda vector: vector[:50])

# One row per item, one column per retained component
glove_df = pd.DataFrame(
    glove_data1.tolist(),
    columns=[f'glove_{i}' for i in range(50)],
)

In [None]:
# Preview the first rows of the 50-column GloVe feature table
glove_df.head()

In [None]:
# Stack the per-item vectors of glove_data row-wise into one 2-D NumPy array
glove_feature_array = np.vstack(glove_data.tolist())


# 1st Approach (Basic Sorting)

In [None]:
# Define a function to recommend similar items based on a user input product name
def recommend_similar_items(user_item_index, glove_feature_array, top_n=10):
    """Rank the other items by cosine similarity to one item.

    Parameters
    ----------
    user_item_index : int
        Row position of the query item in ``glove_feature_array``.
    glove_feature_array : numpy.ndarray
        2-D array with one feature vector per row.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row position, similarity)`` pairs, most similar first. The query
        row is never included.
    """
    user_item_vector = glove_feature_array[user_item_index].reshape(1, -1)
    similarity_scores = cosine_similarity(user_item_vector, glove_feature_array)[0]

    # Stable descending order: ties keep their original row order.
    order = np.argsort(-similarity_scores, kind="stable")

    # Drop the query by position. Slicing off the first sorted entry is wrong when
    # another row ties with it (e.g. a duplicate item) and happens to sort first.
    ranked = [(int(i), similarity_scores[i]) for i in order if i != user_item_index]
    return ranked[:top_n]

In [None]:
# Example usage: Get recommendations for a user input product name
user_item_name = "Croc Textured Two Fold Wallet"  # Replace with the user input product name

# Row positions (not index labels) of the exact-name matches, so they line up
# with the rows of glove_feature_array.
matching_positions = np.flatnonzero((df['product_name'] == user_item_name).to_numpy())
if len(matching_positions) == 0:
    raise ValueError("Product '{}' was not found in df['product_name']".format(user_item_name))
user_item_index = matching_positions[0]

# Get recommendations for the user input product
recommendations = recommend_similar_items(user_item_index, glove_feature_array)

# Create a list to store the recommendations
recommendation_columns = ['Product Name', 'Similarity Score', 'Rating', 'Brand Name']
recommendations_list = []

# Read rating and brand from the recommended row itself; looking them up by name
# would return the first product that merely shares the name.
for i, score in recommendations:
    recommendations_list.append({
        'Product Name': df['product_name'].iloc[i],
        'Similarity Score': score,
        'Rating': df['rating'].iloc[i],
        'Brand Name': df['brand_name'].iloc[i]
    })

# Convert the list to a DataFrame (explicit columns keep an empty result well-formed)
recommendations_df = pd.DataFrame(recommendations_list, columns=recommendation_columns)

# Remove duplicate products with exact same name
recommendations_df = recommendations_df.drop_duplicates(subset=['Product Name'])

# Display the recommendations DataFrame
print("Top recommended products for '{}':".format(user_item_name))
print(recommendations_df)

# 2nd Approach (Fuzzywuzzy)

In [None]:

# Define a function to find the closest match for the user input among product names
def find_closest_match(user_input, product_names):
    """Return the name with the highest ``fuzz.partial_ratio`` against ``user_input``.

    The first name wins on ties; ``None`` is returned when ``product_names`` is empty.
    """
    return max(
        product_names,
        key=lambda candidate: fuzz.partial_ratio(user_input, candidate),
        default=None,
    )



In [None]:
# Example usage: Get recommendations based on a user input (similar) product name
user_input = "shirts"  # Replace with the user input
closest_product_name = find_closest_match(user_input, df['product_name'])

recommendation_columns = ['Product Name', 'Similarity Score', 'Rating', 'Brand Name']
recommendations_list = []  # List to store recommendations

if closest_product_name:
    print("Closest matching product name:", closest_product_name)
    # Row position (not index label) of the first product with that name, so it
    # lines up with the rows of glove_feature_array.
    user_item_index = np.flatnonzero((df['product_name'] == closest_product_name).to_numpy())[0]
    recommendations = recommend_similar_items(user_item_index, glove_feature_array)

    # Read rating and brand from the recommended row itself; looking them up by
    # name would return the first product that merely shares the name.
    for i, score in recommendations[:10]:  # Selecting only the top 10 recommendations
        recommendations_list.append({
            'Product Name': df['product_name'].iloc[i],
            'Similarity Score': score,
            'Rating': df['rating'].iloc[i],
            'Brand Name': df['brand_name'].iloc[i]
        })

# Convert the list to a DataFrame; explicit columns keep the frame well-formed
# (and drop_duplicates working) when nothing was matched.
recommendations_df = pd.DataFrame(recommendations_list, columns=recommendation_columns)

# Remove duplicate products with exact same name
recommendations_df = recommendations_df.drop_duplicates(subset=['Product Name'])

# Display the recommendations DataFrame
print("Top 10 recommended products:")
print(recommendations_df)




# Hybrid Model

In [None]:
# Model 1: Define a function to recommend similar items based on a user input product name
def recommend_similar_items(user_item_index, glove_feature_array, top_n=10):
    """Rank the other items by cosine similarity to one item.

    Parameters
    ----------
    user_item_index : int
        Row position of the query item in ``glove_feature_array``.
    glove_feature_array : numpy.ndarray
        2-D array with one feature vector per row.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row position, similarity)`` pairs, most similar first. The query
        row is never included.
    """
    user_item_vector = glove_feature_array[user_item_index].reshape(1, -1)
    similarity_scores = cosine_similarity(user_item_vector, glove_feature_array)[0]

    # Stable descending order: ties keep their original row order.
    order = np.argsort(-similarity_scores, kind="stable")

    # Drop the query by position. Slicing off the first sorted entry is wrong when
    # another row ties with it (e.g. a duplicate item) and happens to sort first.
    ranked = [(int(i), similarity_scores[i]) for i in order if i != user_item_index]
    return ranked[:top_n]

# Model 2: Define a function to find the closest match for the user input among product names
def find_closest_match(user_input, product_names):
    """Return the name with the highest ``fuzz.partial_ratio`` against ``user_input``.

    The first name wins on ties; ``None`` is returned when ``product_names`` is empty.
    """
    return max(
        product_names,
        key=lambda candidate: fuzz.partial_ratio(user_input, candidate),
        default=None,
    )

# Hybrid recommendation function
def hybrid_recommendation(user_input, glove_feature_array, df):
    """Match ``user_input`` to a product name, then list that product and its neighbours.

    The closest product name is found with ``find_closest_match``; its row is
    then passed to ``recommend_similar_items``.

    Parameters
    ----------
    user_input : str
        Free-text product query.
    glove_feature_array : numpy.ndarray
        2-D feature array whose rows line up positionally with the rows of ``df``.
    df : pandas.DataFrame
        Must contain the columns 'product_name', 'rating' and 'brand_name'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Product Name', 'Similarity Score', 'Rating', 'Brand Name'.
        The first row is the matched product itself (its score is NaN because
        it is the query, not a recommendation), followed by up to 10
        recommended rows. The frame is empty when no product is matched.
    """
    columns = ['Product Name', 'Similarity Score', 'Rating', 'Brand Name']

    closest_product_name = find_closest_match(user_input, df['product_name'])
    if not closest_product_name:
        return pd.DataFrame(columns=columns)

    # Row positions (not index labels) so they line up with glove_feature_array.
    positions = np.flatnonzero((df['product_name'] == closest_product_name).to_numpy())
    if len(positions) == 0:
        return pd.DataFrame(columns=columns)
    user_item_index = positions[0]

    def _row(position, score):
        # Read every field from the row itself; a lookup by name would return
        # the first product that merely shares the name.
        return {
            'Product Name': df['product_name'].iloc[position],
            'Similarity Score': score,
            'Rating': df['rating'].iloc[position],
            'Brand Name': df['brand_name'].iloc[position]
        }

    recommendations = recommend_similar_items(user_item_index, glove_feature_array)

    # The matched product goes first as a proper row; mixing a bare string with
    # dict rows would make pandas build a single object column.
    rows = [_row(user_item_index, float('nan'))]
    for i, score in recommendations[:10]:  # Selecting only the top 10 recommendations
        if i == user_item_index:
            # Never list the matched product twice.
            continue
        rows.append(_row(i, score))

    return pd.DataFrame(rows, columns=columns)




In [None]:
# Example usage
user_input = "wallet"  # Replace with the user input
hybrid_results = hybrid_recommendation(
    user_input=user_input,
    glove_feature_array=glove_feature_array,
    df=df,
)

# Print the heading and the hybrid recommendations DataFrame on separate lines
print("Hybrid recommendations:", hybrid_results, sep="\n")

# Hybrid 2

In [3]:
import pandas as pd
from scipy.spatial.distance import cdist
from Levenshtein import distance as levenshtein_distance

ModuleNotFoundError: No module named 'Levenshtein'

In [None]:
import pandas as pd
from scipy.spatial.distance import cdist
from Levenshtein import distance as levenshtein_distance

# Model 1: Define a function to recommend similar items based on a user input product name
def recommend_similar_items(user_item_index, glove_feature_array, top_n=10):
    """Rank the other items by cosine similarity to one item.

    ``cdist(..., metric='cosine')`` returns cosine *distance*; it is converted
    to similarity (``1 - distance``) here so the scores mean what callers
    label them as ('Similarity Score') and higher is better.

    Parameters
    ----------
    user_item_index : int
        Row position of the query item in ``glove_feature_array``.
    glove_feature_array : numpy.ndarray
        2-D array with one feature vector per row.
    top_n : int, default 10
        Maximum number of neighbours to return.

    Returns
    -------
    list of (int, float)
        ``(row position, similarity)`` pairs, most similar first. The query
        row is never included.
    """
    user_item_vector = glove_feature_array[user_item_index].reshape(1, -1)
    distances = cdist(user_item_vector, glove_feature_array, metric='cosine')[0]

    # An undefined distance (NaN, which can arise for all-zero rows) is scored as 0.
    similarity_scores = np.where(np.isnan(distances), 0.0, 1.0 - distances)

    # Stable descending order: ties keep their original row order.
    order = np.argsort(-similarity_scores, kind="stable")

    # Exclude the query by position rather than assuming where it sorts.
    ranked = [(int(i), similarity_scores[i]) for i in order if i != user_item_index]
    return ranked[:top_n]

# Model 2: Define a function to find the closest match for the user input among product names using Levenshtein Distance
def find_closest_match(user_input, product_names):
    """Return the name with the smallest ``levenshtein_distance`` to ``user_input``.

    The first name wins on ties; ``None`` is returned when ``product_names`` is empty.
    """
    return min(
        product_names,
        key=lambda candidate: levenshtein_distance(user_input, candidate),
        default=None,
    )

# Hybrid recommendation function
def hybrid_recommendation(user_input, glove_feature_array, df):
    """Match ``user_input`` to a product name, then list that product and its neighbours.

    The closest product name is found with ``find_closest_match``; its row is
    then passed to ``recommend_similar_items``.

    Parameters
    ----------
    user_input : str
        Free-text product query.
    glove_feature_array : numpy.ndarray
        2-D feature array whose rows line up positionally with the rows of ``df``.
    df : pandas.DataFrame
        Must contain the columns 'product_name', 'rating' and 'brand_name'.

    Returns
    -------
    pandas.DataFrame
        Columns 'Product Name', 'Similarity Score', 'Rating', 'Brand Name'.
        The first row is the matched product itself (its score is NaN because
        it is the query, not a recommendation), followed by up to 10
        recommended rows. The frame is empty when no product is matched.
    """
    columns = ['Product Name', 'Similarity Score', 'Rating', 'Brand Name']

    closest_product_name = find_closest_match(user_input, df['product_name'])
    if not closest_product_name:
        return pd.DataFrame(columns=columns)

    # Row positions (not index labels) so they line up with glove_feature_array.
    positions = np.flatnonzero((df['product_name'] == closest_product_name).to_numpy())
    if len(positions) == 0:
        return pd.DataFrame(columns=columns)
    user_item_index = positions[0]

    def _row(position, score):
        # Read every field from the row itself; a lookup by name would return
        # the first product that merely shares the name.
        return {
            'Product Name': df['product_name'].iloc[position],
            'Similarity Score': score,
            'Rating': df['rating'].iloc[position],
            'Brand Name': df['brand_name'].iloc[position]
        }

    recommendations = recommend_similar_items(user_item_index, glove_feature_array)

    # The matched product goes first as a proper row; mixing a bare string with
    # dict rows would make pandas build a single object column.
    rows = [_row(user_item_index, float('nan'))]
    for i, score in recommendations[:10]:  # Selecting only the top 10 recommendations
        if i == user_item_index:
            # Never list the matched product twice.
            continue
        rows.append(_row(i, score))

    return pd.DataFrame(rows, columns=columns)

# Example usage
user_input = "shirts"  # Replace with the user input
hybrid_results = hybrid_recommendation(
    user_input=user_input,
    glove_feature_array=glove_feature_array,
    df=df,
)

# Print the heading and the hybrid recommendations DataFrame on separate lines
print("Hybrid recommendations:", hybrid_results, sep="\n")
