<a href="https://colab.research.google.com/github/PandeyAnupma/CSE508_Winters2024_A2_MT23019/blob/main/CSE508_Winters2024_A2_MT23019.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive; later cells read and write their
# files under "/content/drive/My Drive/".
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**DATA READ**

In [None]:
import cv2
import pandas as pd
import ast
import os
import numpy as np
import pickle
import numpy as np
from urllib.request import urlopen
from google.colab import drive
import re

# Function to perform basic image preprocessing
def preprocess_image(image, output_size=(256, 256)):
    """Resize a decoded image and convert it to grayscale.

    Args:
        image: Image array as returned by an OpenCV decode call, or None.
        output_size: Size tuple passed straight to ``cv2.resize``.

    Returns:
        The resized grayscale image, or None (after printing an error)
        when ``image`` is None.
    """
    if image is not None:
        resized = cv2.resize(image, output_size)
        # The conversion assumes BGR channel order (COLOR_BGR2GRAY).
        return cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    print("Error: Failed to load image")
    return None

# Function to preprocess the review text
def preprocess_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Args:
        text: Review text, or a null value (None / NaN).

    Returns:
        The text with whitespace runs collapsed and both ends stripped;
        an empty string when ``text`` is null.
    """
    return "" if pd.isnull(text) else re.sub(r'\s+', ' ', text).strip()

# Function to mount Google Drive
def mount_google_drive():
    """Mount Google Drive at the notebook's fixed mount point."""
    mount_point = '/content/drive'
    drive.mount(mount_point)

# Function to create output directory
def create_output_directory(output_directory):
    """Create ``output_directory`` (and any missing parents).

    An already existing directory is not an error.
    """
    os.makedirs(name=output_directory, exist_ok=True)

# Function to read CSV file into a DataFrame
def read_csv_into_dataframe(csv_file_path):
    """Read the CSV at ``csv_file_path`` with pandas' default settings.

    Returns:
        The resulting DataFrame.
    """
    frame = pd.read_csv(csv_file_path)
    return frame

# Function to preprocess review texts
def preprocess_review_text(df):
    """Clean the 'Review Text' column of ``df`` in place.

    Each value is passed through ``preprocess_text``; the same DataFrame
    object is returned.
    """
    cleaned_reviews = df['Review Text'].apply(preprocess_text)
    df['Review Text'] = cleaned_reviews
    return df

# Function to store DataFrame in a pickle file
def store_dataframe_in_pickle(df, df_path):
    """Pickle ``df`` to the file at ``df_path``, overwriting it if present."""
    with open(df_path, 'wb') as pickle_handle:
        pickle.dump(df, pickle_handle)

# Function to load image from URL
def load_image_from_url(url, timeout=30):
    """Download an image and decode it with OpenCV.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait on the network before giving up, so a
            stalled server cannot hang the whole run.

    Returns:
        The result of ``cv2.imdecode`` on the downloaded bytes with the
        ``cv2.IMREAD_COLOR`` flag.

    Raises:
        Whatever ``urlopen`` raises for HTTP or network failures
        (including timeouts); the caller is expected to handle them.
    """
    # The context manager closes the HTTP response even if read() fails.
    with urlopen(url, timeout=timeout) as resp:
        payload = resp.read()
    image = np.asarray(bytearray(payload), dtype="uint8")
    return cv2.imdecode(image, cv2.IMREAD_COLOR)

# Function to preprocess and save images
def preprocess_and_save_images(image_urls, product_directory):
    """Download, preprocess and save every image of one product.

    Images are written as ``image_<idx>.jpg`` inside ``product_directory``,
    where ``idx`` is the position of the URL in ``image_urls``. Failures
    are reported with ``print`` and the loop continues with the next URL.

    Args:
        image_urls: Iterable of image URLs.
        product_directory: Existing directory that receives the files.
    """
    for idx, url in enumerate(image_urls):
        try:
            image = load_image_from_url(url)
            preprocessed_image = preprocess_image(image)
            if preprocessed_image is None:
                print(f"Skipping image {url} due to preprocessing error")
                continue

            image_path = os.path.join(product_directory, f"image_{idx}.jpg")
            # cv2.imwrite reports failure through its boolean return value
            # rather than an exception, so it has to be checked explicitly.
            if not cv2.imwrite(image_path, preprocessed_image):
                print(f"Error saving image from URL {url} to {image_path}")
        except Exception as e:
            # Best effort: one bad URL must not abort the remaining images.
            print(f"Error loading image from URL {url}: {str(e)}")

# Main function to perform all tasks
def main():
    """Run the data-read stage.

    Mounts Drive, cleans the review text of the CSV, pickles the DataFrame,
    then downloads and preprocesses the images of every row into a
    per-product directory, and finally pickles the DataFrame a second time
    under a different file name.
    """
    mount_google_drive()

    images_root = "/content/drive/My Drive/preprocessed_images/"
    create_output_directory(images_root)

    reviews = read_csv_into_dataframe("/content/drive/My Drive/A2_Data.csv")
    reviews = preprocess_review_text(reviews)
    store_dataframe_in_pickle(reviews, "/content/drive/My Drive/preprocessed_dataframe.pickle")

    for _, record in reviews.iterrows():
        # The 'Image' cell holds the string form of a list of URLs.
        urls = ast.literal_eval(record['Image'])
        target_dir = os.path.join(images_root, str(record['Product_ID']))
        os.makedirs(target_dir, exist_ok=True)
        preprocess_and_save_images(urls, target_dir)

    print("File successfully read, Image and Text preprocessing complete.")

    # This second copy is the file the later cells load.
    store_dataframe_in_pickle(reviews, "/content/drive/My Drive/dataframe.pickle")
    print("DataFrame stored in pickle file successfully.")

# Run the data-read pipeline when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/71F3npeHUDL._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/71wHUWncMGL._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/71B8OOE5N8L._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/81SX3oAWbNL._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/718niQ1GEwL._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/61OboZT-kcL._SY88.jpg: HTTP Error 404: Not Found
Error loading image from URL https://images-na.ssl-images-amazon.com/images/I/710a2Pyh5lL._SY88.jpg: HTTP Err

In [None]:
!pip install numpy scikit-learn tensorflow



**IMAGE PREPROCESSING & FEATURE EXTRACTION**

In [None]:
import os
import numpy as np
import pandas as pd
import cv2
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Model
import pickle

# Function to load and preprocess images with error handling
def load_and_preprocess_image(image_path, target_size=(224, 224)):
    """Read an image file and scale it for the feature extractor.

    Args:
        image_path: Path of the image file to read.
        target_size: Size tuple passed to ``cv2.resize``.

    Returns:
        A float32 array with values divided by 255, or None (after
        printing the reason) when reading or resizing fails.
    """
    # NOTE(review): Keras ships a dedicated `preprocess_input` for VGG16;
    # plain division by 255 is kept here unchanged -- confirm it is intended.
    try:
        raw = cv2.imread(image_path)
        unreadable = raw is None or raw.size == 0
        if unreadable:
            raise Exception("Failed to load image or image is empty")

        scaled = cv2.resize(raw, target_size).astype(np.float32) / 255.0
        return scaled
    except Exception as err:
        print(f"Error loading or preprocessing image {image_path}: {str(err)}")
        return None

# Function to load pre-trained VGG16 model
def load_pretrained_VGG16():
    """Build a VGG16 feature extractor that outputs the 'fc2' layer.

    The base network is created with ImageNet weights and its top layers;
    the returned model maps the base input to the output of layer 'fc2'.
    """
    vgg = VGG16(weights='imagenet', include_top=True)
    fc2_output = vgg.get_layer('fc2').output
    return Model(inputs=vgg.input, outputs=fc2_output)

# Function to extract features from images using VGG16 model
def extract_features(df, preprocessed_images_dir, model):
    """Extract one feature vector per available preprocessed image.

    For every row of ``df`` the 'Image' cell (string form of a URL list)
    determines how many ``image_<idx>.jpg`` files to look for under
    ``<preprocessed_images_dir>/<Product_ID>/``. Images that cannot be
    loaded are skipped.

    Args:
        df: DataFrame with 'Image' and 'Product_ID' columns.
        preprocessed_images_dir: Root directory of the preprocessed images.
        model: Object whose ``predict`` accepts a batch of one image.

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays; ``labels[k]`` is the
        Product_ID of the image that produced ``features[k]``.
    """
    # Imported locally: this cell's import block does not bring in `ast`,
    # so the function would otherwise rely on an earlier cell having run.
    import ast

    features = []
    labels = []

    for _, row in df.iterrows():
        image_urls = ast.literal_eval(row['Image'])

        # Only the position of each URL matters here; it selects the file
        # name that the download stage used for that image.
        for idx in range(len(image_urls)):
            image_path = os.path.join(preprocessed_images_dir, str(row['Product_ID']), f"image_{idx}.jpg")
            image = load_and_preprocess_image(image_path)

            if image is not None:
                feature = model.predict(np.expand_dims(image, axis=0)).flatten()
                features.append(feature)
                labels.append(row['Product_ID'])

    features = np.array(features)
    labels = np.array(labels)
    return features, labels

# Function to save extracted features and labels
def save_features_and_labels(features, labels, output_file):
    """Pickle ``(features, labels)`` as one tuple into ``output_file``."""
    payload = (features, labels)
    with open(output_file, 'wb') as sink:
        pickle.dump(payload, sink)
    print("Feature extraction complete and saved.")

# Main function to perform all tasks
def main():
    """Run the image feature-extraction stage.

    Loads the pickled DataFrame, builds the VGG16 extractor, extracts
    features from the preprocessed images and pickles the result.
    """
    images_root = "/content/drive/My Drive/preprocessed_images/"
    reviews_df = pd.read_pickle("/content/drive/My Drive/dataframe.pickle")

    extractor = load_pretrained_VGG16()
    image_features, product_labels = extract_features(reviews_df, images_root, extractor)

    save_features_and_labels(
        image_features,
        product_labels,
        "/content/drive/My Drive/extracted_features.pickle",
    )

# Run the feature-extraction stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels.h5
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/2235/image_0.jpg: Failed to load image or image is empty
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/2235/image_1.jpg: Failed to load image or image is empty
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/3317/image_0.jpg: Failed to load image or image is empty
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/3317/image_1.jpg: Failed to load image or image is empty
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/2912/image_0.jpg: Failed to load image or image is empty
Error loading or preprocessing image /content/drive/My Drive/preprocessed_images/2265/image_0.jpg: Failed to load image or image is empty
Error loading or preprocessing image /c

In [None]:
import numpy as np
import pickle
from sklearn.preprocessing import StandardScaler
from google.colab import drive

# Function to mount Google Drive
def mount_google_drive():
    """Mount Google Drive at the notebook's fixed mount point."""
    mount_point = '/content/drive'
    drive.mount(mount_point)

# Function to load extracted features and labels from the pickle file
def load_features_and_labels(input_file):
    """Load the pickled ``(features, labels)`` pair from ``input_file``.

    Returns:
        Tuple ``(features, labels)`` as stored in the file.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(input_file, 'rb') as source:
        stored = pickle.load(source)
    features, labels = stored
    return features, labels

# Function to normalize features using StandardScaler
def normalize_features(features):
    """Standardise ``features`` with a freshly fitted ``StandardScaler``.

    Returns:
        The output of ``fit_transform`` on ``features``.
    """
    return StandardScaler().fit_transform(features)

# Function to save normalized features and labels to the pickle file
def save_normalized_features_and_labels(normalized_features, labels, output_file):
    """Pickle ``(normalized_features, labels)`` as a tuple into ``output_file``."""
    payload = (normalized_features, labels)
    with open(output_file, 'wb') as sink:
        pickle.dump(payload, sink)
    print("Feature normalization complete and saved.")

# Main function to perform all tasks
def main():
    """Run the feature-normalisation stage.

    Mounts Drive, loads the extracted features, standardises them and
    pickles the normalised features together with their labels.
    """
    mount_google_drive()

    raw_features, product_labels = load_features_and_labels(
        "/content/drive/My Drive/extracted_features.pickle"
    )
    scaled_features = normalize_features(raw_features)
    save_normalized_features_and_labels(
        scaled_features,
        product_labels,
        "/content/drive/My Drive/normalized_extracted_features.pickle",
    )

# Run the normalisation stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Feature normalization complete and saved.


**TEXT PREPROCESSING**

In [None]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import string
import re
import pickle

# Function to download NLTK resources
def download_nltk_resources():
    """Download the NLTK packages used by the text preprocessing."""
    for resource in ('punkt', 'stopwords', 'wordnet'):
        nltk.download(resource)

# Function to load preprocessed DataFrame from the pickle file
def load_preprocessed_dataframe(preprocessed_df_path):
    """Unpickle and return the object stored at ``preprocessed_df_path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(preprocessed_df_path, 'rb') as source:
        return pickle.load(source)

# Function to perform text preprocessing
def preprocess_text(text):
    """Normalise one review into a space-joined string of processed tokens.

    Steps, in order: lowercase, tokenise, drop tokens found in
    ``string.punctuation``, drop English stop words, Porter-stem, then
    WordNet-lemmatise each remaining token.

    Args:
        text: Review text, or a null value (None / NaN).

    Returns:
        The processed tokens joined by single spaces; an empty string when
        ``text`` is null.
    """
    if pd.isnull(text):  # NaN / None reviews become empty documents
        return ""

    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    # Filtering happens on the raw lowercase token; stemming and then
    # lemmatisation are applied only to the tokens that survive.
    processed = [
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in word_tokenize(text.lower())
        if token not in string.punctuation and token not in stop_words
    ]
    return ' '.join(processed)

# Function to preprocess reviews and store in a pickle file
def preprocess_reviews_and_store(df, output_path):
    """Add a 'Preprocessed_Review' column to ``df`` and pickle that column.

    Args:
        df: DataFrame with a 'Review Text' column; it is modified in place.
        output_path: File that receives the pickled column.
    """
    processed_column = df['Review Text'].apply(preprocess_text)
    df['Preprocessed_Review'] = processed_column
    with open(output_path, 'wb') as sink:
        pickle.dump(df['Preprocessed_Review'], sink)

# Main function to perform all tasks
def main():
    """Run the text-preprocessing stage.

    Downloads the NLTK resources, loads the pickled DataFrame and stores
    the preprocessed reviews.
    """
    download_nltk_resources()
    reviews_df = load_preprocessed_dataframe("/content/drive/My Drive/dataframe.pickle")
    preprocess_reviews_and_store(reviews_df, "/content/drive/My Drive/preprocessed_reviews.pickle")

# Run the text-preprocessing stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


**TF-IDF SCORES**

In [None]:
import pandas as pd
import numpy as np
import math
import pickle
import re

# Function to load preprocessed DataFrame from the pickle file
def load_preprocessed_dataframe(preprocessed_df_path):
    """Unpickle and return the object stored at ``preprocessed_df_path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(preprocessed_df_path, 'rb') as source:
        return pickle.load(source)

# Function to replace NaN values in 'Review Text' column with empty strings
def replace_nan_with_empty_strings(df):
    """Replace missing 'Review Text' values with ''.

    The column is reassigned on ``df`` itself and the same DataFrame object
    is returned.
    """
    filled_reviews = df['Review Text'].fillna('')
    df['Review Text'] = filled_reviews
    return df

# Function to tokenize and preprocess the reviews
def tokenize_and_preprocess_reviews(df):
    """Store each review as a list of lowercase word tokens.

    Adds (or overwrites) the 'Preprocessed_Review' column on ``df`` and
    returns the same DataFrame object.
    """
    def _tokenize(review):
        # Lowercase first, then pull out every run of word characters.
        return re.findall(r'\b\w+\b', review.lower())

    df['Preprocessed_Review'] = df['Review Text'].apply(_tokenize)
    return df

# Function to calculate term frequency (TF) for a term in a document
def calculate_tf(term, document):
    """Return the relative frequency of ``term`` in ``document``.

    Args:
        term: Token to count.
        document: Sequence of tokens supporting ``count`` and ``len``.

    Returns:
        Occurrences of ``term`` divided by the document length, or 0 for
        an empty document.
    """
    size = len(document)
    if size > 0:
        return document.count(term) / size
    return 0

# Function to calculate inverse document frequency (IDF) for a term
def calculate_idf(term, documents):
    """Return ``log10(N / (1 + df))`` for ``term``.

    ``N`` is the number of documents and ``df`` the number of documents
    that contain ``term``.
    """
    containing = len([doc for doc in documents if term in doc])
    return math.log10(len(documents) / (1 + containing))

# Function to calculate TF-IDF scores for terms in documents
def calculate_tfidf_scores(documents):
    """Compute TF-IDF scores for every term of every document.

    TF is ``count / len(document)`` and IDF is ``log10(N / (1 + df))``,
    the same formulas as ``calculate_tf`` and ``calculate_idf``. Document
    frequencies are counted once up front instead of rescanning the whole
    corpus for every (document, term) pair.

    Args:
        documents: Iterable of token sequences (tokens must be hashable).

    Returns:
        Dict mapping the position of each document to a dict
        ``{term: tf * idf}``. Empty documents map to an empty dict.
    """
    from collections import Counter

    docs = list(documents)
    total_documents = len(docs)

    # Number of documents in which each term appears at least once.
    document_frequency = Counter()
    for document in docs:
        document_frequency.update(set(document))

    idf_by_term = {
        term: math.log10(total_documents / (1 + count))
        for term, count in document_frequency.items()
    }

    tfidf_scores = {}
    for i, document in enumerate(docs):
        total_terms = len(document)
        tfidf_scores[i] = {
            term: (count / total_terms) * idf_by_term[term]
            for term, count in Counter(document).items()
        }
    return tfidf_scores

# Function to save TF-IDF scores using pickle
def save_tfidf_scores(tfidf_scores, output_file):
    """Pickle ``tfidf_scores`` into ``output_file`` and report completion."""
    with open(output_file, 'wb') as sink:
        pickle.dump(tfidf_scores, sink)
    print("TF-IDF scores calculated and saved.")

# Main function to perform all tasks
def main():
    """Run the TF-IDF stage.

    Loads the pickled DataFrame, fills missing reviews, tokenises them,
    computes TF-IDF scores and pickles the result.
    """
    reviews_df = load_preprocessed_dataframe("/content/drive/My Drive/dataframe.pickle")
    reviews_df = tokenize_and_preprocess_reviews(replace_nan_with_empty_strings(reviews_df))

    scores = calculate_tfidf_scores(reviews_df['Preprocessed_Review'])
    save_tfidf_scores(scores, "/content/drive/My Drive/tfidf_scores.pickle")

# Run the TF-IDF stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


TF-IDF scores calculated and saved.


**IMAGE COSINE SIMILARITY**

In [None]:
import numpy as np
import pickle

# Function to load normalized features and labels from a pickle file
def load_normalized_features_and_labels(file_path):
    """Load the pickled ``(normalized_features, labels)`` pair.

    Returns:
        Tuple ``(normalized_features, labels)`` as stored in ``file_path``.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as source:
        stored = pickle.load(source)
    normalized_features, labels = stored
    return normalized_features, labels

# Function to calculate cosine similarity between two vectors
def cosine_similarity(vector1, vector2):
    """Return the cosine similarity of two 1-D vectors.

    Args:
        vector1: First vector.
        vector2: Second vector of the same length.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or 0.0 when either vector has zero
        norm (the ratio is undefined there and would otherwise be NaN,
        which breaks sorting by similarity).
    """
    denominator = np.linalg.norm(vector1) * np.linalg.norm(vector2)
    if denominator == 0:
        return 0.0
    return np.dot(vector1, vector2) / denominator

# Function to find most similar images for each image
def find_similar_images(feature_vectors, labels, top_k=3):
    """Find, for each feature vector, the most similar other products.

    Every vector is compared with all other vectors; candidates are ranked
    by cosine similarity and the first ``top_k`` distinct labels are kept.

    Args:
        feature_vectors: Sequence of feature vectors.
        labels: Label of each vector, aligned with ``feature_vectors``.
        top_k: Maximum number of distinct labels kept per query.

    Returns:
        Dict mapping a label to a list of ``(label, similarity)`` tuples.
    """
    # NOTE(review): results are keyed by label, so when several vectors
    # share a label the entry of the later vector replaces the earlier one.
    # Candidates carrying the query's own label are not excluded either.
    results = {}
    for query_idx, query_vec in enumerate(feature_vectors):
        scored = [
            (cosine_similarity(query_vec, candidate_vec), labels[cand_idx])
            for cand_idx, candidate_vec in enumerate(feature_vectors)
            if cand_idx != query_idx
        ]
        ranked = sorted(scored, key=lambda pair: pair[0], reverse=True)

        picked = []
        seen_labels = set()
        for score, candidate_label in ranked:
            if len(picked) >= top_k:
                break
            if candidate_label in seen_labels:
                continue
            picked.append((candidate_label, score))
            seen_labels.add(candidate_label)

        results[labels[query_idx]] = picked
    return results

# Function to save results using pickle
def save_results(similar_images, output_file):
    """Pickle ``similar_images`` into ``output_file`` and report completion."""
    with open(output_file, 'wb') as sink:
        pickle.dump(similar_images, sink)
    print("Similar images results saved.")

# Main function to perform all tasks
def main():
    """Run the image-similarity stage.

    Loads the normalised features, finds the most similar products for
    each image and pickles the result.
    """
    features, product_labels = load_normalized_features_and_labels(
        "/content/drive/My Drive/normalized_extracted_features.pickle"
    )
    neighbours = find_similar_images(features, product_labels)
    save_results(neighbours, "/content/drive/My Drive/similar_images_results.pickle")

# Run the image-similarity stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Similar images results saved.


**TEXT COSINE SIMILARITY**

In [None]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Function to load preprocessed reviews from a pickle file
def load_preprocessed_reviews(file_path):
    """Unpickle and return the preprocessed reviews stored at ``file_path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Function to load preprocessed DataFrame from a pickle file
def load_preprocessed_dataframe(file_path):
    """Unpickle and return the object stored at ``file_path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(file_path, 'rb') as source:
        return pickle.load(source)

# Function to create TF-IDF matrix
def create_tfidf_matrix(preprocessed_reviews):
    """Fit a default ``TfidfVectorizer`` on the reviews.

    Returns:
        The matrix produced by ``fit_transform``; the vectorizer itself is
        discarded.
    """
    return TfidfVectorizer().fit_transform(preprocessed_reviews)

# Function to find most similar reviews for each review
def find_similar_reviews(tfidf_matrix, product_ids, top_k=3):
    """Find the ``top_k`` most similar reviews for every review.

    Args:
        tfidf_matrix: TF-IDF matrix with one row per review.
        product_ids: Product ID of each row, aligned with the matrix rows.
        top_k: Number of neighbours kept per review.

    Returns:
        Dict mapping a product ID to a list of ``(product_id, similarity)``
        tuples sorted by decreasing similarity. When a product ID occurs
        more than once, the entry of its last row wins.
    """
    similar_reviews = {}
    num_reviews = tfidf_matrix.shape[0]
    if num_reviews == 0:
        # Nothing to compare against: every review gets an empty list.
        return {product_id: [] for product_id in product_ids}

    for i, product_id in enumerate(product_ids):
        # One row-vs-matrix call yields the similarity to every review at
        # once, instead of one scikit-learn call per pair of reviews.
        row_scores = cosine_similarity(tfidf_matrix[i], tfidf_matrix)[0]
        similarities = [
            (product_ids[j], row_scores[j])
            for j in range(num_reviews)
            if j != i
        ]
        similarities.sort(key=lambda x: x[1], reverse=True)
        similar_reviews[product_id] = similarities[:top_k]

    return similar_reviews

# Function to save results using pickle
def save_results(similar_reviews, output_file):
    """Pickle ``similar_reviews`` into ``output_file`` and report completion."""
    with open(output_file, 'wb') as sink:
        pickle.dump(similar_reviews, sink)
    print("Similar reviews results saved.")

# Main function to perform all tasks
def main():
    """Run the text-similarity stage.

    Loads the preprocessed reviews and the DataFrame, builds the TF-IDF
    matrix, finds the most similar reviews and pickles the result.
    """
    reviews = load_preprocessed_reviews("/content/drive/My Drive/preprocessed_reviews.pickle")
    reviews_df = load_preprocessed_dataframe("/content/drive/My Drive/dataframe.pickle")

    ids = reviews_df['Product_ID'].tolist()
    matrix = create_tfidf_matrix(reviews)
    neighbours = find_similar_reviews(matrix, ids)

    save_results(neighbours, "/content/drive/My Drive/similar_reviews_results.pickle")

# Run the text-similarity stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Similar reviews results saved.


**AVERAGE COMPOSITE SIMILARITY SCORE**

In [None]:
import pickle
import numpy as np
from google.colab import drive

# Function to mount Google Drive
def mount_google_drive():
    """Mount Google Drive at the notebook's fixed mount point."""
    mount_point = '/content/drive'
    drive.mount(mount_point)

# Function to load similarity results from pickle files
def load_similarity_results(image_file_path, review_file_path):
    """Load the pickled image and review similarity results.

    Returns:
        Tuple ``(similar_images, similar_reviews)``; the image file is
        read first.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    loaded = []
    for path in (image_file_path, review_file_path):
        with open(path, 'rb') as source:
            loaded.append(pickle.load(source))
    similar_images, similar_reviews = loaded
    return similar_images, similar_reviews

# Function to calculate average similarity scores
def calculate_average_similarity(similarity_results):
    """Average the scores of each entry's neighbour list.

    Args:
        similarity_results: Dict mapping a key to a list of two-element
            tuples whose second element is a score.

    Returns:
        Dict mapping each key to the ``np.mean`` of its scores.
    """
    return {
        key: np.mean([score for _, score in neighbours])
        for key, neighbours in similarity_results.items()
    }

# Function to calculate composite similarity scores
def calculate_composite_similarity(average_image_similarity, average_review_similarity):
    """Average the image and review score of every image key.

    Only keys of ``average_image_similarity`` are considered; a key that
    is missing from ``average_review_similarity`` raises ``KeyError``.

    Returns:
        Dict mapping each key to the mean of its two scores.
    """
    return {
        key: (image_score + average_review_similarity[key]) / 2
        for key, image_score in average_image_similarity.items()
    }

# Function to save composite similarity scores using pickle
def save_composite_similarity_scores(composite_similarity_scores, output_file):
    """Pickle the composite scores into ``output_file`` and report completion."""
    with open(output_file, 'wb') as sink:
        pickle.dump(composite_similarity_scores, sink)
    print("Composite similarity scores calculated and saved.")

# Main function to perform all tasks
def main():
    """Run the composite-score stage.

    Mounts Drive, loads both similarity results, averages each of them per
    product, combines the two averages and pickles the composite scores.
    """
    mount_google_drive()

    image_neighbours, review_neighbours = load_similarity_results(
        "/content/drive/My Drive/similar_images_results.pickle",
        "/content/drive/My Drive/similar_reviews_results.pickle",
    )
    image_means = calculate_average_similarity(image_neighbours)
    review_means = calculate_average_similarity(review_neighbours)
    composite = calculate_composite_similarity(image_means, review_means)

    save_composite_similarity_scores(
        composite, "/content/drive/My Drive/composite_similarity_scores.pickle"
    )

# Run the composite-score stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Composite similarity scores calculated and saved.


**RANKING**

In [None]:
import pickle

# Function to rank pairs based on composite similarity score and save them
def rank_and_save_pairs(composite_similarity_scores, output_file):
    """Sort the composite scores in descending order and pickle the ranking.

    Args:
        composite_similarity_scores: Dict mapping a key to its score.
        output_file: File that receives the pickled list of
            ``(key, score)`` tuples, highest score first.
    """
    ranked_pairs = list(composite_similarity_scores.items())
    ranked_pairs.sort(key=lambda pair: pair[1], reverse=True)

    with open(output_file, 'wb') as sink:
        pickle.dump(ranked_pairs, sink)

    print("Ranked pairs based on composite similarity score saved.")

# Main function to perform all tasks
def main():
    """Run the ranking stage: load the composite scores, rank and save them."""
    scores_path = "/content/drive/My Drive/composite_similarity_scores.pickle"
    ranked_path = "/content/drive/My Drive/ranked_pairs_based_on_composite_similarity.pickle"

    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(scores_path, 'rb') as source:
        scores = pickle.load(source)

    rank_and_save_pairs(scores, ranked_path)

# Run the ranking stage when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Ranked pairs based on composite similarity score saved.


**RETRIEVAL**

In [None]:
import pandas as pd
import pickle
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def load_preprocessed_reviews(path):
    """Unpickle and return the preprocessed reviews stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as source:
        reviews = pickle.load(source)
    return reviews

def load_dataframe(path):
    """Unpickle and return the object stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as source:
        frame = pickle.load(source)
    return frame

def load_similar_results(images_path, reviews_path):
    """Load the pickled image and review similarity results.

    Returns:
        Tuple ``(similar_images, similar_reviews)``; the image file is
        read first.
    """
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    loaded = []
    for path in (images_path, reviews_path):
        with open(path, 'rb') as source:
            loaded.append(pickle.load(source))
    similar_images, similar_reviews = loaded
    return similar_images, similar_reviews

def create_tfidf_vectorizer(reviews):
    """Fit a default ``TfidfVectorizer`` on ``reviews``.

    Returns:
        Tuple ``(tfidf_matrix, vectorizer)`` with the fitted vectorizer.
    """
    fitted_vectorizer = TfidfVectorizer()
    matrix = fitted_vectorizer.fit_transform(reviews)
    return matrix, fitted_vectorizer

def find_product_id_by_review(text_review, df):
    """Return the Product_ID of the first row whose review equals ``text_review``.

    Args:
        text_review: Review text to look up (exact string comparison).
        df: DataFrame with 'Review Text' and 'Product_ID' columns.

    Returns:
        The matching Product_ID, or None when no row matches.
    """
    matching_ids = (
        row['Product_ID']
        for _, row in df.iterrows()
        if row['Review Text'] == text_review
    )
    return next(matching_ids, None)

def find_product_id(image_url, df):
    """Return the Product_ID of the first row that lists ``image_url``.

    Args:
        image_url: URL to look up.
        df: DataFrame with 'Image' (string form of a URL list) and
            'Product_ID' columns.

    Returns:
        The matching Product_ID, or None when no row lists the URL.
    """
    # Imported locally because this cell's import block has no `ast`.
    import ast

    for _, row in df.iterrows():
        # literal_eval only accepts Python literals, so text coming from
        # the data file can never be executed as code (eval would run it).
        image_urls = ast.literal_eval(row['Image'])
        if image_url in image_urls:
            return row['Product_ID']
    return None

def find_top_similar_images(image_url, similar_images, df):
    """Look up the products most similar to the product showing ``image_url``.

    Args:
        image_url: URL of the query image.
        similar_images: Dict mapping a Product_ID to a list of
            ``(product_id, similarity)`` tuples.
        df: DataFrame with 'Product_ID', 'Image' and 'Review Text' columns.

    Returns:
        List of ``(product_id, image_urls, similarity, review_text)``
        tuples in the order stored in ``similar_images``, or None when the
        URL is unknown or has no stored neighbours.
    """
    product_id = find_product_id(image_url, df)
    if product_id is None:
        return None

    top_similar_images_info = similar_images.get(product_id)
    if not top_similar_images_info:
        return None

    # Index the first row of every product so each neighbour's URLs and
    # review are fetched by ID. Filtering the DataFrame in row order and
    # zipping with the similarity-ordered IDs would pair the wrong rows.
    first_rows = df.drop_duplicates(subset='Product_ID').set_index('Product_ID')

    results = []
    for info in top_similar_images_info:
        similar_id, similarity = info[0], info[1]
        if similar_id not in first_rows.index:
            continue  # no matching row for this neighbour: nothing to show
        row = first_rows.loc[similar_id]
        results.append((similar_id, row['Image'], similarity, row['Review Text']))
    return results

def find_top_similar_reviews(text_review, similar_reviews, df):
    """Look up the products whose reviews are most similar to ``text_review``.

    Whitespace runs in ``text_review`` are collapsed before the review is
    matched against the 'Review Text' column.

    Args:
        text_review: Query review text.
        similar_reviews: Dict mapping a Product_ID to a list of
            ``(product_id, similarity)`` tuples.
        df: DataFrame with 'Product_ID', 'Image' and 'Review Text' columns.

    Returns:
        List of ``(product_id, review_text, similarity, image_urls)``
        tuples in the order stored in ``similar_reviews``, or None when the
        review is unknown or has no stored neighbours.
    """
    text_review = re.sub(r'\s+', ' ', text_review).strip()
    product_id = find_product_id_by_review(text_review, df)
    if product_id is None:
        return None

    top_similar_reviews_info = similar_reviews.get(product_id)
    if not top_similar_reviews_info:
        return None

    # Index the first row of every product so each neighbour's review and
    # URLs are fetched by ID. Filtering the DataFrame in row order and
    # zipping with the similarity-ordered IDs would pair the wrong rows.
    first_rows = df.drop_duplicates(subset='Product_ID').set_index('Product_ID')

    results = []
    for info in top_similar_reviews_info:
        similar_id, similarity = info[0], info[1]
        if similar_id not in first_rows.index:
            continue  # no matching row for this neighbour: nothing to show
        row = first_rows.loc[similar_id]
        results.append((similar_id, row['Review Text'], similarity, row['Image']))
    return results

def calculate_similarity(review1, review2, vectorizer):
    """Return the cosine similarity of two texts under ``vectorizer``.

    Each text is transformed on its own with the already fitted
    ``vectorizer`` and the two resulting vectors are compared.
    """
    first_vec, second_vec = (
        vectorizer.transform([text]) for text in (review1, review2)
    )
    return cosine_similarity(first_vec, second_vec)[0][0]

def load_ranked_pairs(path):
    """Unpickle and return the ranked pairs stored at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only open trusted files.
    with open(path, 'rb') as source:
        ranked = pickle.load(source)
    return ranked

def preprocess_text(text):
    """Collapse whitespace runs in ``text`` to single spaces and strip it.

    Null input (None / NaN) yields an empty string.
    """
    if not pd.isnull(text):
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()
    return ""

def find_top_similar_product(image_url, review_text, ranked_pairs, df, top_k=3):
    """Return details for the first ``top_k`` ranked products found in ``df``.

    Args:
        image_url: Accepted for interface compatibility; not used by the
            lookup.
        review_text: Accepted for interface compatibility; not used by the
            lookup.
        ranked_pairs: Sequence of ``(product_id, composite_score)`` pairs,
            best first.
        df: DataFrame with 'Product_ID', 'Image' and 'Review Text' columns.
        top_k: Maximum number of products to return (default 3).

    Returns:
        List of dicts with keys 'Product_ID', 'Image_URL', 'Review_Text'
        and 'Composite_Similarity_Score', in ranking order. Ranked IDs
        that have no row in ``df`` are skipped.
    """
    similar_products_info = []
    if top_k <= 0:
        return similar_products_info

    for pair in ranked_pairs:
        product_id = pair[0]
        composite_similarity_score = pair[1]

        matches = df[df['Product_ID'] == product_id]
        if matches.empty:
            # A ranked ID without a row would make .iloc[0] raise IndexError.
            continue

        product_info = matches.iloc[0]
        similar_products_info.append({
            'Product_ID': product_id,
            'Image_URL': product_info['Image'],
            'Review_Text': product_info['Review Text'],
            'Composite_Similarity_Score': composite_similarity_score
        })
        if len(similar_products_info) >= top_k:
            break
    return similar_products_info

def main():
    """Interactive retrieval demo.

    Loads the stored artefacts, asks for an image URL and a review text,
    then prints three result sections: neighbours from the image
    similarity results, neighbours from the review similarity results, and
    the top entries of the composite ranking.
    """
    preprocessed_reviews_path = "/content/drive/My Drive/preprocessed_reviews.pickle"
    df_path = "/content/drive/My Drive/dataframe.pickle"
    images_results_path = "/content/drive/My Drive/similar_images_results.pickle"
    reviews_results_path = "/content/drive/My Drive/similar_reviews_results.pickle"
    ranked_pairs_path = "/content/drive/My Drive/ranked_pairs_based_on_composite_similarity.pickle"

    preprocessed_reviews = load_preprocessed_reviews(preprocessed_reviews_path)
    df = load_dataframe(df_path)
    similar_images, similar_reviews = load_similar_results(images_results_path, reviews_results_path)
    # Only `vectorizer` is used below; `tfidf_matrix` is not read again.
    tfidf_matrix, vectorizer = create_tfidf_vectorizer(preprocessed_reviews)

    image_url_input = input("Enter the image URL: ")
    text_review_input = input("Enter the text review: ")

    top_similar_images = find_top_similar_images(image_url_input, similar_images, df)
    top_similar_reviews = find_top_similar_reviews(text_review_input, similar_reviews, df)

    if top_similar_images:
        print("\nUSING IMAGE RETRIEVAL\n")
        for i, similar_image_info in enumerate(top_similar_images, 1):
            product_id, image_url, similarity_score, similar_text_review = similar_image_info
            print(f"{i}) Product ID: {product_id}")
            print(f"Image URL: {image_url}")
            print(f"Review: {similar_text_review}")
            print(f"Cosine similarity of images - {similarity_score:.4f}")
            # NOTE(review): the vectorizer was fitted on the preprocessed
            # reviews but is applied here to the raw input and raw review
            # text -- confirm that this mismatch is acceptable.
            text_similarity_score = calculate_similarity(text_review_input, similar_text_review, vectorizer)
            print(f"Cosine similarity of text - {text_similarity_score:.3f}")
            composite_similarity_score = (similarity_score + text_similarity_score) / 2
            print(f"Composite similarity score: {composite_similarity_score:.4f}")
            print()
    else:
        print("No similar images found for the input image URL.")

    if top_similar_reviews:
        print("\nUSING TEXT RETRIEVAL\n")
        for i, similar_review_info in enumerate(top_similar_reviews, 1):
            similar_product_id, similar_review_text, similarity_score_text, similar_image_url = similar_review_info
            # NOTE(review): although it is printed as "Cosine similarity of
            # images", this value is a text similarity: the review of the
            # first row whose 'Image' string contains the input URL is
            # compared with the candidate review. No image features are
            # used. `.iloc[0]` raises IndexError when no row contains the
            # URL, and the row lookup is repeated on every iteration.
            similarity_score_image = calculate_similarity(df[df['Image'].apply(lambda x: image_url_input in x)]['Review Text'].iloc[0], similar_review_text, vectorizer)
            composite_similarity_score = (similarity_score_text + similarity_score_image) / 2
            print(f"{i}) Product ID: {similar_product_id}")
            print(f"Image URL: {similar_image_url}")
            print(f"Review: {similar_review_text}")
            print(f"Cosine similarity of images - {similarity_score_image:.4f}")
            print(f"Cosine similarity of text - {similarity_score_text:.3f}")
            print(f"Composite similarity score: {composite_similarity_score:.4f}")
            print()
    else:
        print("No similar text reviews found for the input text review.")

    ranked_pairs = load_ranked_pairs(ranked_pairs_path)

    # NOTE(review): find_top_similar_product does not use the two inputs it
    # is given here, so this section prints the same top-ranked products
    # for every query.
    similar_products = find_top_similar_product(image_url_input, text_review_input, ranked_pairs, df)

    print("\nCOMPOSITE RETRIEVAL\n")
    for product in similar_products:
        print("Product ID:", product['Product_ID'])
        print("Image URL:", product['Image_URL'])
        print("Review Text:", product['Review_Text'])
        print("Composite Similarity Score:", product['Composite_Similarity_Score'])
        print()

# Start the interactive retrieval demo when this cell is executed as the main module.
if __name__ == "__main__":
    main()


Enter the image URL: https://images-na.ssl-images-amazon.com/images/I/81q5+IxFVUL._SY88.jpg
Enter the text review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.

USING IMAGE RETRIEVAL

1) Product ID: 3637
Image URL: ['https://images-na.ssl-images-amazon.com/images/I/71nsBodxLXL._SY88.jpg', 'https://images-na.ssl-images-amazon.com/images/I/71WSo7HvxkL._SY88.jpg']
Review: Way better than I was expecting. I was recently upgrading all the parts on my first strat, and a buddy told me the trem system upgrade would be the biggest. I'm glad I listened to him and got this, because it was night and day. I'm not sure if it's just me, because I'm using my old springs, but my guitar stays in tune better now when using the trem bar. I uploaded a photo to show how small and light my old trem block was compared to this one. Side note...everything fit per