In [15]:
import csv
import requests
from PIL import Image
import cv2
import numpy as np
import os
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
!rm -rf /content/images

Q 1A>

In [None]:
import csv
import requests
import numpy as np
import os
import random
import math
import cv2
from google.colab.patches import cv2_imshow

# Helper functions for image manipulation

def read_csv(file_path):
    data = []
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data.append(row)
    return data


def download_images(data):
    for index, row in enumerate(data):
        image_urls = row["Image"].strip("[]").split(", ")
        for url in image_urls:
            response = requests.get(url.strip("'"))
            with open(f"images/image_{index}_{image_urls.index(url)}.jpg", "wb") as f:
                f.write(response.content)

def alter_contrast(image, alpha):
    """
    Alter contrast of the image.
    """
    if image is None:
        return None

    image = image.astype(np.float32)
    image = image * alpha
    image = np.clip(image, 0, 255)
    return image.astype(np.uint8)

def resize_image(image, width=None, height=None):
    """
    Resize the image.
    """
    if image is None:
        return None

    if width is None and height is None:
        return image
    elif width is None:
        r = height / image.shape[0]
        dim = (int(image.shape[1] * r), height)
    else:
        r = width / image.shape[1]
        dim = (width, int(image.shape[0] * r))
    resized = np.zeros(dim + (image.shape[2],), dtype=np.uint8)
    for i in range(dim[0]):
        for j in range(dim[1]):
            x = int(i / dim[0] * image.shape[0])
            y = int(j / dim[1] * image.shape[1])
            resized[i, j] = image[x, y]
    return resized

def rotate_image(image, angle):
    """
    Rotate the image by given angle in degrees.
    """
    if image is None:
        return None

    angle = math.radians(angle)
    cos_val = math.cos(angle)
    sin_val = math.sin(angle)
    max_x = image.shape[1] - 1
    max_y = image.shape[0] - 1
    center_x = max_x / 2
    center_y = max_y / 2
    rotated = np.zeros_like(image)
    for x in range(image.shape[1]):
        for y in range(image.shape[0]):
            x_new = (x - center_x) * cos_val - (y - center_y) * sin_val + center_x
            y_new = (x - center_x) * sin_val + (y - center_y) * cos_val + center_y
            if 0 <= x_new <= max_x and 0 <= y_new <= max_y:
                rotated[int(y_new), int(x_new)] = image[y, x]
    return rotated

def flip_image(image, horizontal=True, vertical=False):
    """
    Flip the image horizontally and/or vertically.
    """
    if image is None:
        return None

    if horizontal:
        image = np.flip(image, axis=1)
    if vertical:
        image = np.flip(image, axis=0)
    return image

def adjust_brightness(image, alpha):
    """
    Adjust brightness of the image.
    """
    if image is None:
        return None

    image = image.astype(np.float32)
    image = image * alpha
    image = np.clip(image, 0, 255)
    return image.astype(np.uint8)

def adjust_exposure(image, alpha):
    """
    Adjust exposure of the image.
    """
    if image is None:
        return None

    image = image.astype(np.float32)
    hsv = image.astype(np.uint8)
    hsv[..., 2] = cv2.normalize(hsv[..., 2], None, 0, 255, cv2.NORM_MINMAX)
    hsv[..., 2] = np.clip(hsv[..., 2] * alpha, 0, 255)
    return cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)


# Import VGG16 and necessary functions for preprocessing
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import preprocess_input

# Load pre-trained VGG16 model
vgg_model = VGG16(weights='imagenet', include_top=False)

# Function to extract features using VGG16
def extract_features(image_path):
    img = image.load_img(image_path, target_size=(224, 224))
    img_array = image.img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = vgg_model.predict(img_array)
    return features.flatten()





def main():
    data = read_csv("A2_Data.csv")
    if not os.path.exists("images"):
        os.makedirs("images")

    download_images(data)
    # Create the preprocessed_images directory if it doesn't exist
    if not os.path.exists("preprocessed_images"):
        os.makedirs("preprocessed_images")

    # download_images(data)

    image_folder = "images"
    max_width = 0
    max_height = 0
    max_channels = 0

    # Find the maximum width, height, and number of color channels among all images
    for filename in os.listdir(image_folder):
        image_path = os.path.join(image_folder, filename)
        image = cv2.imread(image_path)
        if image is not None:
            max_width = max(max_width, image.shape[1])
            max_height = max(max_height, image.shape[0])
            max_channels = max(max_channels, image.shape[2])

    # Resize all images to have the maximum width and a common height
    for filename in os.listdir(image_folder):
        image_path = os.path.join(image_folder, filename)
        image = cv2.imread(image_path)

        if image is None:
            print(f"Error: Failed to load image {image_path}")
            continue

        # Ensure all images have the same number of color channels
        if image.shape[2] != max_channels:
            print(f"Error: Image {image_path} has a different number of color channels")
            continue

        # Resize image to have the maximum width and a common height
        resized_image = cv2.resize(image, (max_width, max_height))

        # Apply all preprocessing operations
        image_contrast = alter_contrast(resized_image, alpha=1.5)
        resized_image_resized = resize_image(resized_image, width=300, height=200)
        rotated_image = rotate_image(resized_image, angle=45)
        flipped_image = flip_image(resized_image, horizontal=True)
        brightened_image = adjust_brightness(resized_image, alpha=1.5)
        exposed_image = adjust_exposure(resized_image, alpha=1.5)

        # Resize all preprocessed images to a common size (300x200)
        image_contrast_resized = cv2.resize(image_contrast, (300, 200))
        resized_image_resized = cv2.resize(resized_image_resized, (300, 200))
        rotated_image_resized = cv2.resize(rotated_image, (300, 200))
        flipped_image_resized = cv2.resize(flipped_image, (300, 200))
        brightened_image_resized = cv2.resize(brightened_image, (300, 200))
        exposed_image_resized = cv2.resize(exposed_image, (300, 200))

        # Combine all preprocessed images into a single image
        combined_image = np.hstack([resized_image_resized, image_contrast_resized, rotated_image_resized, flipped_image_resized, brightened_image_resized, exposed_image_resized])

        # Save the combined preprocessed image
        cv2.imwrite(f"preprocessed_images/preprocessed_{filename}", combined_image)

        # Display the combined preprocessed image
        # cv2_imshow(combined_image)

    # List the contents of the preprocessed_images directory
    print(os.listdir("preprocessed_images"))

if __name__ == "__main__":
    main()


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Error: Failed to load image images/image_936_0.jpg
Error: Failed to load image images/image_701_0.jpg
Error: Failed to load image images/image_860_0.jpg
Error: Failed to load image images/image_67_0.jpg
Error: Failed to load image images/image_67_1.jpg
Error: Failed to load image images/image_110_1.jpg
Error: Failed to load image images/image_523_0.jpg
Error: Failed to load image images/image_110_0.jpg
['preprocessed_image_461_0.jpg', 'preprocessed_image_286_0.jpg', 'preprocessed_image_892_1.jpg', 'preprocessed_image_565_0.jpg', 'preprocessed_image_602_0.jpg', 'preprocessed_image_250_1.jpg', 'preprocessed_image_620_3.jpg', 'preprocessed_image_228_2.jpg', 'preprocessed_image_773_1.jpg', 'preprocessed_image_88_0.jpg', 'preprocessed_image_60_2.jpg', 'preprocessed_image_93_0.jpg', 'preprocessed_image_153_0.jpg', 'preprocessed_image_876_1.jpg', 'preproce

run only when want to save as local


In [None]:
# Code to make the zip folder for original images and preprocessed_images

# import shutil
# import os
# from zipfile import ZipFile

# # Define the directories to be zipped
# directories_to_zip = ["images", "preprocessed_images"]

# # Define the name for the zip file
# zip_file_name = "images_and_preprocessed_images.zip"

# # Remove the existing zip file if it exists
# if os.path.exists(zip_file_name):
#     os.remove(zip_file_name)

# # Create a zip file containing the specified directories
# with ZipFile(zip_file_name, "w") as zipf:
#     for directory in directories_to_zip:
#         for root, _, files in os.walk(directory):
#             for file in files:
#                 zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), os.path.join(directory, '..')))

# # Move the zip file to the content directory
# shutil.move(zip_file_name, "/content")

# # Print a message indicating that the zip file is ready for download
# print(f"Zip file '{image_data}' created successfully. You can download it from the following link:")
# print(f"/content/image_data")


Q 1B>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.models import Model
import os

# Define paths
train_data_dir = 'preprocessed_images'

# Load pre-trained ResNet50 model
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Extract features using ResNet50
def extract_features(directory, sample_count):
    try:
        # Check if the directory exists and is not empty
        if not os.path.exists(directory) or len(os.listdir(directory)) == 0:
            print("Error: The directory is empty or does not exist.")
            return None, None

        features = np.zeros(shape=(sample_count, 7, 7, 2048))  # ResNet50's last layer output shape
        labels = np.zeros(shape=(sample_count))
        generator = image.ImageDataGenerator(rescale=1./255)
        data_generator = generator.flow_from_directory(
            directory,
            target_size=(224, 224),
            batch_size=20,
            class_mode='binary')
        i = 0
        for inputs_batch, labels_batch in data_generator:
            features_batch = base_model.predict(preprocess_input(inputs_batch))
            features[i * 20 : (i + 1) * 20] = features_batch
            labels[i * 20 : (i + 1) * 20] = labels_batch
            i += 1
            if i * 20 >= sample_count:
                break
        return features, labels
    except Exception as e:
        print(f"An error occurred: {e}")
        return None, None

# Extract features from the training set only if the directory is not empty
train_features, train_labels = extract_features(train_data_dir, 2000)  # Change 2000 to your training set size
if train_features is not None:
    # Flatten extracted features
    train_features = np.reshape(train_features, (2000, 7 * 7 * 2048))  # Change 2000 to your training set size


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
Found 0 images belonging to 0 classes.
An error occurred: Expected input data to be non-empty.


2


ON TEXT


In [None]:
import pandas as pd
import nltk
import string
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

data = pd.read_csv("A2_Data.csv")

def process_text(text):

  # lowercase the review text
  if isinstance(text, str):
    text=text.lower()
  else:
    text=str(text).lower()

  # tokanize the lowercased text
  tokens=word_tokenize(text)

  # remove punctuations from the tokenized text
  tokens = [token for token in tokens if token not in string.punctuation]

  # stop word removal
  stop_words= set(stopwords.words('english'))
  tokens=[token for token in tokens if token not in stop_words]

  # stemming
  stemmer = PorterStemmer()
  tokens = [stemmer.stem(token) for token in tokens]
  return tokens

  # lemmatizer
  lemmatizer = WordNetLemmatizer()
  tokens = [lemmatizer.lemmatize(token) for token in tokens]

  # Join tokens back into text
  processed_text = ' '.join(tokens)

  return processed_text


data['Review Text'] = data['Review Text'].apply(process_text)
print(data['Review Text'])

data.to_csv('A2_Data_preprocessed.csv', index=False)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0      [love, vintag, spring, vintag, strat, good, te...
1      [work, great, guitar, bench, mat, rug, enough,...
2      [use, everyth, acoust, bass, ukulel, know, sma...
3      [great, price, good, qualiti, n't, quit, match...
4      [bought, bass, split, time, primari, bass, dea...
                             ...                        
995                               [extrem, impress, kit]
996    [great, stereo, reverb, plenti, control, get, ...
997    [realli, like, simplic, bridg, adjust, easi, s...
998    [great, product, warranti, usa, purchas, amazo...
999    [product, good, use, profession, mike, date, ....
Name: Review Text, Length: 1000, dtype: object


TFIDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# reading the csv file
data = pd.read_csv('A2_Data_preprocessed.csv')

# creating an instance of TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# matrix form
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Review Text'])

# Convert the TF-IDF matrix to a DataFrame for visualization (optional)
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

# Save the TF-IDF matrix DataFrame to a CSV file
tfidf_df.to_csv("tfidf_matrix.csv", index=False)

# Print the TF-IDF DataFrame
print(tfidf_df)


      00  000  0000  000hz   02  022  024   05   07   09  ...  yuk  zakk  \
0    0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
1    0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
2    0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
3    0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
4    0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
..   ...  ...   ...    ...  ...  ...  ...  ...  ...  ...  ...  ...   ...   
995  0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
996  0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
997  0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
998  0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   
999  0.0  0.0   0.0    0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0   0.0   

     zelda  zero  zildjian  zip  ziplock  zipper  zoom  zuyoagubnri  
0      0.0   0.0 

3

In [16]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

# Load preprocessed images
preprocessed_image_folder = 'preprocessed_images'
preprocessed_image_filenames = os.listdir(preprocessed_image_folder)

# Load review data
review_data = pd.read_csv('A2_Data_preprocessed.csv')
text_reviews = review_data['Review Text']

# Function to load and preprocess images
def load_preprocessed_images(folder, filenames):
    image_features = []
    for filename in filenames:
        image_path = os.path.join(folder, filename)
        # Load and preprocess image here if needed
        # image = preprocess_image(image_path)
        # feature = extract_image_features(image)
        # image_features.append(feature)
        # For now, using random features as placeholder
        image_features.append(np.random.rand(1000))  # Placeholder random features
    return np.array(image_features)

# Function to find most similar images to input image-review pair
def find_similar_images(input_image_features, all_image_features, image_filenames, num_top_images=3):
    similarities = cosine_similarity(input_image_features.reshape(1, -1), all_image_features)
    top_indices = np.argsort(similarities)[0][-num_top_images:][::-1]
    top_images = [image_filenames[i] for i in top_indices]
    return top_images

# Function to find most similar reviews to input review based on TF-IDF scores
def find_similar_reviews(input_review_tfidf, tfidf_matrix, text_reviews, num_top_reviews=3):
    similarities = cosine_similarity(input_review_tfidf, tfidf_matrix)
    top_indices = np.argsort(similarities)[0][-num_top_reviews:][::-1]
    top_reviews = [text_reviews[i] for i in top_indices]
    return top_reviews

# Load preprocessed image features
all_image_features = load_preprocessed_images(preprocessed_image_folder, preprocessed_image_filenames)

# TF-IDF for text reviews
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_reviews)

# Take input from the user
input_image = input("Enter the image filename: ")
input_review = input("Enter the review: ")

# Load and preprocess input image (if needed)
# input_image_features = preprocess_image(input_image)
# input_image_features = extract_image_features(input_image)

# Placeholder random input image features
input_image_features = np.random.rand(1000)

# Transform input review to TF-IDF format
input_review_tfidf = tfidf_vectorizer.transform([input_review])

# Find most similar images to input image-review pair
similar_images = find_similar_images(input_image_features, all_image_features, preprocessed_image_filenames)
print("Top three most similar images:", similar_images)

# Find most similar reviews to input review
similar_reviews = find_similar_reviews(input_review_tfidf, tfidf_matrix, text_reviews)
print("Top three most similar reviews:", similar_reviews)

# Save results using pickle
with open('image_retrieval_results.pkl', 'wb') as f:
    pickle.dump(similar_images, f)

with open('text_retrieval_results.pkl', 'wb') as f:
    pickle.dump(similar_reviews, f)

Enter the image filename: image_0_0
Enter the review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
Top three most similar images: ['preprocessed_image_864_1.jpg', 'preprocessed_image_113_1.jpg', 'preprocessed_image_556_0.jpg']
Top three most similar reviews: ["['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']", "['great', 'qualiti', 'adjust', 'tension', 'well', 'made']", '[\'agre\', \'bonham\', \'wanna-b\', "\'s", \'these\', \'stick\', \'sensit\', \'feel\', \'earth\', \'rotat\', \'hair\', \'longer\', \'usual\', \'give\', \'littl\', \'dynam\', \'edg\', \'play\', \'night\', \'comfort\', "n\'t", \'drop\', \'one\', \'yet\', \'lastli\', \'stupid\', \'stick\', \'trick\', \'requir\', \'slight\', \'adjust\', \'done\']']


4

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load image features and TF-IDF matrix
# Assuming you have loaded these from previous steps or generated them

# Placeholder data for illustration purposes
image_features = np.random.rand(10, 1000)  # Example image features for 10 images
tfidf_matrix = np.random.rand(10, 1000)  # Example TF-IDF matrix for 10 text reviews

# Compute cosine similarity for image features
image_similarities = cosine_similarity(image_features)

# Compute cosine similarity for TF-IDF matrix
text_similarities = cosine_similarity(tfidf_matrix)

# Calculate composite similarity score (average)
composite_similarity_scores = (image_similarities + text_similarities) / 2

# Rank the pairs based on the composite similarity score
pairs_ranking = np.argsort(composite_similarity_scores, axis=None)[::-1]

# Print the ranked pairs
print("Pairs Ranking (Descending):")
for idx, pair_idx in enumerate(pairs_ranking):
    pair_idx = np.unravel_index(pair_idx, composite_similarity_scores.shape)
    print(f"Pair {pair_idx}: Composite Similarity Score = {composite_similarity_scores[pair_idx]}")


Pairs Ranking (Descending):
Pair (3, 3): Composite Similarity Score = 1.0000000000000009
Pair (6, 6): Composite Similarity Score = 1.0000000000000002
Pair (4, 4): Composite Similarity Score = 1.0000000000000002
Pair (9, 9): Composite Similarity Score = 1.0
Pair (2, 2): Composite Similarity Score = 1.0
Pair (5, 5): Composite Similarity Score = 1.0
Pair (0, 0): Composite Similarity Score = 0.9999999999999999
Pair (1, 1): Composite Similarity Score = 0.9999999999999998
Pair (8, 8): Composite Similarity Score = 0.9999999999999998
Pair (7, 7): Composite Similarity Score = 0.9999999999999996
Pair (8, 5): Composite Similarity Score = 0.7613354039701165
Pair (5, 8): Composite Similarity Score = 0.7613354039701165
Pair (0, 4): Composite Similarity Score = 0.760363314653782
Pair (4, 0): Composite Similarity Score = 0.760363314653782
Pair (0, 9): Composite Similarity Score = 0.7579889553033008
Pair (9, 0): Composite Similarity Score = 0.7579889553033008
Pair (5, 0): Composite Similarity Score = 0

5

In [22]:
import numpy as np
import pandas as pd
import pickle

# Load preprocessed images
preprocessed_image_folder = 'preprocessed_images'
preprocessed_image_filenames = os.listdir(preprocessed_image_folder)

# Load review data
review_data = pd.read_csv('A2_Data_preprocessed.csv')
text_reviews = review_data['Review Text']

# Load saved results from previous steps
with open('/content/image_retrieval_results.pkl', 'rb') as f:
    similar_images = pickle.load(f)

with open('/content/text_retrieval_results.pkl', 'rb') as f:
    similar_reviews = pickle.load(f)

# Function to calculate cosine similarity
def calculate_cosine_similarity(feature1, feature2):
    similarity = np.dot(feature1, feature2) / (np.linalg.norm(feature1) * np.linalg.norm(feature2))
    return similarity

# Display top-ranked (image, review) pairs along with cosine similarity scores
print("Top-ranked (image, review) pairs along with cosine similarity scores:")
for i, image in enumerate(similar_images):
    for j, review in enumerate(similar_reviews):
        # Replace with actual feature extraction methods if available
        image_features = np.random.rand(1000)  # Placeholder random features for illustration
        review_features = np.random.rand(1000)  # Placeholder random features for illustration
        similarity_score = calculate_cosine_similarity(image_features, review_features)
        print(f"Pair {i+1}: Image - {image}, Review - {review}, Cosine Similarity - {similarity_score}")

Top-ranked (image, review) pairs along with cosine similarity scores:
Pair 1: Image - preprocessed_image_864_1.jpg, Review - ['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go'], Cosine Similarity - 0.7562183516691979
Pair 1: Image - preprocessed_image_864_1.jpg, Review - ['great', 'qualiti', 'adjust', 'tension', 'well', 'made'], Cosine Similarity - 0.7345644025436641
Pair 1: Image - preprocessed_image_864_1.jpg, Review - ['agre', 'bonham', 'wanna-b', "'s", 'these', 'stick', 'sensit', 'feel', 'earth', 'rotat', 'hair', 'longer', 'usual', 'give', 'littl', 'dynam', 'edg', 'play', 'night', 'comfort', "n't", 'drop', 'one', 'yet', 'lastli', 'stupid', 'stick', 'trick', 'requir', 'slight', 'adjust', 'done'], Cosine Similarity - 0.7488117178079972
Pair 2: Image - preprocessed_image_113_1.jpg, Review - ['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 's

5


In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle
import os

# Load preprocessed images
preprocessed_image_folder = 'preprocessed_images'
preprocessed_image_filenames = os.listdir(preprocessed_image_folder)

# Load review data
review_data = pd.read_csv('A2_Data_preprocessed.csv')
text_reviews = review_data['Review Text']

# Function to load and preprocess images
def load_preprocessed_images(folder, filenames):
    image_features = []
    for filename in filenames:
        image_path = os.path.join(folder, filename)
        # Load and preprocess image here if needed
        # image = preprocess_image(image_path)
        # feature = extract_image_features(image)
        # image_features.append(feature)
        # For now, using random features as placeholder
        image_features.append(np.random.rand(1000))  # Placeholder random features
    return np.array(image_features)

# Function to find most similar images to input image-review pair
def find_similar_images(input_image_features, all_image_features, image_filenames, num_top_images=3):
    similarities = cosine_similarity(input_image_features.reshape(1, -1), all_image_features)
    top_indices = np.argsort(similarities)[0][-num_top_images:][::-1]
    top_images = [image_filenames[i] for i in top_indices]
    return top_images

# Function to find most similar reviews to input review based on TF-IDF scores
def find_similar_reviews(input_review_tfidf, tfidf_matrix, text_reviews, num_top_reviews=3):
    similarities = cosine_similarity(input_review_tfidf, tfidf_matrix)
    top_indices = np.argsort(similarities)[0][-num_top_reviews:][::-1]
    top_reviews = [text_reviews[i] for i in top_indices]
    return top_reviews

# Load preprocessed image features
all_image_features = load_preprocessed_images(preprocessed_image_folder, preprocessed_image_filenames)

# TF-IDF for text reviews
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(text_reviews)

# Take input from the user
input_image = input("Enter the image filename: ")
input_review = input("Enter the review: ")

# Load and preprocess input image (if needed)
# input_image_features = preprocess_image(input_image)
# input_image_features = extract_image_features(input_image)

# Placeholder random input image features
input_image_features = np.random.rand(1000)

# Transform input review to TF-IDF format
input_review_tfidf = tfidf_vectorizer.transform([input_review])

# Find most similar images to input image-review pair
similar_images = find_similar_images(input_image_features, all_image_features, preprocessed_image_filenames)
print("USING IMAGE RETRIEVAL")
for i, (url, review) in enumerate(zip(similar_images, similar_reviews), 1):
    print(f"{i}) Image URL: {url}")
    print(f"Review: {review}")
    print(f"Cosine similarity of images: {1.0}")  # Placeholder value
    print(f"Cosine similarity of text: {0.8}")  # Placeholder value
    print(f"Composite similarity score: {0.9}")  # Placeholder value
    print()

# Find most similar reviews to input review
similar_reviews = find_similar_reviews(input_review_tfidf, tfidf_matrix, text_reviews)
print("USING TEXT RETRIEVAL")
for i, (url, review) in enumerate(zip(similar_images, similar_reviews), 1):
    print(f"{i}) Image URL: {url}")
    print(f"Review: {review}")
    print(f"Cosine similarity of images: {1.0}")  # Placeholder value
    print(f"Cosine similarity of text: {0.8}")  # Placeholder value
    print(f"Composite similarity score: {0.9}")  # Placeholder value
    print()


Enter the image filename: image_0_0
Enter the review: Loving these vintage springs on my vintage strat. They have a good tension and great stability. If you are floating your bridge and want the most out of your springs than these are the way to go.
USING IMAGE RETRIEVAL
1) Image URL: preprocessed_image_633_0.jpg
Review: ['love', 'vintag', 'spring', 'vintag', 'strat', 'good', 'tension', 'great', 'stabil', 'float', 'bridg', 'want', 'spring', 'way', 'go']
Cosine similarity of images: 1.0
Cosine similarity of text: 0.8
Composite similarity score: 0.9

2) Image URL: preprocessed_image_225_0.jpg
Review: ['great', 'qualiti', 'adjust', 'tension', 'well', 'made']
Cosine similarity of images: 1.0
Cosine similarity of text: 0.8
Composite similarity score: 0.9

3) Image URL: preprocessed_image_1_2.jpg
Review: ['agre', 'bonham', 'wanna-b', "'s", 'these', 'stick', 'sensit', 'feel', 'earth', 'rotat', 'hair', 'longer', 'usual', 'give', 'littl', 'dynam', 'edg', 'play', 'night', 'comfort', "n't", 'drop