In [7]:
from surprise import Dataset, Reader
from surprise.model_selection import train_test_split
from surprise import SVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
from scipy.sparse import hstack



In [8]:
# --- Configuration: input/output locations and the columns to keep ---
file_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_subset.csv"
output_file_path = "C:/Users/HP/Desktop/Dataset/archive/Svd_Dataset.csv"
desired_columns = ['UserId', 'ProductId', 'Rating', 'Timestamp']

# Read the full subset, then narrow it to the user/item/rating/timestamp columns
df = pd.read_csv(file_path)
selected_df = df.loc[:, desired_columns]

# Persist the narrowed table (without the pandas index) for the Surprise loader
selected_df.to_csv(output_file_path, index=False)


In [9]:

# Paths to the CSV files
amazon_beauty_subset_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_subset.csv"
svd_dataset_subset_path = "C:/Users/HP/Desktop/Dataset/archive/Svd_Dataset.csv"
filtered_csv_train_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_filtered_train.csv"
filtered_csv_test_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_filtered_test.csv"

# Fixed seed: without it the split (and both CSVs written below) changes on every run
SPLIT_RANDOM_STATE = 42

# Load the amazon_beauty_subset.csv file (all columns, used for the content-based features)
amazon_beauty_subset_df = pd.read_csv(amazon_beauty_subset_path)

# Load the user/item/rating/timestamp file using Surprise's Dataset class;
# skip_lines=1 skips the CSV header row
reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(1, 5), skip_lines=1)
data = Dataset.load_from_file(svd_dataset_subset_path, reader=reader)

# Split the data into train (70%) and test (30%) sets, reproducibly
trainset, testset = train_test_split(data, test_size=0.3, random_state=SPLIT_RANDOM_STATE)

# Convert trainset to a pandas DataFrame, mapping Surprise's inner ids back to raw ids
trainset_df = pd.DataFrame(
    [
        {
            'UserId': trainset.to_raw_uid(uid),
            'ProductId': trainset.to_raw_iid(iid),
            'Rating': rating,
        }
        for uid, user_ratings in trainset.ur.items()
        for iid, rating in user_ratings
    ]
)

# Convert testset (an iterable of (user, item, rating) triples) to a pandas DataFrame
testset_df = pd.DataFrame(
    [{'UserId': uid, 'ProductId': iid, 'Rating': rating} for uid, iid, rating in testset]
)

# Match rows on the (UserId, ProductId) pair. A MultiIndex membership test is
# vectorised, unlike building a Python tuple per row with apply(tuple, axis=1).
pair_columns = ['UserId', 'ProductId']
all_pairs = pd.MultiIndex.from_frame(amazon_beauty_subset_df[pair_columns])
train_pairs = pd.MultiIndex.from_frame(trainset_df[pair_columns])
test_pairs = pd.MultiIndex.from_frame(testset_df[pair_columns])

# Keep only the full-featured rows whose pair landed in the train / test split
content_trainset = amazon_beauty_subset_df[all_pairs.isin(train_pairs)]
content_testset = amazon_beauty_subset_df[all_pairs.isin(test_pairs)]

# Save the filtered DataFrames to new CSV files
content_trainset.to_csv(filtered_csv_train_path, index=False)
content_testset.to_csv(filtered_csv_test_path, index=False)

print("Filtered CSV train and test files have been saved.")


Filtered CSV train and test files have been saved.


In [10]:
# Collaborative-filtering model. random_state pins the random initialisation so
# the fitted model (and every metric computed from it) is reproducible.
svd = SVD(random_state=42)
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19c644a6690>

In [11]:
# Locations of the filtered train/test CSV files
Trainset_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_filtered_train.csv"
Testset_path = "C:/Users/HP/Desktop/Dataset/archive/amazon_beauty_filtered_test.csv"

# Read both pre-split tables in one go (train first, then test)
trainset_df, testset_df = (pd.read_csv(csv_path) for csv_path in (Trainset_path, Testset_path))

# Distinct products seen in training; these are the recommendation candidates
item_ids = trainset_df['ProductId'].unique()

# Feature (product type text) and target (rating) for each split
X_train, y_train = trainset_df['ProductType'], trainset_df['Rating']
X_test, y_test = testset_df['ProductType'], testset_df['Rating']

print("Data has been loaded and split successfully.")


Data has been loaded and split successfully.


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder

# Load your pre-split datasets
trainset_df = pd.read_csv(Trainset_path)
testset_df = pd.read_csv(Testset_path)

id_columns = ['UserId', 'ProductId']

# Fit the encoder on the ids of BOTH splits so that users/products appearing
# only in the test set are known categories and do not raise at transform time
all_data = pd.concat([trainset_df[id_columns], testset_df[id_columns]])
encoder = OneHotEncoder()
encoder.fit(all_data)

# Training features: one-hot user/product ids + TF-IDF of the product type text
X_train_user_product = encoder.transform(trainset_df[id_columns])
tfidf_vectorizer = TfidfVectorizer()
X_train_product_type_tfidf = tfidf_vectorizer.fit_transform(trainset_df['ProductType'])
X_train = hstack([X_train_user_product, X_train_product_type_tfidf])

# Train the model
model = LinearRegression()
model.fit(X_train, trainset_df['Rating'])

# Predict every test row in a single batch. Transforming and predicting one row
# at a time inside iterrows() repeats the per-call overhead for each row and
# feeds plain lists to an encoder that was fitted on a DataFrame.
if testset_df.empty:
    # Nothing to predict; still write a header-only CSV below
    predicted_ratings = np.array([])
else:
    X_test_user_product = encoder.transform(testset_df[id_columns])
    X_test_product_type_tfidf = tfidf_vectorizer.transform(testset_df['ProductType'])
    X_test_features = hstack([X_test_user_product, X_test_product_type_tfidf])
    predicted_ratings = model.predict(X_test_features)

# Create a dataframe to store the predicted ratings (one row per test-set row)
predicted_ratings_df = pd.DataFrame({
    'UserId': testset_df['UserId'],
    'ProductId': testset_df['ProductId'],
    'PredictedRating': predicted_ratings,
})

# Save the dataframe to a CSV file
predicted_ratings_df.to_csv("predicted_ratings.csv", index=False)


In [13]:


def hybrid_recommendations(user_id, svd_model, item_ids, item_similarities, top_n, alpha):
    """Return the ids of the ``top_n`` items with the highest hybrid score for a user.

    The hybrid score of an item is
    ``alpha * svd_estimate + (1 - alpha) * content_rating``.

    Parameters
    ----------
    user_id
        Raw id of the user to recommend for.
    svd_model
        Object whose ``predict(user_id, item_id)`` returns something with an
        ``.est`` attribute (the collaborative-filtering estimate).
    item_ids
        Iterable of candidate item ids.
    item_similarities
        DataFrame with ``UserId``, ``ProductId`` and ``PredictedRating``
        columns holding the content-based predictions.
    top_n
        Number of item ids to return.
    alpha
        Weight of the collaborative estimate; ``1 - alpha`` weights the
        content-based rating.

    Returns
    -------
    list
        Item ids ordered by descending hybrid score; ties keep the order in
        which the items appear in ``item_ids``.
    """
    # Select this user's content-based predictions once, instead of scanning
    # the whole DataFrame again for every candidate item.
    is_user = item_similarities['UserId'] == user_id
    user_rows = item_similarities.loc[is_user, ['ProductId', 'PredictedRating']]

    # setdefault keeps the FIRST prediction when a product appears more than once
    content_by_item = {}
    for product_id, rating in zip(user_rows['ProductId'], user_rows['PredictedRating']):
        content_by_item.setdefault(product_id, rating)

    hybrid_scores = []
    for item_id in item_ids:
        # Collaborative-filtering estimate for this (user, item) pair
        predicted_rating = svd_model.predict(user_id, item_id).est
        # Items without a content-based prediction contribute 0 for that part
        content_rating = content_by_item.get(item_id, 0)
        hybrid_scores.append((item_id, alpha * predicted_rating + (1 - alpha) * content_rating))

    # Stable sort, highest score first
    hybrid_scores.sort(key=lambda scored: scored[1], reverse=True)

    return [item_id for item_id, _ in hybrid_scores[:top_n]]

In [15]:
# Content-based predictions written earlier to predicted_ratings.csv
predictions = pd.read_csv("predicted_ratings.csv")

# Example usage: recommend the top_n items for one user, weighting the SVD
# estimate and the content-based prediction equally (alpha=0.5)
user_id = 'A3BCI5FNQCEJM6'
top_n = 10
recommended_items = hybrid_recommendations(user_id, svd_model=svd, item_ids=item_ids, item_similarities=predictions, top_n=top_n, alpha=0.5)
# The label is derived from top_n so it cannot disagree with the number requested
print(f"Top {top_n} recommended items for user", user_id, ":", recommended_items)

Top 5 recommended items for user A3BCI5FNQCEJM6 : ['B0030O3VRW', 'B008TBTA6C', 'B00GS83884', 'B0083QNBCM', 'B005OSQGN8', 'B002TPQPEE', 'B000F35R00', 'B00067YSLO', 'B000Q7XDR4', 'B009GYVMAS']


In [16]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, precision_recall_fscore_support


In [17]:
def hybrid_recommendations_Test(user_id, svd_model, item_ids, item_similarities, alpha):
    """Compute the hybrid score of every item in ``item_ids`` for one user.

    The hybrid score of an item is
    ``alpha * svd_estimate + (1 - alpha) * content_rating``.

    Parameters
    ----------
    user_id
        Raw id of the user being scored.
    svd_model
        Object whose ``predict(user_id, item_id)`` returns something with an
        ``.est`` attribute (the collaborative-filtering estimate).
    item_ids
        Iterable of item ids to score.
    item_similarities
        DataFrame with ``UserId``, ``ProductId`` and ``PredictedRating``
        columns holding the content-based predictions.
    alpha
        Weight of the collaborative estimate; ``1 - alpha`` weights the
        content-based rating.

    Returns
    -------
    dict
        Mapping ``item_id -> hybrid score``.
    """
    # Select this user's content-based predictions once, instead of scanning
    # the whole DataFrame again for every item.
    is_user = item_similarities['UserId'] == user_id
    user_rows = item_similarities.loc[is_user, ['ProductId', 'PredictedRating']]

    # setdefault keeps the FIRST prediction when a product appears more than once
    content_by_item = {}
    for product_id, rating in zip(user_rows['ProductId'], user_rows['PredictedRating']):
        content_by_item.setdefault(product_id, rating)

    hybrid_scores = {}
    for item_id in item_ids:
        predicted_rating = svd_model.predict(user_id, item_id).est
        # Items without a content-based prediction contribute 0 for that part
        content_rating = content_by_item.get(item_id, 0)
        hybrid_scores[item_id] = alpha * predicted_rating + (1 - alpha) * content_rating

    return hybrid_scores




In [38]:
# Define the evaluate_model function
def evaluate_model(svd_model, testset, item_similarities, alpha, threshold=3.5):
    """Score the hybrid predictor on a test set.

    For every ``(user_id, item_id, actual_rating)`` triple the hybrid rating is
    ``alpha * svd_estimate + (1 - alpha) * content_rating`` (the same formula
    as ``hybrid_recommendations_Test``), where the content rating is the first
    matching ``PredictedRating`` in ``item_similarities`` or 0 when the pair
    has no row there.

    Parameters
    ----------
    svd_model
        Object whose ``predict(user_id, item_id)`` returns something with an
        ``.est`` attribute.
    testset
        Iterable of ``(user_id, item_id, actual_rating)`` triples.
    item_similarities
        DataFrame with ``UserId``, ``ProductId`` and ``PredictedRating`` columns.
    alpha
        Weight of the SVD estimate; ``1 - alpha`` weights the content rating.
    threshold
        Ratings ``>= threshold`` count as positive when ratings are binarised.

    Returns
    -------
    tuple
        ``(rmse, mse, mae, precision, recall, f1, hit_rate)``. ``hit_rate`` is
        the fraction of test rows whose binarised prediction equals the
        binarised actual rating.

    Raises
    ------
    ValueError
        If ``testset`` yields no rows.
    """
    # Build the (user, item) -> content rating lookup once; filtering the
    # DataFrame again for every test row makes evaluation quadratic.
    content_lookup = {}
    for uid, iid, rating in zip(
        item_similarities['UserId'],
        item_similarities['ProductId'],
        item_similarities['PredictedRating'],
    ):
        # setdefault keeps the first prediction for a duplicated pair
        content_lookup.setdefault((uid, iid), rating)

    actual_ratings = []
    predicted_ratings = []
    actual_hits = []
    predicted_hits = []

    for user_id, item_id, actual_rating in testset:
        svd_estimate = svd_model.predict(user_id, item_id).est
        content_rating = content_lookup.get((user_id, item_id), 0)
        predicted_rating = alpha * svd_estimate + (1 - alpha) * content_rating

        actual_ratings.append(actual_rating)
        predicted_ratings.append(predicted_rating)

        # Convert ratings to binary hits
        actual_hits.append(1 if actual_rating >= threshold else 0)
        predicted_hits.append(1 if predicted_rating >= threshold else 0)

    if not actual_ratings:
        raise ValueError("testset is empty: nothing to evaluate")

    mse = mean_squared_error(actual_ratings, predicted_ratings)
    rmse = np.sqrt(mse)  # derived from mse rather than recomputing the error
    mae = mean_absolute_error(actual_ratings, predicted_ratings)

    precision, recall, f1, _ = precision_recall_fscore_support(actual_hits, predicted_hits, average='binary')
    hit_rate = np.mean(np.array(actual_hits) == np.array(predicted_hits))

    return rmse, mse, mae, precision, recall, f1, hit_rate

In [46]:
# Test set built from the training interactions.
# NOTE(review): testset3 is not referenced by the evaluation call below - confirm it is still needed.
testset3 = trainset.build_testset()

# Weight of the SVD estimate in the hybrid score; with 0 the score comes
# entirely from the content-based predictions
alpha = 0

# Evaluate on the held-out split, treating ratings >= 4 as positive
rmse, mse, mae, precision, recall, f1, hit_rate = evaluate_model(
    svd_model=svd,
    testset=testset,
    item_similarities=predictions,
    alpha=alpha,
    threshold=4,
)
print(f'MSE: {mse}, RMSE: {rmse}, MAE: {mae}')
print(f'Precision: {precision}, Recall: {recall}, F1 Score: {f1}, Hit Rate: {hit_rate}')

MSE: 1.8037882601251443, RMSE: 1.3430518456579197, MAE: 1.0477103468004454
Precision: 0.7853977632411901, Recall: 0.8105400696864111, F1 Score: 0.7977708712892508, Hit Rate: 0.6855
