In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the two input tables from the local data/ directory.
#   df1: per-title records (later cells use its 'mal_id', 'synopsis',
#        'genres' and 'themes' columns)
#   df2: recommendation rows (later cells use 'mal_id', 'mal_id_recomm', 'votes')
df1 = pd.read_json(path_or_buf="data/data.json")
df2 = pd.read_csv(filepath_or_buffer='data/user_recommendation.csv')

In [3]:
# Replace missing synopses with the empty string so every row carries a string
df1 = df1.assign(synopsis=df1['synopsis'].fillna(''))

In [4]:
# Remove duplicate rows based on mal_id, keeping the first occurrence.
# drop_duplicates keeps the surviving rows' old labels, which leaves gaps in the
# index. The matrices derived from df1 further down are addressed by row
# position, so the index is rebuilt here to make every label equal its position.
df1 = df1.drop_duplicates('mal_id', keep='first').reset_index(drop=True)

In [5]:
# Define a function to merge arrays
def merge_arrays(row):
    """Concatenate a row's genres and themes into one list.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by the keys 'genres' and 'themes'
        (for example a DataFrame row).

    Returns
    -------
    list
        The items of row['genres'] followed by the items of row['themes'].
        A field that is None or a float NaN contributes nothing instead of
        raising TypeError on the concatenation.
    """
    def _items(value):
        # A missing field may surface as None or NaN rather than an empty list.
        if value is None or (isinstance(value, float) and value != value):
            return []
        return list(value)

    return _items(row['genres']) + _items(row['themes'])

# Row-wise application: each row's genres and themes become one combined value
df1 = df1.assign(merged_genres_themes=df1.apply(merge_arrays, axis='columns'))

In [6]:
# Text preprocessing and feature extraction for synopsis
from sklearn.feature_extraction.text import TfidfVectorizer
# Vectorizer configured with scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words='english')
# Fit on the synopsis column and encode it in one step; one output row per df1 row.
# The result is converted with .toarray() before being stacked with other features.
synopsis_vectors = tfidf.fit_transform(df1['synopsis'])

In [7]:
from sklearn.preprocessing import MultiLabelBinarizer
# Encoder that turns each row's collection of labels into 0/1 indicator columns
mlb = MultiLabelBinarizer()

In [8]:
# Fit the binarizer on the merged genre/theme labels and encode every row;
# the learned label order is available afterwards as mlb.classes_
mgt_encoded = mlb.fit_transform(df1['merged_genres_themes'])

# Convert the encoded genres into a DataFrame (optional): same values as
# mgt_encoded with one named column per label. It is only a labelled view for
# inspection; the cells visible here do not reference it again.
encoded_mgt_df = pd.DataFrame(mgt_encoded, columns=mlb.classes_)

In [9]:
# One dense feature row per title: the synopsis TF-IDF columns first, then the
# genre/theme indicator columns appended to their right
combined_features = np.concatenate(
    [synopsis_vectors.toarray(), mgt_encoded],
    axis=1,
)

In [10]:
from sklearn.metrics.pairwise import linear_kernel
# Pairwise dot products between every pair of rows (the second argument
# defaults to the first when omitted).
# NOTE(review): these are raw dot products; they equal cosine similarity only if
# each row of combined_features has unit norm — confirm that is what is intended.
cosine_sim = linear_kernel(combined_features)

In [11]:
# Map each mal_id to the row *position* of that title in df1.
# The similarity matrix is addressed by position, so the stored value must not
# depend on whatever labels df1.index carries (e.g. gaps left behind after
# duplicate rows were dropped); positions are therefore generated explicitly.
indices = pd.Series(np.arange(len(df1)), index=df1['mal_id'])

In [12]:
def adjust_similarity_with_votes(sim_scores, mal_id, df2):
    """Add recommendation votes to similarity scores and rank the result.

    Parameters
    ----------
    sim_scores : iterable of (id, score) pairs
        Base similarity score of each candidate title.
    mal_id
        Id of the title whose recommendation rows are looked up in ``df2``.
    df2 : pandas.DataFrame
        Must provide the columns 'mal_id', 'mal_id_recomm' and 'votes'.

    Returns
    -------
    list of (id, score) tuples
        Every id from ``sim_scores`` with its score increased by the votes of
        each matching recommendation row, sorted by score in descending order.
        Recommended ids that are absent from ``sim_scores`` are ignored, and
        rows whose vote is missing are skipped.
    """
    adjusted_scores = dict(sim_scores)

    # Only the recommendation rows that belong to the requested title matter.
    recommendations = df2.loc[df2['mal_id'] == mal_id, ['mal_id_recomm', 'votes']]
    # A NaN vote would turn the score into NaN, and NaN has no defined sort
    # position, so such rows are dropped instead of being added.
    recommendations = recommendations.dropna(subset=['votes'])

    for rec_id, vote in zip(recommendations['mal_id_recomm'], recommendations['votes']):
        if rec_id in adjusted_scores:
            adjusted_scores[rec_id] += vote

    # sorted() is stable, so equal scores keep their original relative order.
    return sorted(adjusted_scores.items(), key=lambda pair: pair[1], reverse=True)

In [13]:
def get_similarity(mal_id, cosine_sim=cosine_sim):
    """Return up to ten titles ranked highest against ``mal_id``.

    Parameters
    ----------
    mal_id
        Id of the query title; looked up in the module-level ``indices`` map.
    cosine_sim
        Pairwise score matrix; defaults to the module-level ``cosine_sim``.

    Returns
    -------
    dict or str
        A dict mapping mal_id to vote-adjusted score for at most ten titles
        other than the query, in descending score order. When ``mal_id`` is
        not present in ``indices`` a message string is returned instead.
    """
    # Unknown ids are reported with a message string, not an exception
    if mal_id not in indices:
        return f"mal_id {mal_id} not found in the dataset"

    # Scores of every title against the query, flattened to one dimension
    row_scores = cosine_sim[indices[mal_id]].flatten()

    # Pair each score with the mal_id found at the same row position in df1
    id_column = df1['mal_id']
    scored_pairs = [
        (id_column.iloc[position], value)
        for position, value in enumerate(row_scores)
    ]

    # Fold recommendation votes into the scores; result comes back ranked
    ranked_pairs = adjust_similarity_with_votes(scored_pairs, mal_id, df2)

    # Leave out the query title itself, then keep the first ten entries
    other_titles = [pair for pair in ranked_pairs if pair[0] != mal_id]
    return dict(other_titles[:10])

In [14]:
# Example query: look up the entries ranked closest to mal_id 21 and print
# whatever get_similarity returns (a dict of mal_id -> adjusted score, or a
# message string when the id is unknown)
result = get_similarity(21)
print(result)

{6702: 8586.044254895529, 11061: 5960.013349429273, 20: 4647.0070727804, 918: 2424.0089472478685, 223: 2021.0333374906827, 813: 1617.0142036656073, 34572: 1617.0068360302373, 1735: 1516.0098524178134, 269: 1213.0230207927223, 136: 1213.0026008878885}


In [17]:
import pickle
# Persist each fitted object and intermediate matrix to its own pickle file.
# The list order is the order in which the files are written.
artifacts = [
    ('data/tfidf_vectorizer.pkl', tfidf),
    ('data/synopsis_vectors.pkl', synopsis_vectors),
    ('data/mlb.pkl', mlb),
    ('data/mgt_encoded.pkl', mgt_encoded),
    ('data/combined_features.pkl', combined_features),
    ('data/cosine_sim.pkl', cosine_sim),
    ('data/indices.pkl', indices),
]
for target_path, payload in artifacts:
    with open(target_path, 'wb') as f:
        pickle.dump(payload, f)

# Save the dataframes as well
for frame_path, frame in (('data/df1.pkl', df1), ('data/df2.pkl', df2)):
    frame.to_pickle(frame_path)