In [17]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler


data = pd.read_csv('/kaggle/input/movies-data/mymoviedb (2).csv', sep=';', on_bad_lines='skip', encoding='latin-1')

# hot-encoding genre
data['Genre'] = data['Genre'].fillna('').astype(str)
data['Genre'] = data['Genre'].apply(lambda x: x.split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['Genre'])
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

for column in ['Popularity', 'Vote_Count', 'Vote_Average']:
    data[column] = pd.to_numeric(data[column], errors='coerce')

data[['Popularity', 'Vote_Count', 'Vote_Average']] = data[['Popularity', 'Vote_Count', 'Vote_Average']].fillna(0)

scaler = MinMaxScaler()
data[['Popularity', 'Vote_Count', 'Vote_Average']] = scaler.fit_transform(data[['Popularity', 'Vote_Count', 'Vote_Average']])

data['Release_Year'] = pd.to_datetime(data['Release_Date'], format='%d.%m.%Y').dt.year

prepared_data = pd.concat([data, genre_encoded_df], axis=1).drop(['Genre', 'Poster_Url', 'Overview', 'Release_Date'], axis=1)

prepared_data

Unnamed: 0,Title,Popularity,Vote_Count,Vote_Average,Original_Language,Release_Year,Action,Adventure,Animation,Comedy,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Brat 2,0.295046,0.010426,0.80,ru,2000,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Major Grom: Plague Doctor,0.334385,0.014673,0.78,ru,2021,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,Balkan Line,0.285211,0.010040,0.75,ru,2019,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,T-34,0.314716,0.012356,0.73,ru,2018,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Adventurers,0.245872,0.006886,0.67,ru,2014,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,Unlawful Entry,0.002627,0.006403,0.61,en,1992,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
9655,The Little Prince,0.002627,0.002864,0.66,en,1974,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
9656,Badlands,0.002627,0.028832,0.76,en,1973,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9657,Violent Delights,0.002627,0.000257,0.35,es,2020,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [19]:
import numpy as np

features = pd.concat([
    pd.DataFrame(data[['Popularity', 'Vote_Count', 'Vote_Average']]),
    genre_encoded_df
], axis=1)

features

Unnamed: 0,Popularity,Vote_Count,Vote_Average,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,0.295046,0.010426,0.80,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,0.334385,0.014673,0.78,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0.285211,0.010040,0.75,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0.314716,0.012356,0.73,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,0.245872,0.006886,0.67,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9654,0.002627,0.006403,0.61,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,1,0,0
9655,0.002627,0.002864,0.66,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
9656,0.002627,0.028832,0.76,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9657,0.002627,0.000257,0.35,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [26]:
import random

user_ratings = pd.Series(0, index=prepared_data.index)

specific_ratings = {0: 5, 3: 3, 27: 4, 42: 4, 1: 3, 26: 4, 7: 2, 32: 3, 14: 3, 25: 1} 

user_ratings.loc[prepared_data['Animation']==1] = random.randint(4, 6)
user_ratings.loc[prepared_data['Comedy']==1] = random.randint(4, 6)
user_ratings.loc[prepared_data['Crime']==1] = random.randint(4, 6)

for index, rating in specific_ratings.items():
    user_ratings[index] = rating


rated_movies = user_ratings[user_ratings > 0]

rated_movies_with_details = data.loc[rated_movies.index]
rated_movies_with_ratings = rated_movies_with_details.copy()
rated_movies_with_ratings['User Rating'] = rated_movies


rated_movies_with_ratings

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url,Release_Year,User Rating
0,11.05.2000,Brat 2,Danila Bagrov returns to America to help his f...,0.295046,0.010426,0.80,ru,"[Action, Crime, Drama]",https://upload.wikimedia.org/wikipedia/en/2/2e...,2000,5
1,01.04.2021,Major Grom: Plague Doctor,"In a city plagued by crime and corruption, Maj...",0.334385,0.014673,0.78,ru,"[Action, Thriller, Crime]",https://image.tmdb.org/t/p/original/8FxFUsT6dY...,2021,3
3,01.01.2018,T-34,A young tank commander leads a daring escape f...,0.314716,0.012356,0.73,ru,"[Action, War, Drama]",https://image.tmdb.org/t/p/original/6oG4xMUYxv...,2018,3
5,26.12.2019,Serf,A rich playboy is sent to live as a serf to te...,0.354055,0.015735,0.82,ru,[Comedy],https://image.tmdb.org/t/p/original/7Ii5aW3ZdQ...,2019,4
6,04.03.2010,What Men Talk About,"A group of friends embarks on a road trip, sha...",0.275376,0.009653,0.76,ru,[Comedy],https://image.tmdb.org/t/p/original/2a1mGVfi4A...,2010,4
...,...,...,...,...,...,...,...,...,...,...,...
9647,19.04.2015,Ashby,When new kid in town Ed Wallis is given an ass...,0.002629,0.012260,0.62,en,"[Drama, Romance, Comedy]",https://image.tmdb.org/t/p/original/hQDEvfqoYD...,2015,4
9651,18.05.2005,I'm in Love With My Little Sister,Yori and his twin sister Iku used to be very c...,0.002628,0.000290,0.57,ja,"[Romance, Animation, Drama]",https://image.tmdb.org/t/p/original/fRdxgR85XX...,2005,6
9653,13.05.2008,Amateur Porn Star Killer 2,Shane Ryan's sequel to the disturbing Amateur ...,0.002628,0.000450,0.54,en,"[Crime, Horror]",https://image.tmdb.org/t/p/original/iBmngXVhDu...,2008,6
9654,26.06.1992,Unlawful Entry,"After a break-in at their house, a couple gets...",0.002627,0.006403,0.61,en,"[Crime, Thriller, Mystery]",https://image.tmdb.org/t/p/original/sKUk1ca6gH...,1992,6


In [50]:
from sklearn.neighbors import NearestNeighbors
import numpy as np


def recommend_movies_knn(features, user_preferences, data, top_n=10):
    """
    Recommend movies based on user preferences using k-Nearest Neighbors.

    :param features: Feature matrix (e.g., numerical and one-hot encoded data).
    :param user_preferences: Array representing user preferences for features.
    :param data: Original metadata for movies (e.g., title, genre).
    :param top_n: Number of recommendations to return.
    :return: DataFrame of top_n recommended movies.
    """
    knn = NearestNeighbors(n_neighbors=top_n, metric='cosine')
    knn.fit(features)

    distances, indices = knn.kneighbors(user_preferences)

    recommended_movies = data.iloc[indices[0]]

    recommended_movies = recommended_movies.copy()
    recommended_movies['Similarity'] = 1 - distances[0]

    return recommended_movies.sort_values(by='Similarity', ascending=False)




rated_features = features.loc[rated_movies.index]
user_preferences = np.dot(rated_features.T, rated_movies)
user_preferences = user_preferences / np.linalg.norm(user_preferences)
user_preferences = pd.DataFrame([user_preferences], columns=features.columns)



top_recommendations = recommend_movies_knn(features, user_preferences, prepared_data, top_n=5000)


print("Top 5000 Movie Recommendations:")
top_recommendations[['Title', 'Similarity']]

Top 5000 Movie Recommendations:


Unnamed: 0,Title,Similarity
9316,One Piece Episode of Merry: The Tale of One Mo...,0.803412
5012,PokÃ©mon the Movie: Diancie and the Cocoon of ...,0.796549
7358,Scooby-Doo! and the Gourmet Ghost,0.794863
7094,Miracles,0.788379
207,Pretty Guardian Sailor Moon Eternal The Movie ...,0.784732
...,...,...
8993,Shimmer Lake,0.534567
8649,Ae Fond Kiss...,0.534555
7186,Green Zone,0.534553
8777,"My Brother, My Sister",0.534551


In [51]:
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import cosine_similarity


def recommend_movies_reranked(features, user_preferences, data, top_n=10, alpha=0.5):
    """
    Recommend movies using k-NN and re-rank based on diversity.

    :param features: Feature matrix (e.g., numerical and one-hot encoded data).
    :param user_preferences: Array representing user preferences for features.
    :param data: Original metadata for movies (e.g., title, popularity).
    :param top_n: Number of recommendations to return.
    :param alpha: Weight for similarity vs diversity.
    :return: DataFrame of top_n recommended movies.
    """

    knn = NearestNeighbors(n_neighbors=top_n * 2, metric='cosine')
    knn.fit(features)
    distances, indices = knn.kneighbors(user_preferences)

    recommended_movies = data.iloc[indices[0]].copy()
    recommended_movies['Similarity'] = 1 - distances[0]

    # Calculate Diversity Score (inverse of Popularity, normalized)
    recommended_movies['Diversity'] = 1 - (recommended_movies['Popularity'] / recommended_movies['Popularity'].max())
    
    genre_columns = [col for col in features.columns if col in data.columns and data[col].max() == 1]
    
    # add Genre Diversity
    genre_features = features[genre_columns]
    genre_similarity = cosine_similarity(
        genre_features.iloc[indices[0]], genre_features.iloc[indices[0]]
    ).mean(axis=1)

    recommended_movies['Genre_Diversity'] = 1 - genre_similarity
    
    recommended_movies['Diversity'] = (
        0.5 * (1 - (recommended_movies['Popularity'] / recommended_movies['Popularity'].max())) +
        0.5 * recommended_movies['Genre_Diversity']
    )
    
    recommended_movies['Final_Score'] = (
        alpha * recommended_movies['Similarity'] +
        (1 - alpha) * recommended_movies['Diversity']
    )

    recommended_movies = recommended_movies.sort_values(by='Final_Score', ascending=False)

    return recommended_movies.head(top_n)

recommendations = recommend_movies_reranked(features, user_preferences, prepared_data, top_n=10)

print("Top 10 Movie Recommendations:")
recommendations[['Title', 'Similarity', 'Diversity', 'Final_Score']]

Top 10 Movie Recommendations:


Unnamed: 0,Title,Similarity,Diversity,Final_Score
7358,Scooby-Doo! and the Gourmet Ghost,0.794863,0.636826,0.715844
3978,Lupin the Third: The Castle of Cagliostro,0.784524,0.616595,0.70056
5697,The Sting,0.774271,0.62173,0.698
5575,48 Hrs.,0.782612,0.612735,0.697674
9316,One Piece Episode of Merry: The Tale of One Mo...,0.803412,0.587947,0.69568
8805,Haikyuu!! Movie 4: Battle of Concepts,0.775125,0.615317,0.695221
8714,Ernest & Celestine,0.779246,0.606704,0.692975
7094,Miracles,0.788379,0.593469,0.690924
4931,First Strike,0.784297,0.588349,0.686323
5679,Beyond the Boundary: I'll Be Here â Future,0.77658,0.588758,0.682669


In [56]:
recommended_indices = top_recommendations.index

rated_indices = rated_movies.index

relevant_recommendations = set(recommended_indices).intersection(set(rated_indices))

precision_at_k = len(relevant_recommendations) / len(recommended_indices)

recall_at_k = len(relevant_recommendations) / len(rated_indices)

print(f"Precision@k: {precision_at_k}")
print(f"Recall@k: {recall_at_k}")

Precision@k: 0.8642
Recall@k: 0.9049214659685864


In [57]:
f1_score = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k)
print(f"F1-Score: {f1_score}")

F1-Score: 0.8840920716112532
