### Data Engineering

In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, MinMaxScaler, StandardScaler


data = pd.read_csv('mymoviedb.csv')

# hot-encoding genre
data['Genre'] = data['Genre'].fillna('').astype(str)
data['Genre'] = data['Genre'].apply(lambda x: x.split(', '))
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(data['Genre'])
genre_encoded_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

for column in ['Popularity', 'Vote_Count', 'Vote_Average']:
    data[column] = pd.to_numeric(data[column], errors='coerce')

data[['Popularity', 'Vote_Count', 'Vote_Average']] = data[['Popularity', 'Vote_Count', 'Vote_Average']].fillna(0)

scaler = StandardScaler()
data[['Popularity', 'Vote_Count', 'Vote_Average']] = scaler.fit_transform(data[['Popularity', 'Vote_Count', 'Vote_Average']])

data['Release_Year'] = pd.to_datetime(data['Release_Date']).dt.year

prepared_data = pd.concat([data, genre_encoded_df], axis=1).drop(['Genre', 'Poster_Url', 'Overview', 'Release_Date'], axis=1)

prepared_data




Unnamed: 0,Title,Popularity,Vote_Count,Vote_Average,Original_Language,Poster_URL,Release_Year,Action,Adventure,Animation,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,Brat 2,11.132005,-0.408767,1.379082,ru,Failed to retrieve data.,2000,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Major Grom: Plague Doctor,12.661168,-0.358159,1.202041,ru,https://m.media-amazon.com/images/M/MV5BOTI4Zm...,2021,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,Balkan Line,10.749715,-0.413368,0.936480,ru,Failed to retrieve data.,2019,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,T-34,11.896587,-0.385763,0.759440,ru,Failed to retrieve data.,2018,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,Adventurers,9.220552,-0.450941,0.228318,ru,https://m.media-amazon.com/images/M/MV5BMmIyMz...,2014,1,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9847,Badlands,-0.234590,-0.189466,1.025001,en,Failed to retrieve data.,1973,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9848,Violent Delights,-0.234597,-0.529920,-2.604332,es,Failed to retrieve data.,2020,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9849,The Offering,-0.234605,-0.496948,-1.276527,en,Failed to retrieve data.,2016,0,0,0,...,0,1,0,1,0,0,0,1,0,0
9850,The United States vs. Billie Holiday,-0.234612,-0.474711,0.228318,en,Failed to retrieve data.,2021,0,0,0,...,1,0,1,0,0,0,0,0,0,0


In [3]:
most_popular_movie = data.sort_values('Popularity', ascending=False).head(5)
most_popular_movie

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url,Poster_URL,Release_Year
26,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,38.534249,2.89456,1.644643,en,"[Action, Adventure, Science Fiction]",https://image.tmdb.org/t/p/original/1g0dhYtq4i...,https://m.media-amazon.com/images/M/MV5BMmFiZG...,2021
27,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",28.928844,-0.0917,1.467602,en,"[Crime, Mystery, Thriller]",https://image.tmdb.org/t/p/original/74xTEgt7R3...,https://m.media-amazon.com/images/M/MV5BMmU5NG...,2022
28,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,19.68069,-0.486213,-0.125763,en,[Thriller],https://image.tmdb.org/t/p/original/vDHsLnOWKl...,https://m.media-amazon.com/images/M/MV5BNGZiMz...,2022
29,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",18.030066,1.413123,1.113521,en,"[Animation, Comedy, Family, Fantasy]",https://image.tmdb.org/t/p/original/4j0PNHkMr5...,Failed to retrieve data.,2021
30,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,14.156009,0.154439,0.493879,en,"[Action, Adventure, Thriller, War]",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...,Failed to retrieve data.,2021


### Create the features matrix

In [9]:
import numpy as np
from sklearn.decomposition import PCA
from scipy.sparse import csr_matrix



features = pd.concat([
    pd.DataFrame(data[['Popularity', 'Vote_Count', 'Vote_Average']]),
    genre_encoded_df
], axis=1)

features 

# pca = PCA(n_components=2)
# features = pca.fit_transform(features)
sparse_features = csr_matrix(features)

features


Unnamed: 0,Popularity,Vote_Count,Vote_Average,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,...,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
0,11.132005,-0.408767,1.379082,1,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
1,12.661168,-0.358159,1.202041,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,10.749715,-0.413368,0.936480,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,11.896587,-0.385763,0.759440,1,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,9.220552,-0.450941,0.228318,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9847,-0.234590,-0.189466,1.025001,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
9848,-0.234597,-0.529920,-2.604332,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
9849,-0.234605,-0.496948,-1.276527,0,0,0,0,0,0,0,...,0,1,0,1,0,0,0,1,0,0
9850,-0.234612,-0.474711,0.228318,0,0,0,0,0,0,1,...,1,0,1,0,0,0,0,0,0,0


### Создаем рандомного пользователя который выдает рейтинг 12 фильмам

In [5]:
from sklearn.metrics.pairwise import cosine_similarity

rated_indices = np.random.choice(data.index, size=10, replace=False) 
user_ratings = pd.Series(0, index=data.index) 
user_ratings[rated_indices] = np.random.choice([1,2, 3, 4, 5], size=10) 

rated_movies = user_ratings[user_ratings > 0]

rated_movies_with_details = data.loc[rated_movies.index]

rated_movies_with_ratings = rated_movies_with_details.copy()
rated_movies_with_ratings['User Rating'] = rated_movies

print(rated_movies_with_ratings)

     Release_Date                      Title  \
714    2020-02-28      All the Bright Places   
1205   2020-12-10                   Songbird   
1628   2021-08-18               Drive My Car   
2628   2016-04-23      I've Always Liked You   
3495   1995-09-14                    Hackers   
3586   2018-12-27                       T-34   
4005   2005-01-28            Innocent Voices   
4990   2002-11-26  Shark Attack 3: Megalodon   
7955   2009-09-04            Valhalla Rising   
8338   2016-07-31             Edge of Winter   

                                               Overview  Popularity  \
714   Two teens facing personal struggles form a pow...    0.310771   
1205  During a pandemic lockdown, Nico, a young man ...    0.112247   
1628  Yusuke Kafuku, a stage actor and director, sti...    0.029841   
2628  Love is blooming at Sakuragaoka High School. N...   -0.079624   
3495  Along with his new friends, a teenager who was...   -0.129276   
3586  In 1944, a courageous group of Russian 

### Используем cosine_similarity для настройки весов

In [6]:
similarity_matrix = cosine_similarity(features)


def recommend_movies(user_ratings, data, similarity_matrix, top_n=5):
    weighted_scores = similarity_matrix.T.dot(user_ratings)
    recommendations = pd.DataFrame({
        'Title': data['Title'],
        'Score': weighted_scores
    }).sort_values(by='Score', ascending=False).head(top_n)
    return recommendations

recommendations = recommend_movies(user_ratings, data, similarity_matrix)

print("Movies Rated by User:")
print(data.loc[rated_indices, ['Title', 'Genre']])
print("\nRecommendations:")
print(recommendations)

Movies Rated by User:
                          Title                                Genre
714       All the Bright Places                     [Romance, Drama]
2628      I've Always Liked You          [Animation, Drama, Romance]
1628               Drive My Car                              [Drama]
3495                    Hackers     [Action, Crime, Thriller, Drama]
4005            Innocent Voices                         [Drama, War]
8338             Edge of Winter                    [Drama, Thriller]
4990  Shark Attack 3: Megalodon                             [Horror]
1205                   Songbird  [Thriller, Romance, Drama, Mystery]
7955            Valhalla Rising  [Adventure, Drama, Action, Fantasy]
3586                       T-34        [War, Action, Drama, History]

Recommendations:
                    Title      Score
6980                Speak  12.511850
6482        The Good Liar  12.511848
8971  The Yin Yang Master  12.511847
7432            Overboard  12.511841
8890        36th

### Basic KNN

In [7]:
from sklearn.neighbors import NearestNeighbors
import numpy as np
from scipy.sparse import csr_matrix


def recommend_movies_knn(features, user_preferences, data, top_n=10):
    """
    Recommend movies based on user preferences using k-Nearest Neighbors.

    :param features: Feature matrix (e.g., numerical and one-hot encoded data).
    :param user_preferences: Array representing user preferences for features.
    :param data: Original metadata for movies (e.g., title, genre).
    :param top_n: Number of recommendations to return.
    :return: DataFrame of top_n recommended movies.
    """
    knn = NearestNeighbors(n_neighbors=top_n, metric='cosine')

    knn.fit(sparse_features)

    distances, indices = knn.kneighbors(user_preferences)
    print(f"distance: {distances}")

    recommended_movies = data.iloc[indices[0]]

    recommended_movies = recommended_movies.copy()
    recommended_movies['Similarity'] = 1 - distances[0]
    

    return recommended_movies.sort_values(by='Similarity', ascending=False)


rated_features = features[rated_indices] 
user_preferences = np.dot(rated_features.T, user_ratings[rated_indices])
user_preferences = user_preferences / np.linalg.norm(user_preferences)
user_preferences = pca.transform(user_preferences.reshape(1, -1))  z``

# Reshape user_preferences to 2D array
user_preferences = user_preferences.reshape(1, -1)


top_recommendations = recommend_movies_knn(features, user_preferences, data, top_n=100)


print("Top 10 Movie Recommendations:")
print(top_recommendations[['Title', 'Similarity']])



SyntaxError: invalid syntax (1515807158.py, line 35)

In [9]:
from sklearn.metrics import mean_squared_error
import numpy as np

true_ratings = user_ratings[rated_indices] 

predicted_ratings = top_recommendations['Similarity'][:len(true_ratings)].values

rmse = np.sqrt(mean_squared_error(true_ratings, predicted_ratings))
print(f"RMSE: {rmse}")


RMSE: 2.5533176755793283


KeyError: 'UserID'

### KNN with addition of reranking

In [44]:
from sklearn.neighbors import NearestNeighbors


def recommend_movies_reranked(features, user_preferences, data, top_n=10, alpha=0.5):
    """
    Recommend movies using k-NN and re-rank based on diversity.

    :param features: Feature matrix (e.g., numerical and one-hot encoded data).
    :param user_preferences: Array representing user preferences for features.
    :param data: Original metadata for movies (e.g., title, popularity).
    :param top_n: Number of recommendations to return.
    :param alpha: Weight for similarity vs diversity.
    :return: DataFrame of top_n recommended movies.
    """

    knn = NearestNeighbors(n_neighbors=top_n * 2, metric='cosine')
    knn.fit(features)
    distances, indices = knn.kneighbors(user_preferences)

    recommended_movies = data.iloc[indices[0]].copy()
    recommended_movies['Similarity'] = 1 - distances[0]

    # Calculate Diversity Score (inverse of Popularity, normalized)
    recommended_movies['Diversity'] = 1 - (recommended_movies['Popularity'] / recommended_movies['Popularity'].max())
    
    genre_columns = [col for col in features.columns if col in data.columns and data[col].max() == 1]
    
    # add Genre Diversity
    genre_features = features[genre_columns]
    genre_similarity = cosine_similarity(
        genre_features.iloc[indices[0]], genre_features.iloc[indices[0]]
    ).mean(axis=1)

    recommended_movies['Genre_Diversity'] = 1 - genre_similarity
    
    recommended_movies['Diversity'] = (
        0.5 * (1 - (recommended_movies['Popularity'] / recommended_movies['Popularity'].max())) +
        0.5 * recommended_movies['Genre_Diversity']
    )
    
    recommended_movies['Final_Score'] = (
        alpha * recommended_movies['Similarity'] +
        (1 - alpha) * recommended_movies['Diversity']
    )

    recommended_movies = recommended_movies.sort_values(by='Final_Score', ascending=False)

    return recommended_movies.head(top_n)

recommendations = recommend_movies_reranked(features, user_preferences, data, top_n=100)

print("Top 10 Movie Recommendations:")
print(recommendations[['Title', 'Similarity', 'Diversity', 'Final_Score']])



ValueError: Found array with 0 feature(s) (shape=(200, 0)) while a minimum of 1 is required by check_pairwise_arrays.

### Some visualisation

In [8]:
import matplotlib.pyplot as plt


### Metrics


### Сохраняем параметры

In [10]:
import joblib

joblib.dump(features, 'features.pkl')

['features.pkl']