# Загрузка данных

In [23]:
!git clone https://github.com/divensambhwani/MovieLens-100K_Recommender-System.git  > /dev/null 2>&1
%cd MovieLens-100K_Recommender-System

/content/MovieLens-100K_Recommender-System/MovieLens-100K_Recommender-System


# Добавление нужных библиотек

In [24]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Работа с данными

In [25]:
tags_df = pd.read_csv("/content/MovieLens-100K_Recommender-System/data/tags.csv")

In [26]:
tags_df

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [27]:
tags_df = tags_df[["movieId", "tag"]]

In [28]:
len(tags_df["movieId"].unique())

1572

In [29]:
tags_df = tags_df.groupby("movieId")["tag"].apply('|'.join).reset_index()


In [30]:
tags_df["lenOfTag"] = tags_df["tag"].str.len()

In [31]:
movies_df = pd.read_csv("/content/MovieLens-100K_Recommender-System/data/movies.csv")

In [32]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [33]:
movies_df = movies_df.join(tags_df.set_index('movieId'), on='movieId')

In [34]:
movies_df["tag"] = movies_df["tag"].fillna("")

In [35]:
print(movies_df.columns)

Index(['movieId', 'title', 'genres', 'tag', 'lenOfTag'], dtype='object')


In [36]:
movies_df["info"] = movies_df["genres"] + "|" + movies_df["tag"]

In [37]:
movies_df = movies_df[["movieId", "title", "info"]]

In [38]:
movies_df

Unnamed: 0,movieId,title,info
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy|pi...
1,2,Jumanji (1995),Adventure|Children|Fantasy|fantasy|magic board...
2,3,Grumpier Old Men (1995),Comedy|Romance|moldy|old
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance|
4,5,Father of the Bride Part II (1995),Comedy|pregnancy|remake
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy|
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy|
9739,193585,Flint (2017),Drama|
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation|


# Работаем с текстом

In [39]:
movies_df["info"] = movies_df["info"].str.lower()

In [40]:
movies_df

Unnamed: 0,movieId,title,info
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy|pi...
1,2,Jumanji (1995),adventure|children|fantasy|fantasy|magic board...
2,3,Grumpier Old Men (1995),comedy|romance|moldy|old
3,4,Waiting to Exhale (1995),comedy|drama|romance|
4,5,Father of the Bride Part II (1995),comedy|pregnancy|remake
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),action|animation|comedy|fantasy|
9738,193583,No Game No Life: Zero (2017),animation|comedy|fantasy|
9739,193585,Flint (2017),drama|
9740,193587,Bungo Stray Dogs: Dead Apple (2018),action|animation|


In [41]:
stopwords_eng = stopwords.words("english")
print(stopwords_eng)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [42]:
movies_df['info'] = movies_df['info'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords_eng)]))

In [43]:
movies_df

Unnamed: 0,movieId,title,info
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy|pi...
1,2,Jumanji (1995),adventure|children|fantasy|fantasy|magic board...
2,3,Grumpier Old Men (1995),comedy|romance|moldy|old
3,4,Waiting to Exhale (1995),comedy|drama|romance|
4,5,Father of the Bride Part II (1995),comedy|pregnancy|remake
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),action|animation|comedy|fantasy|
9738,193583,No Game No Life: Zero (2017),animation|comedy|fantasy|
9739,193585,Flint (2017),drama|
9740,193587,Bungo Stray Dogs: Dead Apple (2018),action|animation|


# Добавление рейтинга

In [44]:
ratings_df = pd.read_csv("/content/MovieLens-100K_Recommender-System/MovieLens-100K_Recommender-System/data/ratings.csv")

In [45]:
ratings_df = ratings_df[["movieId", "rating"]]

In [46]:
ratings_df

Unnamed: 0,movieId,rating
0,1,4.0
1,3,4.0
2,6,4.0
3,47,5.0
4,50,5.0
...,...,...
100831,166534,4.0
100832,168248,5.0
100833,168250,5.0
100834,168252,5.0


## Делаем средний рейтинг к каждому фильму

In [47]:
ratings_df = ratings_df.groupby("movieId")["rating"].mean()

In [48]:
ratings_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.920930
2,3.431818
3,3.259615
4,2.357143
5,3.071429
...,...
193581,4.000000
193583,3.500000
193585,3.500000
193587,3.500000


In [49]:
ratings_df = ratings_df.reset_index()

In [50]:
ratings_df

Unnamed: 0,movieId,rating
0,1,3.920930
1,2,3.431818
2,3,3.259615
3,4,2.357143
4,5,3.071429
...,...,...
9719,193581,4.000000
9720,193583,3.500000
9721,193585,3.500000
9722,193587,3.500000


## Добавляем рейтинги к датафрейму movies_df

In [51]:
movies_df = movies_df.join(ratings_df.set_index('movieId'), on='movieId')

In [52]:
movies_df

Unnamed: 0,movieId,title,info,rating
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy|pi...,3.920930
1,2,Jumanji (1995),adventure|children|fantasy|fantasy|magic board...,3.431818
2,3,Grumpier Old Men (1995),comedy|romance|moldy|old,3.259615
3,4,Waiting to Exhale (1995),comedy|drama|romance|,2.357143
4,5,Father of the Bride Part II (1995),comedy|pregnancy|remake,3.071429
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),action|animation|comedy|fantasy|,4.000000
9738,193583,No Game No Life: Zero (2017),animation|comedy|fantasy|,3.500000
9739,193585,Flint (2017),drama|,3.500000
9740,193587,Bungo Stray Dogs: Dead Apple (2018),action|animation|,3.500000


## Удаляем фильмы, которые без рейтинга воообще

In [53]:
movies_df[movies_df["rating"].isna()].count()

Unnamed: 0,0
movieId,18
title,18
info,18
rating,0


In [54]:
movies_df["rating"].isna().sum()

18

In [55]:
movies_df = movies_df.dropna()

In [56]:
movies_df

Unnamed: 0,movieId,title,info,rating
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy|pi...,3.920930
1,2,Jumanji (1995),adventure|children|fantasy|fantasy|magic board...,3.431818
2,3,Grumpier Old Men (1995),comedy|romance|moldy|old,3.259615
3,4,Waiting to Exhale (1995),comedy|drama|romance|,2.357143
4,5,Father of the Bride Part II (1995),comedy|pregnancy|remake,3.071429
...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),action|animation|comedy|fantasy|,4.000000
9738,193583,No Game No Life: Zero (2017),animation|comedy|fantasy|,3.500000
9739,193585,Flint (2017),drama|,3.500000
9740,193587,Bungo Stray Dogs: Dead Apple (2018),action|animation|,3.500000


# Применение модели

## Векторизируем info

In [57]:
vectorizer = TfidfVectorizer(tokenizer=lambda x: x.split('|'))
tfidf_matrix = vectorizer.fit_transform(movies_df['info'])



## Смотрим по кисиносному сходству и умножаем на нормализованные рейтинги

In [58]:
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [59]:
scaler = MinMaxScaler()
movies_df['rating_norm'] = scaler.fit_transform(movies_df[['rating']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['rating_norm'] = scaler.fit_transform(movies_df[['rating']])


In [60]:
weighted_sim = cosine_sim * movies_df['rating_norm'].values[:, None]

## Получаем обезличенные рекомнедации

In [61]:
def get_recommendations(movie_id, sim_matrix, movies_df, top_n=10):
    if movie_id not in movies_df['movieId'].values:
        return []

    movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    if movie_idx >= sim_matrix.shape[0]:
        return []

    similarity_scores = list(enumerate(sim_matrix[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_movies = [movies_df.iloc[i[0]]['movieId'] for i in similarity_scores[1:top_n+1]]  # Пропускаем сам фильм
    return top_movies

In [62]:
movie_id = 1
recommendations = get_recommendations(movie_id, weighted_sim, movies_df)
print(f"Recommendations for movieId {movie_id}: {recommendations}")


Recommendations for movieId 1: [2355, 3114, 122918, 2294, 3754, 4016, 4886, 45074, 53121, 65577]


## Смотрим и сравниваем рекомендации в явном виде

In [63]:
def get_recommendations_with_title(movie_id, sim_matrix, movies_df, top_n=10):
    movie_idx = movies_df[movies_df['movieId'] == movie_id].index[0]
    similarity_scores = list(enumerate(sim_matrix[movie_idx]))
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    top_movies = movies_df.iloc[[i[0] for i in similarity_scores[1:top_n+1]]]
    return top_movies[['movieId', 'title']]


In [64]:
movie_id = 1
recommendations = get_recommendations_with_title(movie_id, weighted_sim, movies_df)
print(f"Recommendations for movieId {movie_id}: {recommendations}")


Recommendations for movieId 1:       movieId                                           title
1757     2355                            Bug's Life, A (1998)
2355     3114                              Toy Story 2 (1999)
8695   122918                Guardians of the Galaxy 2 (2017)
1706     2294                                     Antz (1998)
2809     3754  Adventures of Rocky and Bullwinkle, The (2000)
3000     4016                Emperor's New Groove, The (2000)
3568     4886                           Monsters, Inc. (2001)
6194    45074                                Wild, The (2006)
6486    53121                          Shrek the Third (2007)
6948    65577                  Tale of Despereaux, The (2008)


In [65]:
movies_df[movies_df["movieId"] == 1]

Unnamed: 0,movieId,title,info,rating,rating_norm
0,1,Toy Story (1995),adventure|animation|children|comedy|fantasy|pi...,3.92093,0.760207


In [66]:
recommendations_df = pd.DataFrame({
    'movieId': movies_df['movieId'],
    'recommendations': movies_df['movieId'].apply(lambda x: get_recommendations(x, weighted_sim, movies_df))
})

In [67]:
recommendations_df

Unnamed: 0,movieId,recommendations
0,1,"[2355, 3114, 122918, 2294, 3754, 4016, 4886, 4..."
1,2,"[60, 126, 1009, 2043, 2093, 2161, 2162, 2399, ..."
2,3,"[64, 68, 118, 129, 180, 186, 276, 287, 289, 294]"
3,4,"[94, 195, 281, 359, 372, 542, 803, 1175, 1236,..."
4,5,"[34359, 34528, 6788, 7, 2719, 4808, 8366, 6944..."
...,...,...
9737,193581,[]
9738,193583,[]
9739,193585,[]
9740,193587,[]


## Удаляем пустые рекомендации

In [68]:
recommendations_df = recommendations_df[recommendations_df["recommendations"].str.len() > 0]

In [69]:
recommendations_df

Unnamed: 0,movieId,recommendations
0,1,"[2355, 3114, 122918, 2294, 3754, 4016, 4886, 4..."
1,2,"[60, 126, 1009, 2043, 2093, 2161, 2162, 2399, ..."
2,3,"[64, 68, 118, 129, 180, 186, 276, 287, 289, 294]"
3,4,"[94, 195, 281, 359, 372, 542, 803, 1175, 1236,..."
4,5,"[34359, 34528, 6788, 7, 2719, 4808, 8366, 6944..."
...,...,...
9719,189111,"[94160, 95149, 102084, 4158, 7228, 72692, 7900..."
9720,189333,"[7228, 72692, 79008, 126577, 139855, 182297, 1..."
9721,189381,"[57, 121, 148, 154, 184, 193, 209, 211, 213, 219]"
9722,189547,"[79274, 95004, 99813, 136297, 193587, 106873, ..."


# Создаем и качаем таблицу для добавления в бд

In [70]:
recommendations_df.to_csv("movie_recommendations_list.csv", index=False)

In [71]:
from google.colab import files
files.download('movie_recommendations_list.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>