In [1]:
import pandas as pd


In [2]:
movies = pd.read_csv("movies.csv")

In [3]:
import re
def clean_title(title):
    """Remove every character that is not an ASCII letter, a digit or a space.

    Example: "Toy Story (1995)" becomes "Toy Story 1995".
    """
    disallowed = "[^a-zA-Z0-9 ]"
    return re.sub(disallowed, "", title)

In [4]:
# Add a cleaned copy of every title (see clean_title) to use for the text search.
movies["clean_title"] = movies["title"].apply(clean_title)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# ngram_range=(1,2) makes the vectorizer index single words as well as two-word sequences.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and keep the resulting TF-IDF matrix for similarity lookups.
tfidf = vectorizer.fit_transform(movies.clean_title)

# Arama fonksiyonu

In [7]:
from sklearn.metrics.pairwise import cosine_similarity

import numpy as np

# Example: search for "toy story 1995".
title = "toy story 1995"
title = clean_title(title)
# Transform the query with the vectorizer that was fitted on the movie titles.
query_vec = vectorizer.transform([title])
# Cosine similarity between the query and every title vector, flattened to a 1-D array.
similarity = cosine_similarity(query_vec, tfidf).flatten()

In [8]:
query_vec.data

array([0.52364649, 0.47886319, 0.56091516, 0.30818288, 0.29475734])

In [9]:
# `similarity` holds the similarity score between "toy story 1995" and every movie title.
similarity

array([1.        , 0.09681098, 0.06531543, ..., 0.        , 0.        ,
       0.        ])

In [10]:
# The five movies with the highest similarity, best match first.
# np.argpartition only moves the five largest scores into the last five slots and
# leaves them in undefined order, so they are sorted explicitly afterwards.
indices = np.argpartition(similarity, -5)[-5:]
indices = indices[np.argsort(-similarity[indices], kind="stable")]
results = movies.iloc[indices]
results

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
59767,201588,Toy Story 4 (2019),Adventure|Animation|Children|Comedy,Toy Story 4 2019
14813,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
20497,106022,Toy Story of Terror (2013),Animation|Children|Comedy,Toy Story of Terror 2013


In [11]:
# yukarida yaptiklarimizi fonksiyon olarak duzenliyoruz.
def search(title, top_n=5):
    """Return the rows of `movies` whose titles are most similar to `title`.

    The query is cleaned with `clean_title`, transformed with the fitted
    `vectorizer` and compared against `tfidf` using cosine similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows of `movies`, ordered from most to least similar.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # argpartition raises when asked for more elements than exist, so cap the request.
    k = min(top_n, similarity.size)
    if k <= 0:
        return movies.iloc[[]]

    # argpartition isolates the k largest scores but leaves them in undefined order ...
    candidates = np.argpartition(similarity, -k)[-k:]
    # ... so rank them explicitly; this guarantees the best match is the first row.
    ranked = candidates[np.argsort(-similarity[candidates], kind="stable")]
    return movies.iloc[ranked]

# İnteraktif arama kutusu

In [12]:
#!pip install ipywidgets
import ipywidgets as widgets
from IPython.display import display

# Text box the user types a movie title into.
movie_input = widgets.Text(value="", description="Movie Title:", disabled=False)

# Output area that shows the current search results.
movie_list = widgets.Output()


def on_type(data):
    """Refresh the result list each time the text box value changes."""
    with movie_list:
        movie_list.clear_output()
        title = data["new"]

        # Only search once more than five characters have been typed.
        if len(title) <= 5:
            return
        display(search(title))


movie_input.observe(on_type, names="value")

display(movie_input, movie_list)

Text(value='', description='Movie Title:')

Output()

# Film rating degerleri

In [13]:
ratings = pd.read_csv("ratings.csv")
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


# Ayni filmi seven kisileri bulma

In [14]:
# movie_id si 1 olan filmi sevenlere bakiyoruz.
movie_id = 1

In [15]:
# Ids of the users who rated the chosen movie 5 or higher.
similar_users = ratings.loc[
    (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5),
    "userId",
].unique()

In [16]:
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530], dtype=int64)

In [17]:
# The users found above liked the chosen movie; now collect the other movies they liked.
# Here "liked" means a rating above 4.
similar_user_recs = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]

In [18]:
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [19]:
# How often each movie was liked by the 13506 users who gave movie id 1 a rating of 5.
# After movie 1 itself, these users most often liked movies 318, 260, 356 and 296, in that order.
similar_user_recs.value_counts()

1         13506
318        5599
260        5464
356        4690
296        4628
          ...  
27306         1
71732         1
4739          1
190187        1
97957         1
Name: movieId, Length: 16797, dtype: int64

In [20]:
# ilgi gosterme oranlari
similar_user_recs.value_counts() / len(similar_users)

1         1.000000
318       0.414556
260       0.404561
356       0.347253
296       0.342663
            ...   
27306     0.000074
71732     0.000074
4739      0.000074
190187    0.000074
97957     0.000074
Name: movieId, Length: 16797, dtype: float64

In [21]:
# Keep only the movies liked by more than 10% of the similar users.
# The shares are rebuilt from `ratings` rather than from `similar_user_recs` itself:
# this cell overwrites `similar_user_recs`, so deriving it from its own previous value
# would give wrong numbers whenever the cell is run a second time.
liked_by_similar = ratings.loc[
    ratings["userId"].isin(similar_users) & (ratings["rating"] > 4),
    "movieId",
]
similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [22]:
similar_user_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

# Bulunan filmlerin butun izleyiciler uzerinde etkisi

In [23]:
# `similar_user_recs` now lists the movies liked by more than 10% of the similar users.
# Collect every rating above 4 that any user gave to one of those movies.
all_users = ratings.loc[
    ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > 4)
]

In [24]:
ratings[(ratings["movieId"].isin(similar_user_recs.index))]

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
36,1,5952,4.0,1147868053
39,1,6377,4.0,1147868469
40,1,6539,3.5,1147868461
...,...,...,...,...
25000078,162541,7153,5.0,1240952613
25000081,162541,7361,4.5,1240953484
25000084,162541,8961,4.0,1240953338
25000087,162541,33794,4.0,1240951792


In [25]:
all_users

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
29,1,4973,4.5,1147869080
48,1,7361,5.0,1147880055
72,2,110,5.0,1141416589
76,2,260,5.0,1141417172
...,...,...,...,...
25000055,162541,4973,4.5,1240950790
25000057,162541,4993,5.0,1240952610
25000065,162541,5952,5.0,1240952617
25000078,162541,7153,5.0,1240952613


In [26]:
# Share of users who liked each shortlisted movie: likes per movie divided by the number of distinct users in `all_users`.
all_users_recs = all_users["movieId"].value_counts()/len(all_users["userId"].unique())

In [27]:
all_users_recs

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

# Oneri skoru hesaplama

In [28]:
# Put both shares side by side, aligned on movie id.
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
# "similar" = share among the similar users, "all" = share among all users.
rec_percentages.columns = ["similar", "all"]

In [29]:
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [30]:
# The score compares how popular a movie is among the similar users with how popular it is overall.
# Example: 41% of the users who loved movie id 1 also liked movie 318, but 34% of all users
# liked it as well, so it is simply popular with everyone, which is not what we are looking for.
# We want the two shares to differ widely: the larger the ratio between them, the higher the score.
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

In [31]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [32]:
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [33]:
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")

Unnamed: 0,similar,all,score,movieId,title,genres,clean_title
0,1.0,0.125844,7.946323,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
3021,0.295498,0.054186,5.453383,3114,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 2 1999
2264,0.124685,0.025316,4.925186,2355,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy,Bugs Life A 1998
14813,0.138161,0.035445,3.897906,78499,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX,Toy Story 3 2010
580,0.233674,0.068117,3.43048,588,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical,Aladdin 1992
587,0.198949,0.060514,3.287671,595,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX,Beauty and the Beast 1991
33,0.158226,0.052696,3.002602,34,Babe (1995),Children|Drama,Babe 1995
4780,0.210647,0.071444,2.94841,4886,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy,Monsters Inc 2001
1047,0.143418,0.049202,2.914882,1073,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical,Willy Wonka the Chocolate Factory 1971
729,0.108322,0.037362,2.899227,745,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy,Wallace Gromit A Close Shave 1995


# Tekrardan oneri fonksiyonu olusturma

In [34]:
# yukarida yaptiklarimizi fonksiyon haline getiriyoruz.
def find_similar_movies(movie_id, fan_rating=5, min_rating=4, min_share=.1, top_n=10):
    """Recommend movies that fans of `movie_id` like unusually often.

    Reads the notebook-level `ratings` and `movies` frames.

    Parameters
    ----------
    movie_id : int
        Id of the movie to base the recommendations on.
    fan_rating : float, default 5
        A user counts as a fan when they rated `movie_id` at least this high.
    min_rating : float, default 4
        A movie counts as liked when its rating is strictly above this value.
    min_share : float, default 0.1
        Keep only movies liked by more than this share of the fans.
    top_n : int, default 10
        Number of highest-scoring movies to return.

    Returns
    -------
    pandas.DataFrame
        Up to `top_n` rows with the columns "similar" (share of fans who liked
        the movie), "all" (share of all users who liked it) and "score"
        (similar / all), merged with the matching rows of `movies` and sorted
        by descending score.
    """
    # Users who rated the query movie at least `fan_rating`.
    similar_users = ratings.loc[
        (ratings["movieId"] == movie_id) & (ratings["rating"] >= fan_rating),
        "userId",
    ].unique()

    # Movies those users liked, as a share of the number of such users.
    liked_by_similar = ratings.loc[
        ratings["userId"].isin(similar_users) & (ratings["rating"] > min_rating),
        "movieId",
    ]
    similar_user_recs = liked_by_similar.value_counts() / len(similar_users)
    similar_user_recs = similar_user_recs[similar_user_recs > min_share]

    # How often the same movies are liked across all users.
    all_users = ratings[
        ratings["movieId"].isin(similar_user_recs.index) & (ratings["rating"] > min_rating)
    ]
    all_users_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

    rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
    rec_percentages.columns = ["similar", "all"]
    # A high ratio means the movie is liked far more often by the fans than by users overall.
    rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]
    rec_percentages = rec_percentages.sort_values("score", ascending=False)
    return rec_percentages.head(top_n).merge(movies, left_index=True, right_on="movieId")

# İnteraktif oneri kutusu olusturma

In [36]:
# Text box the user types a movie title into.
movie_name_input = widgets.Text(
    description="Movie Title",
    disabled=False
)

# Output area that shows the recommendations for the typed title.
recommendation_list = widgets.Output()

# Named differently from the search box handler `on_type` so that this cell
# does not silently rebind that name.
def on_recommendation_type(data):
    """Show recommendations for the first movie returned by `search` for the typed title."""
    with recommendation_list:
        recommendation_list.clear_output()
        title = data["new"]

        # Only search once more than five characters have been typed.
        if len(title) > 5:
            results = search(title)
            # Nothing to recommend from when the search returns no rows.
            if results.empty:
                return
            # Read the id from the column so it keeps the column's dtype.
            movie_id = results["movieId"].iloc[0]
            display(find_similar_movies(movie_id))

movie_name_input.observe(on_recommendation_type, names="value")
display(movie_name_input, recommendation_list)

Text(value='', description='Movie Title')

Output()