<a href="https://colab.research.google.com/github/VictoryChianumba/Github-Projects/blob/main/AirflowDataPipeline(movie_recommendation)_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

In [None]:
# Load the movie catalogue; the relative path is resolved against the current working directory.
movies = pd.read_csv("movies.csv")

In [None]:
# Display the loaded frame; the notebook renders a truncated preview.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
62418,209157,We (2018),Drama
62419,209159,Window of the Soul (2001),Documentary
62420,209163,Bad Poems (2018),Comedy|Drama
62421,209169,A Girl Thing (2001),(no genres listed)


In [None]:
import re

def clean_title(title):
  """Strip every character that is not an ASCII letter, a digit or a space.

  Args:
    title: Raw movie title, e.g. "Toy Story (1995)".

  Returns:
    The title with all other characters removed, e.g. "Toy Story 1995".
  """
  unwanted = re.compile("[^a-zA-Z0-9 ]")
  return unwanted.sub("", title)

In [None]:
# Add a normalised copy of each title (via clean_title) as a new column.
movies["clean_title"] = movies["title"].apply(clean_title)

In [None]:
# Display the frame again to check the new clean_title column.
movies

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995
...,...,...,...,...
62418,209157,We (2018),Drama,We 2018
62419,209159,Window of the Soul (2001),Documentary,Window of the Soul 2001
62420,209163,Bad Poems (2018),Comedy|Drama,Bad Poems 2018
62421,209169,A Girl Thing (2001),(no genres listed),A Girl Thing 2001


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): use both single words and two-word sequences as features.
vectorizer = TfidfVectorizer(ngram_range=(1,2))

# Fit the vocabulary on the cleaned titles and keep the resulting TF-IDF matrix.
tfidf = vectorizer.fit_transform(movies["clean_title"])

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title, top_n=5):
  """Return the movies whose titles best match ``title``, best match first.

  The query is normalised with ``clean_title``, transformed with the fitted
  module-level ``vectorizer`` and compared against ``tfidf`` using cosine
  similarity.

  Args:
    title: Free-text movie title to look up.
    top_n: Maximum number of matches to return (defaults to 5).

  Returns:
    Rows of ``movies`` ordered by descending similarity. At most ``top_n``
    rows are returned; fewer when the catalogue itself is smaller.
  """
  title = clean_title(title)
  query_vec = vectorizer.transform([title])
  similarity = cosine_similarity(query_vec, tfidf).flatten()

  # Clamp so argpartition does not raise on catalogues smaller than top_n.
  top_n = min(top_n, len(similarity))
  if top_n <= 0:
    return movies.iloc[[]]

  # argpartition only separates the top_n scores from the rest; it does not
  # order them. Rank the candidates explicitly so row 0 is the best match.
  candidates = np.argpartition(similarity, -top_n)[-top_n:]
  ranked = candidates[np.argsort(similarity[candidates])[::-1]]
  return movies.iloc[ranked]


In [None]:
import ipywidgets as widgets
from IPython.display import display

# Text box for the title query, pre-filled with an example value.
movie_input = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# Output area that on_type renders the search results into.
movie_list = widgets.Output()

def on_type(data):
  """Refresh the search results shown in ``movie_list``.

  Args:
    data: Change notification from the text widget; the new text is read
      from ``data["new"]``.

  The previous output is always cleared. A search is only run when the new
  text is longer than five characters.
  """
  with movie_list:
    movie_list.clear_output()
    query = data["new"]
    if len(query) <= 5:
      return
    display(search(query))

# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Render the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title:')

Output()

In [None]:
# Load the user ratings; the relative path is resolved against the current working directory.
ratings =  pd.read_csv("ratings.csv")

In [None]:
# Display the ratings frame; the notebook renders a truncated preview.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510
...,...,...,...,...
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434


In [None]:
# Seed movie for the step-by-step walkthrough in the following cells.
movie_id= 1

In [None]:
# Distinct users who gave the seed movie a rating of 5 or higher.
similar_users = ratings[(ratings["movieId"]== movie_id) & (ratings["rating"] >=5)]["userId"].unique()

In [None]:
# Inspect the array of matching user ids.
similar_users

array([    36,     75,     86, ..., 162518, 162519, 162530])

In [None]:
# movieId of every rating above 4 made by those users (one entry per rating row).
similar_user_recs = ratings[(ratings["userId"].isin(similar_users))& (ratings["rating"] >4)]["movieId"]

In [None]:
# Inspect the raw list of liked movieIds.
similar_user_recs

5101           1
5105          34
5111         110
5114         150
5127         260
            ... 
24998388    3706
24998389    3735
24998391    3763
24998392    4187
24998393    4321
Name: movieId, Length: 912084, dtype: int64

In [None]:
# Count how often each movieId occurs and divide by the number of similar users,
# giving a per-movie share (assumes at most one rating per user and movie — TODO confirm).
# NOTE: this rebinds similar_user_recs, so re-running this cell on its own would
# operate on the already-normalised series instead of the raw movieIds.
similar_user_recs = similar_user_recs.value_counts()/len(similar_users)

# Keep only movies whose share is above 0.1.
similar_user_recs = similar_user_recs[similar_user_recs > 0.1]

In [None]:
# Inspect the filtered shares, indexed by movieId.
similar_user_recs

1       1.000000
318     0.414556
260     0.404561
356     0.347253
296     0.342663
          ...   
1259    0.102991
7361    0.101881
1206    0.101362
1307    0.101066
1208    0.100918
Name: movieId, Length: 92, dtype: float64

In [None]:
# Ratings above 4, from any user, for the movies indexed in similar_user_recs.
all_users = ratings[(ratings["movieId"].isin(similar_user_recs.index))& (ratings["rating"] > 4)]

In [None]:
# Per-movie row count divided by the number of distinct users in all_users.
all_user_recs = all_users["movieId"].value_counts() / len(all_users["userId"].unique())

In [None]:
# Inspect the shares computed over all users.
all_user_recs

318      0.345282
296      0.287220
2571     0.246217
356      0.237370
593      0.227930
           ...   
1387     0.047886
1307     0.046195
745      0.037362
78499    0.035445
2355     0.025316
Name: movieId, Length: 92, dtype: float64

In [None]:
# Place the two share series side by side (concat along axis 1 aligns them on
# their shared movieId index) and give the columns readable names.
rec_percentages = pd.concat([similar_user_recs, all_user_recs], axis = 1)
rec_percentages.columns = ["similar", "all"]

In [None]:
# Inspect the combined table.
rec_percentages

Unnamed: 0,similar,all
1,1.000000,0.125844
318,0.414556,0.345282
260,0.404561,0.224195
356,0.347253,0.237370
296,0.342663,0.287220
...,...,...
1259,0.102991,0.049349
7361,0.101881,0.105172
1206,0.101362,0.087500
1307,0.101066,0.046195


In [None]:
# Score is the share among similar users relative to the share among all users;
# a higher value means the movie is disproportionately liked by the similar users.
rec_percentages["score"] = rec_percentages["similar"]/rec_percentages["all"]
# Highest scores first.
rec_percentages = rec_percentages.sort_values("score", ascending = False)

In [None]:
# Inspect the scored, sorted table.
rec_percentages

Unnamed: 0,similar,all,score
1,1.000000,0.125844,7.946323
3114,0.295498,0.054186,5.453383
2355,0.124685,0.025316,4.925186
78499,0.138161,0.035445,3.897906
588,0.233674,0.068117,3.430480
...,...,...,...
58559,0.160743,0.147779,1.087725
79132,0.129424,0.132559,0.976349
7361,0.101881,0.105172,0.968704
2959,0.205020,0.218656,0.937638


In [None]:
# Attach title and genres to the ten highest-scoring movies by joining the
# movieId index against movies["movieId"].
rec_percentages.head(10).merge(movies, left_index=True, right_on="movieId")[["score", "title", "genres"]]

Unnamed: 0,score,title,genres
0,7.946323,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3021,5.453383,Toy Story 2 (1999),Adventure|Animation|Children|Comedy|Fantasy
2264,4.925186,"Bug's Life, A (1998)",Adventure|Animation|Children|Comedy
14813,3.897906,Toy Story 3 (2010),Adventure|Animation|Children|Comedy|Fantasy|IMAX
580,3.43048,Aladdin (1992),Adventure|Animation|Children|Comedy|Musical
587,3.287671,Beauty and the Beast (1991),Animation|Children|Fantasy|Musical|Romance|IMAX
33,3.002602,Babe (1995),Children|Drama
4780,2.94841,"Monsters, Inc. (2001)",Adventure|Animation|Children|Comedy|Fantasy
1047,2.914882,Willy Wonka & the Chocolate Factory (1971),Children|Comedy|Fantasy|Musical
729,2.899227,Wallace & Gromit: A Close Shave (1995),Animation|Children|Comedy


In [None]:
def find_similar_movies(movie_id):
  """Recommend movies favoured by users who gave ``movie_id`` a top rating.

  Reads the module-level ``ratings`` and ``movies`` frames.

  Args:
    movie_id: Value matched against ``ratings["movieId"]`` to pick the seed movie.

  Returns:
    A frame with columns ``score``, ``title`` and ``genres`` for up to ten
    movies, highest score first. ``score`` is the movie's share among users
    who rated the seed 5 or higher, divided by its share among all users who
    rated any candidate movie above 4.
  """
  liked = ratings["rating"] > 4

  # Users who rated the seed movie 5 or higher.
  is_fan_rating = (ratings["movieId"] == movie_id) & (ratings["rating"] >= 5)
  fans = ratings.loc[is_fan_rating, "userId"].unique()

  # Per-movie share among those users; keep only shares above 0.1.
  fan_likes = ratings.loc[ratings["userId"].isin(fans) & liked, "movieId"]
  fan_share = fan_likes.value_counts() / len(fans)
  fan_share = fan_share[fan_share > 0.1]

  # Per-movie share among every user who rated a candidate movie above 4.
  candidate_likes = ratings[ratings["movieId"].isin(fan_share.index) & liked]
  liker_count = len(candidate_likes["userId"].unique())
  overall_share = candidate_likes["movieId"].value_counts() / liker_count

  table = pd.concat([fan_share, overall_share], axis=1)
  table.columns = ["similar", "all"]
  table["score"] = table["similar"] / table["all"]
  table = table.sort_values("score", ascending=False)

  top_ten = table.head(10)
  labelled = top_ten.merge(movies, left_index=True, right_on="movieId")
  return labelled[["score", "title", "genres"]]



In [None]:
# Text box for the title query that drives the recommendations, pre-filled with an example value.
movie_input_name = widgets.Text(
    value = "Toy Story",
    description = "Movie Title:",
    disabled = False
)

# Output area that on_type renders the recommendations into.
recommendation_list = widgets.Output()

def on_type(data):
  """Refresh the recommendations shown in ``recommendation_list``.

  Args:
    data: Change notification from the text widget; the new text is read
      from ``data["new"]``.

  The previous output is always cleared. When the new text is longer than
  five characters, the first row returned by ``search`` is used as the seed
  for ``find_similar_movies``.

  Note: this definition reuses the name ``on_type`` from the search widget
  defined earlier in the notebook and replaces that name in the namespace.
  """
  with recommendation_list:
    recommendation_list.clear_output()
    query = data["new"]
    if len(query) <= 5:
      return
    matches = search(query)
    seed_id = matches.iloc[0]["movieId"]
    display(find_similar_movies(seed_id))

# Call on_type whenever the text box's value changes.
movie_input_name.observe(on_type, names="value")

# Render the text box followed by the recommendations area.
display(movie_input_name, recommendation_list)

Text(value='Toy Story', description='Movie Title:')

Output()