## Import Libraries 

In [1]:
import pandas as pd
import numpy as np


## Get the data

In [2]:
# Load the movie catalogue; the relative path expects movies.csv in the notebook's working directory.
movies = pd.read_csv('movies.csv')

In [3]:
# Preview the first rows to check the columns that were loaded.
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# (rows, columns) of the loaded frame.
movies.shape

(62423, 3)

In [5]:
# Count missing values per column.
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

## Build Search Engine

In [9]:
import re

def clean_title(title):
    """Return ``title`` with every character outside a-z, A-Z and 0-9 replaced by a space.

    The replacement is one-for-one: the result has the same length as the
    input, and a run of punctuation becomes a run of spaces.
    """
    non_alphanumeric = re.compile("[^a-zA-Z0-9]")
    return non_alphanumeric.sub(" ", title)

In [10]:
# Add a copy of each title with non-alphanumeric characters replaced by spaces (see clean_title); this column is the text the TF-IDF vectorizer is fitted on.
movies['clean_title'] = movies['title'].apply(clean_title)

In [11]:
# Preview again to confirm the clean_title column is present.
movies.head()

Unnamed: 0,movieId,title,genres,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Toy Story 1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Jumanji 1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Grumpier Old Men 1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Waiting to Exhale 1995
4,5,Father of the Bride Part II (1995),Comedy,Father of the Bride Part II 1995


### Creating the TF-IDF matrix

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,2): index single words and adjacent word pairs.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
# Learn the vocabulary from the cleaned titles and build the TF-IDF matrix that search() compares queries against.
tfidf = vectorizer.fit_transform(movies['clean_title'])

In [32]:
# Create search function

from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


def search(title, top_n=5):
    """Return the movies whose titles are most similar to ``title``.

    The query is cleaned with ``clean_title``, vectorised with the fitted
    ``vectorizer`` and compared against every row of ``tfidf`` using cosine
    similarity.

    Parameters
    ----------
    title : str
        Free-text movie title to look up.
    top_n : int, default 5
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``movies``, ordered from most to least
        similar. Empty when ``top_n`` is not positive or there are no movies.
    """
    title = clean_title(title)
    query_vec = vectorizer.transform([title])
    similarity = cosine_similarity(query_vec, tfidf).flatten()

    # Clamp k so argpartition does not raise when there are fewer than top_n rows.
    k = min(top_n, similarity.shape[0])
    if k <= 0:
        return movies.iloc[[]]

    # argpartition selects the k best scores but leaves them in arbitrary
    # order, so the selection is sorted explicitly to get a best-first ranking.
    top_indices = np.argpartition(similarity, -k)[-k:]
    ranked = top_indices[np.argsort(similarity[top_indices])[::-1]]

    return movies.iloc[ranked]

In [33]:
# Build interactive search box

from ipywidgets import widgets
from IPython.display import display


# Text box for the query, pre-filled with an example title.
movie_input = widgets.Text(
    value = 'Toy Story',
    description = 'Movie Title',
    disabled = False
)

# Output area that on_type renders the search results into.
movie_list = widgets.Output()


def on_type(data):
    """Refresh the results area in response to a change of the text box value.

    ``data`` is the change notification; its ``'new'`` entry holds the current
    text. Previous results are always cleared, and a search is only run once
    the text is longer than five characters.
    """
    with movie_list:
        movie_list.clear_output()
        query = data['new']
        if len(query) <= 5:
            return
        display(search(query))
            
# Call on_type whenever the text box's value changes.
movie_input.observe(on_type, names='value')

# Show the text box followed by the results area.
display(movie_input, movie_list)

Text(value='Toy Story', description='Movie Title')

Output()

### Import ratings CSV

In [35]:
# NOTE(review): absolute, machine-specific path (movies.csv is loaded with a relative path earlier in the notebook).
# The recorded run of this cell ended in a ParserError, so `rating` may be undefined afterwards — confirm the file exists and is readable.
rating = pd.read_csv(r"C:\Users\Administrator\Desktop\Datasets\ratings.csv")

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
# Preview the first rows; head must be called — a bare `rating.head` only shows the bound method's repr.
rating.head()