In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import textblob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
url = 'C:/Users/Hp/OneDrive/Desktop/PYDS/Movie_recommender/Data/imdb_raw.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,title,director,release_year,runtime,genre,rating,metascore,gross
0,The Shawshank Redemption,Frank Darabont,(1994),142 min,Drama,9.3,82,$28.34M
1,The Godfather,Francis Ford Coppola,(1972),175 min,"Crime, Drama",9.2,100,$134.97M
2,The Dark Knight,Christopher Nolan,(2008),152 min,"Action, Crime, Drama",9.0,84,$534.86M
3,Schindler's List,Steven Spielberg,(1993),195 min,"Biography, Drama, History",9.0,95,$96.90M
4,12 Angry Men,Sidney Lumet,(1957),96 min,"Crime, Drama",9.0,97,$4.36M


In [None]:
df['data'] = df['title'] + ' ' + df['director'] + ' ' +  df['genre']
df['data']

0          The Shawshank Redemption Frank Darabont Drama
1        The Godfather Francis Ford Coppola Crime, Drama
2      The Dark Knight Christopher Nolan Action, Crim...
3      Schindler's List Steven Spielberg Biography, D...
4                 12 Angry Men Sidney Lumet Crime, Drama
                             ...                        
995    A Very Long Engagement Jean-Pierre Jeunet Dram...
996            Shine Scott Hicks Biography, Drama, Music
997    Philomena Stephen Frears Biography, Comedy, Drama
998         The Invisible Man James Whale Horror, Sci-Fi
999          Cell 211 Daniel Monzón Action, Crime, Drama
Name: data, Length: 1000, dtype: object

In [None]:
#remove punctuations - anything that is not a word or a space
df['data'] = df['data'].str.replace('[^\w\s]','')
# Lower case
df['data'] = df['data'].str.lower()

In [None]:
def remove_stopwords(text):
    words = text.split()
    return" ".join(word for word in words if word not in stopwords.words('english'))
df['data'] = df['data'].apply(remove_stopwords)
df['data']

0              shawshank redemption frank darabont drama
1            godfather francis ford coppola crime, drama
2      dark knight christopher nolan action, crime, d...
3      schindler's list steven spielberg biography, d...
4                 12 angry men sidney lumet crime, drama
                             ...                        
995    long engagement jean-pierre jeunet drama, myst...
996            shine scott hicks biography, drama, music
997    philomena stephen frears biography, comedy, drama
998             invisible man james whale horror, sci-fi
999          cell 211 daniel monzón action, crime, drama
Name: data, Length: 1000, dtype: object

In [None]:
#vectorize the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['data']).toarray()
X.shape

(1000, 2267)

In [None]:
# similarity matrix
similarity = cosine_similarity(X, X)
similarity.shape

(1000, 1000)

In [None]:
def get_index_from_title(title):
    try: 
        return df[df.title == title].index[0]
    except:
        return None

def recommend_movie(title, Limit = 10):
    index = get_index_from_title(title)
    if index is None:
        return []
    else:
        movie_scores = []
        for i in range(similarity.shape[0]):
            movie_scores.append((df['title'][i],similarity[index][i]))
        movie_scores.sort(key=lambda x:x[1],reverse = True)
        return movie_scores[1:Limit+1]

In [None]:
recommend_movie('Harry Potter and the Goblet of Fire',5)

[('Harry Potter and the Prisoner of Azkaban', 0.5555555555555556),
 ("Harry Potter and the Sorcerer's Stone", 0.5555555555555556),
 ('Harry Potter and the Deathly Hallows: Part 2', 0.5270462766947299),
 ('Harry Potter and the Deathly Hallows: Part 1', 0.5270462766947299),
 ('Stardust', 0.408248290463863)]