# Movie recommendation

In [28]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
import textblob
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [84]:
url = 'C:/Users/Hp/OneDrive/Desktop/PYDS/Movie_recommender/Data/imdb_raw.csv'
df = pd.read_csv(url)
df.head()

Unnamed: 0,title,director,release_year,runtime,genre,rating,metascore,gross
0,The Shawshank Redemption,Frank Darabont,(1994),142 min,Drama,9.3,82,$28.34M
1,The Godfather,Francis Ford Coppola,(1972),175 min,"Crime, Drama",9.2,100,$134.97M
2,The Dark Knight,Christopher Nolan,(2008),152 min,"Action, Crime, Drama",9.0,84,$534.86M
3,Schindler's List,Steven Spielberg,(1993),195 min,"Biography, Drama, History",9.0,95,$96.90M
4,12 Angry Men,Sidney Lumet,(1957),96 min,"Crime, Drama",9.0,97,$4.36M


# Top movies according to rating in IMBD

In [82]:
nd= df[['title','release_year','rating']]
top_df = nd.head(10)
top_df
stop_df = top_df.sort_values(by='rating',ascending=False)
stop_df.head()

Unnamed: 0,title,release_year,rating
0,The Shawshank Redemption,(1994),9.3
1,The Godfather,(1972),9.2
2,The Dark Knight,(2008),9.0
3,Schindler's List,(1993),9.0
4,12 Angry Men,(1957),9.0


In [92]:
df.shape

(1000, 9)

In [89]:
df['data'] = df['title'] + ' ' + df['director'] + ' ' +  df['genre']
df['data'].head()

0        The Shawshank Redemption Frank Darabont Drama
1      The Godfather Francis Ford Coppola Crime, Drama
2    The Dark Knight Christopher Nolan Action, Crim...
3    Schindler's List Steven Spielberg Biography, D...
4               12 Angry Men Sidney Lumet Crime, Drama
Name: data, dtype: object

In [90]:
#remove punctuations - anything that is not a word or a space
df['data'] = df['data'].str.replace('[^\w\s]','')
# Lower case
df['data'] = df['data'].str.lower()

In [91]:
def remove_stopwords(text):
    words = text.split()
    return" ".join(word for word in words if word not in stopwords.words('english'))
df['data'] = df['data'].apply(remove_stopwords)
df['data']

0              shawshank redemption frank darabont drama
1            godfather francis ford coppola crime, drama
2      dark knight christopher nolan action, crime, d...
3      schindler's list steven spielberg biography, d...
4                 12 angry men sidney lumet crime, drama
                             ...                        
995    long engagement jean-pierre jeunet drama, myst...
996            shine scott hicks biography, drama, music
997    philomena stephen frears biography, comedy, drama
998             invisible man james whale horror, sci-fi
999          cell 211 daniel monzón action, crime, drama
Name: data, Length: 1000, dtype: object

In [33]:
#vectorize the data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['data']).toarray()
X.shape

(1000, 2267)

In [34]:
# similarity matrix
similarity = cosine_similarity(X, X)
similarity.shape

(1000, 1000)

# Movie Recommendation based on entered movie

In [73]:
def get_index_from_title(title):
    try: 
        return df[df.title == title].index[0]
    except:
        return None


def recommend_movie(title, Limit = 10):
    index = get_index_from_title(title)
    if index is None:
        return []
    else:
        movie_scores = []
        for i in range(similarity.shape[0]):
            movie_scores.append((df['title'][i],similarity[index][i]))
        movie_scores.sort(key=lambda x:x[1],reverse = True)
        return movie_scores[1:Limit+1]

In [None]:
movie = input('The the name of the movie: ') 
nr = int(input('Enter the number of recomendations: '))
recommend_movie(movie,nr)

In [93]:
rm= recommend_movie('The Godfather',5)
rm

[('The Godfather Part II', 0.8660254037844387),
 ('Apocalypse Now', 0.6172133998483676),
 ('The Conversation', 0.6172133998483676),
 ('Casino', 0.36514837167011077),
 ('Z', 0.36514837167011077)]

In [49]:
drm=pd.DataFrame(rm)
drm

Unnamed: 0,0,1
0,The Godfather Part II,0.866025
1,Apocalypse Now,0.617213
2,The Conversation,0.617213
3,Casino,0.365148
4,Z,0.365148


In [77]:
drm.rename(
    mapper={'0':'Title',
    '1':'Movie Scores'},
    axis=1,
    inplace=True
)
drm

Unnamed: 0,0,1
0,The Godfather Part II,0.866025
1,Apocalypse Now,0.617213
2,The Conversation,0.617213
3,Casino,0.365148
4,Z,0.365148
