# Recommendation system with content-based filtering

In [1]:
import pandas as pd

### Data load and preprocess

In [2]:
dataset = pd.read_csv('tmdb_movies_data.csv')
dataset.head(3)

Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,...,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj
0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,...,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,6/9/2015,5562,6.5,2015,137999939.3,1392446000.0
1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,...,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,5/13/2015,6185,7.1,2015,137999939.3,348161300.0
2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,3/18/2015,2480,6.3,2015,101199955.5,271619000.0


In [3]:
dataset = dataset[['id','original_title','overview','cast','director','genres','release_year']]
dataset.head(3)

Unnamed: 0,id,original_title,overview,cast,director,genres,release_year
0,135397,Jurassic World,Twenty-two years after the events of Jurassic ...,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,Action|Adventure|Science Fiction|Thriller,2015
1,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,Action|Adventure|Science Fiction|Thriller,2015
2,262500,Insurgent,Beatrice Prior must confront her inner demons ...,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,Adventure|Science Fiction|Thriller,2015


In [4]:
#see if there any empty value
dataset.isnull().sum()

id                 0
original_title     0
overview           4
cast              76
director          44
genres            23
release_year       0
dtype: int64

In [5]:
#remove the rows which have empty values
dataset.dropna(inplace=True)

In [6]:
#remove string value
def remove_string(texts):
    temp = []
    for i in texts:
        l = i.split(" ")
        new_string =''.join(l)
        temp.append(new_string)
    return temp


dataset['director'] = remove_string(dataset['director'])
dataset.head(3)

Unnamed: 0,id,original_title,overview,cast,director,genres,release_year
0,135397,Jurassic World,Twenty-two years after the events of Jurassic ...,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,ColinTrevorrow,Action|Adventure|Science Fiction|Thriller,2015
1,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,GeorgeMiller,Action|Adventure|Science Fiction|Thriller,2015
2,262500,Insurgent,Beatrice Prior must confront her inner demons ...,Shailene Woodley|Theo James|Kate Winslet|Ansel...,RobertSchwentke,Adventure|Science Fiction|Thriller,2015


In [7]:
# replace pipe symbol with apace
def remove_pipe(texts):
    temp=[]
    for i in texts:
        l = i.split("|")
        new_string =','.join(l)
        temp.append(new_string)
    return temp

dataset['cast'] = remove_pipe(dataset['cast'])
dataset['genres'] = remove_pipe(dataset['genres'])
dataset.head(3)

Unnamed: 0,id,original_title,overview,cast,director,genres,release_year
0,135397,Jurassic World,Twenty-two years after the events of Jurassic ...,"Chris Pratt,Bryce Dallas Howard,Irrfan Khan,Vi...",ColinTrevorrow,"Action,Adventure,Science Fiction,Thriller",2015
1,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...,"Tom Hardy,Charlize Theron,Hugh Keays-Byrne,Nic...",GeorgeMiller,"Action,Adventure,Science Fiction,Thriller",2015
2,262500,Insurgent,Beatrice Prior must confront her inner demons ...,"Shailene Woodley,Theo James,Kate Winslet,Ansel...",RobertSchwentke,"Adventure,Science Fiction,Thriller",2015


In [8]:
def make_num_to_str(texts):
    l = []
    for i in texts:
        l.append(str(i))
    return l


#concat some columns
dataset['info'] = dataset['overview'] + dataset['cast'] + dataset['director'] + dataset['genres'] + make_num_to_str(dataset['release_year'])

final_dataset = dataset[['id', 'original_title', 'info']]

final_dataset.head(2)

Unnamed: 0,id,original_title,info
0,135397,Jurassic World,Twenty-two years after the events of Jurassic ...
1,76341,Mad Max: Fury Road,An apocalyptic story set in the furthest reach...


In [9]:
#Converts text to lower case
def convert_lower_case(texts):
    l = []
    for i in texts:
        l.append(i.lower())
    return l

final_dataset['info'] = convert_lower_case(final_dataset['info'])
final_dataset.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['info'] = convert_lower_case(final_dataset['info'])


Unnamed: 0,id,original_title,info
0,135397,Jurassic World,twenty-two years after the events of jurassic ...
1,76341,Mad Max: Fury Road,an apocalyptic story set in the furthest reach...


### Remove stopwords

In [10]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


def tokenize_and_remove_stopwords(texts):
    ready_texts = []
    stop_words = set(stopwords.words('english'))
    for i in texts:
        words = word_tokenize(i)
        final_words = []
        for word in words:
            if word not in stop_words: 
                final_words.append(word)
        final_texts = " ".join(final_words)
        ready_texts.append(final_texts)
         
    return ready_texts


final_dataset['info'] = tokenize_and_remove_stopwords(final_dataset['info'])
print(final_dataset.head(3))

       id      original_title  \
0  135397      Jurassic World   
1   76341  Mad Max: Fury Road   
2  262500           Insurgent   

                                                info  
0  twenty-two years events jurassic park , isla n...  
1  apocalyptic story set furthest reaches planet ...  
2  beatrice prior must confront inner demons cont...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['info'] = tokenize_and_remove_stopwords(final_dataset['info'])


### Stemming

In [11]:
from nltk.stem.porter import PorterStemmer

def stem(texts):
    ready_texts = []
    ps = PorterStemmer()

    for i in texts:
        words = word_tokenize(i)
        final_words = []
        for word in words:
            stemmed_word = ps.stem(word)  
            final_words.append(stemmed_word)
        final_texts = " ".join(final_words)
        ready_texts.append(final_texts)
         
    return ready_texts


final_dataset['info'] = stem(final_dataset['info'])
print(final_dataset.head())
    

       id                original_title  \
0  135397                Jurassic World   
1   76341            Mad Max: Fury Road   
2  262500                     Insurgent   
3  140607  Star Wars: The Force Awakens   
4  168259                     Furious 7   

                                                info  
0  twenty-two year event jurass park , isla nubla...  
1  apocalypt stori set furthest reach planet , st...  
2  beatric prior must confront inner demon contin...  
3  thirti year defeat galact empir , han solo all...  
4  deckard shaw seek reveng domin toretto famili ...  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_dataset['info'] = stem(final_dataset['info'])


### Vectorization

In [12]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

vectorization = CountVectorizer(max_features=10000)

#I used cosine distance because euclidean distance isn't good for high dimentional distance
find_similarity = cosine_similarity(vectorization.fit_transform(final_dataset['info']).toarray())

In [13]:
def create_tuple(index):
    find_distance = find_similarity[index]
    l = []
    for index, score in enumerate(find_distance):
        l.append((index, score))
    return l

In [14]:

def movie_recommandation(movie_name):
    find = final_dataset['original_title'] == movie_name
    find_index = final_dataset[find].index[0]

    movies = sorted(create_tuple(find_index), key=lambda x:x[1], reverse=True)

    top_4_similar_movies = movies[1:5]

    for movie in top_4_similar_movies:
        print(f"{final_dataset.iloc[movie[0]].original_title} \n")


In [15]:
# 'Insurgent'
movie_recommandation('Insurgent')

Divergent 

The Covenant 

Fase 7 

Contact 



In [16]:
import pickle
pickle.dump(final_dataset, open('movies_list.pkl', 'wb'))
pickle.dump(find_similarity, open('find_similarity.pkl', 'wb'))
