In [1]:
import numpy as np
import pandas as pd
import ast
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer

In [2]:
movies = pd.read_csv('tmdb_movies.csv')
credits = pd.read_csv('tmdb_credits.csv')
# movies.head()
#credits.head()
movies = movies.merge(credits,on='title')
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

In [3]:
# genres keywords title overview id cast crew
movies = movies[['genres','id','keywords','title','overview','cast','crew']]
movies.head()

Unnamed: 0,genres,id,keywords,title,overview,cast,crew
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",Spectre,A cryptic message from Bond’s past sends him o...,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",John Carter,"John Carter is a war-weary, former military ca...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [4]:
movies.isnull().sum()

genres      0
id          0
keywords    0
title       0
overview    3
cast        0
crew        0
dtype: int64

In [5]:
movies.dropna(inplace=True)

In [6]:
movies.duplicated().sum()

0

In [7]:
movies.iloc[0]

genres      [{"id": 28, "name": "Action"}, {"id": 12, "nam...
id                                                      19995
keywords    [{"id": 1463, "name": "culture clash"}, {"id":...
title                                                  Avatar
overview    In the 22nd century, a paraplegic Marine is di...
cast        [{"cast_id": 242, "character": "Jake Sully", "...
crew        [{"credit_id": "52fe48009251416c750aca23", "de...
Name: 0, dtype: object

In [8]:

def parse_features(text):
    try:
        return [i['name'] for i in ast.literal_eval(text)]
    except Exception as e:
        print(f"parse_features error: {e}")
        return []

def parse_director(text):
    try:
        return [i['name'] for i in ast.literal_eval(text) if i.get('job') == "Director"]
    except Exception as e:
        print(f"parse_director error: {e}")
        return []

def parse_cast(text):
    try:
        return [i['name'] for i in ast.literal_eval(text)][:3]
    except Exception as e:
        print(f"parse_cast error: {e}")
        return []
        
def clean_data(x):
    if isinstance(x, list):
        return [str(i).lower().replace(" ", "") for i in x]
    elif isinstance(x, str):
        return str(x).lower().replace(" ", "")
    else:
        return ''


In [9]:
movies['genres'] = movies['genres'].apply(parse_features)
movies['keywords'] = movies['keywords'].apply(parse_features)
movies['cast'] = movies['cast'].apply(parse_cast)
movies['crew'] = movies['crew'].apply(parse_director)

In [10]:
movies['genres'] = movies['genres'].apply(clean_data)
movies['keywords'] = movies['keywords'].apply(clean_data)
movies['cast'] = movies['cast'].apply(clean_data)
movies['crew'] = movies['crew'].apply(clean_data)
movies['overview'] = movies['overview'].apply(lambda x: x.lower().split())

In [11]:
movies.head()

Unnamed: 0,genres,id,keywords,title,overview,cast,crew
0,"[action, adventure, fantasy, sciencefiction]",19995,"[cultureclash, future, spacewar, spacecolony, ...",Avatar,"[in, the, 22nd, century,, a, paraplegic, marin...","[samworthington, zoesaldana, sigourneyweaver]",[jamescameron]
1,"[adventure, fantasy, action]",285,"[ocean, drugabuse, exoticisland, eastindiatrad...",Pirates of the Caribbean: At World's End,"[captain, barbossa,, long, believed, to, be, d...","[johnnydepp, orlandobloom, keiraknightley]",[goreverbinski]
2,"[action, adventure, crime]",206647,"[spy, basedonnovel, secretagent, sequel, mi6, ...",Spectre,"[a, cryptic, message, from, bond’s, past, send...","[danielcraig, christophwaltz, léaseydoux]",[sammendes]
3,"[action, crime, drama, thriller]",49026,"[dccomics, crimefighter, terrorist, secretiden...",The Dark Knight Rises,"[following, the, death, of, district, attorney...","[christianbale, michaelcaine, garyoldman]",[christophernolan]
4,"[action, adventure, sciencefiction]",49529,"[basedonnovel, mars, medallion, spacetravel, p...",John Carter,"[john, carter, is, a, war-weary,, former, mili...","[taylorkitsch, lynncollins, samanthamorton]",[andrewstanton]


In [12]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [13]:
new_df = movies[['id','title','tags']]
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"[in, the, 22nd, century,, a, paraplegic, marin..."
1,285,Pirates of the Caribbean: At World's End,"[captain, barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[a, cryptic, message, from, bond’s, past, send..."
3,49026,The Dark Knight Rises,"[following, the, death, of, district, attorney..."
4,49529,John Carter,"[john, carter, is, a, war-weary,, former, mili..."


In [14]:
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [15]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [16]:
cv = CountVectorizer(max_features=5000, stop_words='english')

In [17]:
def stem_text(text):
    return ' '.join([ps.stem(word) for word in text.split()])


In [18]:
ps = PorterStemmer()
new_df['tags'] = new_df['tags'].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem_text)


In [19]:
vectors = cv.fit_transform(new_df['tags']).toarray()
vectors.shape

(4806, 5000)

In [20]:
similarity = cosine_similarity(vectors)

In [21]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in movie_list:
        print(new_df.iloc[i[0]].title)

In [22]:
# recommend('Wall-E')
new_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a parapleg marin is dispa..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believ to be dead, ha c..."
2,206647,Spectre,a cryptic messag from bond’ past send him on a...
3,49026,The Dark Knight Rises,follow the death of district attorney harvey d...
4,49529,John Carter,"john carter is a war-weary, former militari ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just want to play hi guitar and ca...
4805,72766,Newlyweds,a newlyw couple' honeymoon is upend by the arr...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduc a dedic q..."
4807,126186,Shanghai Calling,when ambiti new york attorney sam is sent to s...


In [23]:
import pickle

pickle.dump(new_df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

In [24]:
import gzip

with gzip.open('movies.pkl.gz', 'wb') as f:
    pickle.dump(movies, f)

In [None]:
with gzip.open('movies.pkl.gz', 'wb') as f:
    pickle.dump(movies, f)