# Movie Recommendation

### Imports and Data Sets

In [5]:
import numpy as np
import pandas as pd
import ast
import nltk
import re
import requests

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from tabulate import tabulate
from IPython.display import display, HTML, Image

TMDB_API_KEY = 'a1fe2f0ac92d2dd849674115d68777a5'

ps = PorterStemmer()
cv = CountVectorizer(max_features = 5000, stop_words = 'english')

credits_df = pd.read_csv('credits.csv')
movies_df = pd.read_csv('movies.csv')

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

### Combine movies and credits df using title as key

In [7]:
movies_df = movies_df.merge(credits_df, on='title')
movies_df = movies_df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity']]

In [8]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4808 entries, 0 to 4807
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   movie_id    4808 non-null   int64  
 1   title       4808 non-null   object 
 2   overview    4805 non-null   object 
 3   genres      4808 non-null   object 
 4   keywords    4808 non-null   object 
 5   cast        4808 non-null   object 
 6   crew        4808 non-null   object 
 7   popularity  4808 non-null   float64
dtypes: float64(1), int64(1), object(6)
memory usage: 300.6+ KB


### Checking for and removing any empty or duplicated rows 

In [9]:
movies_df.isnull().sum()

movie_id      0
title         0
overview      3
genres        0
keywords      0
cast          0
crew          0
popularity    0
dtype: int64

In [10]:
movies_df.dropna(inplace=True)

In [11]:
movies_df.duplicated().sum()

0

### Making Genres and Keywords into a readable list

In [12]:
def convert_list(obj):
    L=[]
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

### Fetching the top 3 cast

In [13]:
def fetch_lead_actors(obj):
    L=[]
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

### Fetching the Director(s)

In [14]:
def fetch_director(obj):
    L=[]
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
    return L

### Using the above functions

In [15]:
movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())

In [16]:
movies_df['genres'] = movies_df['genres'].apply(convert_list)

In [17]:
movies_df['keywords'] = movies_df['keywords'].apply(convert_list)

In [18]:
movies_df['cast'] = movies_df['cast'].apply(fetch_lead_actors)

In [19]:
movies_df['crew'] = movies_df['crew'].apply(fetch_director)

### Removes all white space for comparison

In [20]:
movies_df['genres'] = movies_df['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x:[i.replace(" ","") for i in x])

movies_df.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew,popularity
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],150.437577
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],139.082615
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],107.376788
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman]",[ChristopherNolan],112.31295
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton]",[AndrewStanton],43.926995


### New column with combined, overview, genres, keywords, cast and crew

In [21]:
movies_df['tags'] = movies_df['overview'] + movies_df['genres'] + movies_df['keywords'] + movies_df['cast'] + movies_df['crew']

### New dataframe with 3 headers

In [22]:
movies = movies_df[['movie_id', 'title', 'tags']]

###### Turns into a string rather than a list

In [23]:
movies['tags'] = movies['tags'].apply(lambda x:' '.join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['tags'] = movies['tags'].apply(lambda x:' '.join(x))


In [24]:
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [25]:
movies['tags'] = movies['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['tags'] = movies['tags'].apply(lambda x:x.lower())


In [26]:
vectors = cv.fit_transform(movies['tags']).toarray()
len(cv.get_feature_names_out())

5000

### Uses ps (automatic NLP)

In [27]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [28]:
movies['tags'] = movies['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['tags'] = movies['tags'].apply(stem)


In [29]:
similarity = cosine_similarity(vectors)

In [30]:
sorted(list(enumerate(similarity[0])), reverse=True, key=lambda x:x[1])[1:11]

[(539, 0.26089696604360174),
 (507, 0.25302403842552984),
 (1194, 0.25226248955475644),
 (1216, 0.2480694691784169),
 (582, 0.24397501823713333),
 (1444, 0.24397501823713333),
 (260, 0.2428706962875665),
 (1920, 0.24053511772118194),
 (3730, 0.23904572186687872),
 (74, 0.22934123614693147)]

In [31]:
def split_names(name):
    
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', name)

In [32]:
def fetch_poster_path(movie_id):
    url = f'https://api.themoviedb.org/3/movie/{movie_id}?api_key={TMDB_API_KEY}&language=en-US'
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        data = response.json()
        poster_path = data.get('poster_path')
        if poster_path:
            return f'https://image.tmdb.org/t/p/w500/{poster_path}'
    except requests.exceptions.RequestException as e:
        print(f"Error fetching poster for movie ID {movie_id}: {e}")
    return None

In [33]:
def get_poster_html(movie_id):
    poster_path = fetch_poster_path(movie_id)
    if poster_path:
        return f'<img src="{poster_path}" style="max-height:150px;">'
    else:
        return ''

In [34]:
def get_movie_link_html(title):
    google_search_link = f'https://www.google.com/search?q={title}'
    return f'<a href="{google_search_link}" target="_blank">{title}</a>'

In [41]:
def recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:26]
    
    # Create a dictionary to store the combined scores (similarity + popularity_weight * popularity)
    combined_scores = {}

    for i in movies_list:
        index = i[0]
        
        # Define your weights
        popularity_weight = 0.002
        similarity_weight = 1-popularity_weight

        # Calculate the combined score
        combined_score = (similarity_weight * i[1]) + (popularity_weight * movies_df.iloc[index].popularity)

        combined_scores[index] = combined_score

    # Sort the dictionary by combined scores in descending order
    
    sorted_combined_scores = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)

    # Get the top 10 movies based on the combined scores
    movies_final_list = sorted_combined_scores[:10]

    # Create an empty HTML string to accumulate the content
    html_content = ''

    # Add column headers
    headers_html = '<div style="display: flex; align-items: center; font-weight: bold; text-align: center; padding: 10px;">'
    headers_html += '<div style="flex: 1;">#</div>'
    headers_html += '<div style="flex: 2;">Poster</div>'
    headers_html += '<div style="flex: 2;">Movie Title</div>'
    headers_html += '<div style="flex: 8;">Overview</div>'
    headers_html += '<div style="flex: 2;">Cast</div>'
    headers_html += '<div style="flex: 2;">Crew</div>'
    headers_html += '</div>'
    html_content += headers_html

    for counter, i in enumerate(movies_final_list, start=1):
        title = movies_df.iloc[i[0]].title
        overview = ' '.join(movies_df.iloc[i[0]].overview)
        cast = split_names('<br>'.join(movies_df.iloc[i[0]].cast))
        crew = split_names('<br>'.join(movies_df.iloc[i[0]].crew))
        movie_id = movies_df.iloc[i[0]].movie_id

        # Get HTML for the poster image
        poster_html = get_poster_html(movie_id)
        
        # Get clickable link HTML for the movie title
        movie_title_link_html = get_movie_link_html(title)

        # Build the row HTML
        row_html = '<div style="display: flex; align-items: center; text-align: center; padding: 10px;">'
        row_html += f'<div style="flex: 1;">{counter}</div>'
        row_html += f'<div style="flex: 2;">{poster_html}</div>'
        row_html += f'<div style="flex: 2;">{movie_title_link_html}</div>'
        row_html += f'<div style="flex: 8;">{overview}</div>'
        row_html += f'<div style="flex: 2;">{cast}</div>'
        row_html += f'<div style="flex: 2;">{crew}</div>'
        row_html += '</div>'

        # Append the row HTML to the content
        html_content += row_html

    # Display the accumulated HTML content
    display(HTML(html_content))

In [43]:
recommend('Iron Man')