# Recommendation system entire setup

# In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os


# Load the two TMDB CSV exports. The r'...' prefix makes these raw strings so
# the Windows backslashes stay literal; otherwise use '/' or '\\' as separator.
movies = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_movies.csv')
credits = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_credits.csv')

print(movies.head(2))
print(credits.head(2))
print(movies.shape)
print(credits.shape)

# Combine both tables into one frame, matching rows by their title.
# NOTE(review): a title merge pairs every movies row with every credits row
# that shares the same title, so repeated titles multiply rows. Joining on the
# id columns may be safer -- TODO confirm against the credits schema.
movies = movies.merge(credits, on='title')
print(movies.shape)

print(movies.iloc[0])

print(movies['original_language'].value_counts())

movies.columns

# Keep only the columns that feed the content-based tags.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.head(2)
movies.shape

movies.isnull().sum()


# Drop every row that contains at least one NaN. Rebinding the result (instead
# of dropna(inplace=True) on a column subset) avoids pandas'
# SettingWithCopyWarning, and reset_index(drop=True) renumbers the rows 0..n-1
# so index labels keep matching row positions after rows are removed.
movies = movies.dropna().reset_index(drop=True)
movies.isnull().sum() # number of nulls left per column
movies.shape
movies.duplicated().sum()

movies.iloc[0]['genres']

import ast # to convert string into list

def convert(text):
    """Extract the ``name`` of every entry in a stringified list of dicts.

    Args:
        text: A string holding a Python literal such as
            ``'[{"id": 28, "name": "Action"}]'``. A value that is already a
            list is returned unchanged; any other non-string yields ``[]``.

    Returns:
        The ``name`` values in their original order.
    """
    if isinstance(text, list):
        # Already parsed -- hand it back untouched.
        return text
    if not isinstance(text, str):
        return []

    # literal_eval safely turns the string "[...]" into the actual list object.
    return [entry['name'] for entry in ast.literal_eval(text)]

import ast # to convert string into list

def convert_cast(text, limit=3):
    """Extract the ``name`` of the first *limit* entries of a stringified list.

    Args:
        text: A string holding a Python literal list of dicts, each with a
            ``name`` key. A value that is already a list is returned unchanged
            (it is not truncated); any other non-string yields ``[]``.
        limit: Maximum number of names to keep. Defaults to 3. A value of zero
            or less yields an empty list.

    Returns:
        Up to *limit* ``name`` values, in their original order.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    names = []
    # literal_eval safely turns the string "[...]" into the actual list object.
    for position, member in enumerate(ast.literal_eval(text)):
        if position >= limit:
            break
        names.append(member['name'])
    return names


import ast
# Sanity check: literal_eval turns a stringified list of dicts into real objects.
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')


# Peek at one cast cell before it is converted.
movies.iloc[0]['cast']

# Run the matching parser on every row of each column: genres and keywords
# keep every name, cast keeps only the first three.
for column, parser in (('genres', convert), ('keywords', convert), ('cast', convert_cast)):
    movies[column] = movies[column].apply(parser)

movies.head(2)

movies.iloc[0]['crew']
# fetch only the director, i.e. the crew entry whose "job" is "Director"
# {"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"},

def fetch_director(text):
    """Return the name of the first crew entry whose job is ``'Director'``.

    Args:
        text: A string holding a Python literal list of crew dicts, each with
            ``job`` and ``name`` keys. A value that is already a list is
            returned unchanged; any other non-string yields ``[]``.

    Returns:
        A one-element list with the director's name, or ``[]`` when no entry
        has the job ``'Director'``.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    # literal_eval safely turns the string "[...]" into the actual list object.
    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            # Only the first director is kept, so stop at the first hit.
            return [member['name']]
    return []

# Reduce each crew cell to the director's name (a list with at most one entry).
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head(2)

# Peek at one overview while it is still a single string.
movies.iloc[0]['overview']

# Separate each overview into its whitespace-delimited words, stored as a list.
movies['overview'] = movies['overview'].apply(lambda overview: overview.split())
movies.head(2)


# Example: "Sam Worthington" becomes "SamWorthington"

# `data` is the list stored in one row of the column being processed
def remove_spaces(data):
    """Return a new list with every space removed from each string in *data*.

    Args:
        data: An iterable of strings, e.g. ``["Sam Worthington"]``.

    Returns:
        A list of the same length, e.g. ``["SamWorthington"]``.
    """
    return [item.replace(" ", "") for item in data]

# Strip the spaces inside every name so each multi-word name becomes one token.
for column in ('cast', 'crew', 'keywords', 'genres'):
    movies[column] = movies[column].apply(remove_spaces)

movies.head(2)

# Every one of these columns holds lists, so + concatenates them row by row.
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head()

movies.iloc[0]['tags']

# Work on a copy so the original movies table stays intact.
new_df = movies[['movie_id', 'title', 'tags']].copy()

new_df.head()

# Collapse each list of tokens into one space-separated string ...
new_df['tags'] = new_df['tags'].apply(" ".join)
new_df.head()
new_df.iloc[0]['tags']
# ... and lower-case it.
new_df['tags'] = new_df['tags'].apply(str.lower)
new_df.head()


import nltk
from nltk.stem import PorterStemmer
ps = PorterStemmer()
# Stemming maps inflected words onto a common root:
# dispatched ---> dispatch, following ---> follow , ....


def stem(text):
    """Return *text* with every whitespace-separated word replaced by its stem.

    Args:
        text: A string of words.

    Returns:
        The stemmed words joined back into one string with single spaces.
    """
    # text is a string; text.split() yields its individual words.
    return " ".join(ps.stem(word) for word in text.split())

# Replace every word of each tags string with its stem.
new_df['tags'] = new_df['tags'].apply(stem)

new_df.iloc[0]['tags']


from sklearn.feature_extraction.text import CountVectorizer
# Token-count vectorizer capped at 5000 features, with English stop words removed.
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
# Fit on the tags and convert the result to a dense array with .toarray().
vector = vectorizer.fit_transform(new_df['tags']).toarray()
vector
vector.shape


from sklearn.metrics.pairwise import cosine_similarity
# Pairwise cosine similarity between the rows of `vector`.
similarity = cosine_similarity(vector)
similarity
similarity.shape

# Index label of the first row whose title is exactly 'Spider-Man'.
new_df[new_df['title'] == 'Spider-Man'].index[0]


from difflib import get_close_matches

def reccommend_movie(movie):
    """Print the five titles most similar to the closest match for *movie*.

    Reads the module-level ``new_df`` (for its ``title`` column) and
    ``similarity`` (indexed by row position).

    Args:
        movie: The title to look up. Typos and case differences are tolerated:
            the closest title is chosen with ``difflib.get_close_matches``
            after lower-casing both sides.

    Returns:
        None. The results, or a "no close match" message, are printed.
    """
    titles = new_df['title'].tolist()

    # Map each lower-cased title to the row POSITION of its first occurrence.
    # Positions (not index labels) are what `similarity` is indexed by, and the
    # two differ whenever new_df's index is not a plain 0..n-1 range.
    position_by_key = {}
    for position, title in enumerate(titles):
        position_by_key.setdefault(title.lower(), position)

    # Find the closest match (allowing typos / case differences).
    matches = get_close_matches(movie.lower(), list(position_by_key), n=1, cutoff=0.6)

    if not matches:
        print(f"No close match found for '{movie}'")
        return

    index = position_by_key[matches[0]]
    best_match = titles[index]

    # Rank every other row by similarity, highest first. The query row is
    # excluded by position rather than by assuming it sorts first, which would
    # fail when another row ties with it at the top score.
    scores = similarity[index]
    ranked = sorted(
        (position for position in range(len(scores)) if position != index),
        key=lambda position: scores[position],
        reverse=True,
    )

    print(f"\nResults for: {best_match}\n")
    for position in ranked[:5]:
        print(titles[position])



# reccommend_movie prints its own results and returns None, so call it directly;
# wrapping the call in print() would emit a stray "None" after the results.
reccommend_movie('the drk night risen')


# import pickle

# pickle.dump(new_df, open('pkls/movie_list.pkl', 'wb'))
# pickle.dump(similarity, open('pkls/similarity.pkl', 'wb'))








# --- captured notebook output (not code) ---
#       budget                                             genres  \
# 0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
# 1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
#
#                                        homepage     id  \
# 0                   http://www.avatarmovie.com/  19995   
# 1  http://disney.go.com/disneypictures/pirates/    285   
#
#                                             keywords original_language  \
# 0  [{"id": 1463, "name": "culture clash"}, {"id":...                en   
# 1  [{"id": 270, "name": "ocean"}, {"id": 726, "na...                en   
#
#                              original_title  \
# 0                                    Avatar   
# 1  Pirates of the Caribbean: At World's End   
#
#                                             overview  popularity  \
# 0  In the 22nd century, a paraplegic Marine is di...  150.437577   
# 1  Captain Barbossa, long believed to be dead, ha...  139.082615   
#
#                                 production_companies 
#
# KeyboardInterrupt: 