In [68]:
# Importing Pandas and Numpy libraries

import pandas as pd
import numpy as np

In [69]:
# Importing necessary files

df_movies = pd.read_csv(r"C:\Users\rjtg2\OneDrive\Desktop\Rohan\IIT INDORE\GDSC\Inductions\tmdb_5000_movies.csv")
df_credits = pd.read_csv(r"C:\Users\rjtg2\OneDrive\Desktop\Rohan\IIT INDORE\GDSC\Inductions\tmdb_5000_credits.csv")

In [70]:
# Creating dataframe and filtering out unwanted columns (Taking the ones that are required)

df = df_movies.merge(df_credits)
df = df[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast']]

In [71]:
# Checking for null values

df.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
dtype: int64

In [72]:
# Filtering out null values

df = df.dropna()
df.duplicated().sum()

0

In [73]:
# Creating a function (With an empty list) that brings elements of a column into a proper, required format 

import ast
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

# Creating a function (With an empty list) that brings elements of a column into a proper, required format, but only for the first five objects

def convert5(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 5:
            L.append(i['name'])
            counter += 1
        else:
            break
    return L

# Creating a function (With an empty list) that applies 

# def fetch_director(obj):
#     L = []
#     for i in ast.literal_eval(obj):
#         if i['job'] == 'Director':
#             L.append(i['name'])
#             break
#         return L


In [74]:
# Applies the previous functions to the different features

df['genres'] = df['genres'].apply(convert)
df['keywords'] = df['keywords'].apply(convert)
df['cast'] = df['cast'].apply(convert5)
# df['crew'] = df['crew'].apply(fetch_director)

df['overview'] = df['overview'].apply(lambda x: x.split())

In [75]:
# Changing the elements of different columns such that there is no spacing in between words of each feature.

df['genres'] = df['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
df['keywords'] = df['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
df['cast'] = df['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
# df['crew'] = df['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [76]:
# Merging different column elements and creating a new dataframe 

df['tags'] = df['overview'] + df['genres'] + df['keywords'] + df['cast']
df_new = df[['movie_id', 'title', 'tags']]

# 

df_new['tags'] = df_new['tags'].apply(lambda x: " ".join(x))

# Transforms all letters of tags column to lower case 

df_new['tags'] = df_new['tags'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['tags'] = df_new['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['tags'] = df_new['tags'].apply(lambda x: x.lower())


In [77]:
# importing library to reduce words to their base forms

import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [78]:
# Creating a function to convert words to their base forms

def reduce(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [79]:
# Reducing words of the column 'tags' to their base forms

df_new['tags'] = df_new['tags'].apply(reduce)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_new['tags'] = df_new['tags'].apply(reduce)


In [80]:
# Imports a library to create multi-dimensional vectors

from sklearn.feature_extraction.text import CountVectorizer

# Creates a vector with 5000 columns

cv = CountVectorizer(max_features = 5000, stop_words = 'english')

In [81]:
# Creates a two dimmensional matrix representing how many times the 5000 most common words are used each time in a movie description 

vectors = cv.fit_transform(df_new['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [82]:
# Creating an array in a multidimensional space that represents the similarity between two movies by somewhat calculating distance between the two vectors

from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)
similarity[1]

array([0.08111071, 1.        , 0.05892557, ..., 0.02311251, 0.        ,
       0.02512595])

In [83]:
def recommend(movie):
    index = df_new[df_new['title'] == movie].index[0]
    distances = similarity[index]
    movies_list = sorted(list(enumerate(distances)), reverse = True, key = lambda x: x[1])[1:6]

    for i in movies_list:
        print(df_new.iloc[i[0]].title)

In [90]:
import pickle
pickle.dump(df_new, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))