In [1]:
#Step 1: Import Required Libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
#Step 2: Load the Dataset
#Next, we'll load the movie dataset into a pandas DataFrame:
movies = pd.read_csv('https://raw.githubusercontent.com/rashida048/Some-NLP-Projects/master/movie_dataset.csv')


In [3]:
#Step 3: Data Preprocessing
#Before we can use the dataset to build a recommendation model, we need to preprocess the data. We'll start by dropping any rows with missing values:
movies.dropna(inplace=True)


In [4]:
#Then, we'll create a new column that combines the movie's title, cast, director, and keywords
movies['combined_features'] = movies['title'] + ' ' + movies['cast'] + ' ' + movies['director'] + ' ' + movies['keywords']


In [5]:
#Step 4: Feature Extraction
#We'll use scikit-learn's CountVectorizer to convert the text data into a matrix of token counts:
cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])


In [6]:
#Step 5: Similarity Scores
#Now we'll calculate the cosine similarity between the movies based on the count matrix:
cosine_sim = cosine_similarity(count_matrix)


In [7]:
#Step 6: Recommendation Function
#Finally, we'll create a function that takes a movie title as input and returns the top 10 similar movies:
def recommend(movie):
    index = movies[movies['title'] == movie].index[0]
    distances = cosine_sim[index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]
    recommended_movies = [movies.iloc[i[0]].title for i in movie_list]
    return recommended_movies


In [16]:
#Step 7: Test the Model
#Let's test the model by recommending movies similar to "The Avengers":
print(recommend('The Avengers'))


['Men in Black II', 'Men in Black', 'The Time Machine', 'In Time', 'Hot Tub Time Machine', 'Project Almanac', 'Primer', 'X-Men: Days of Future Past', 'In the Valley of Elah', "Bill & Ted's Excellent Adventure"]
