### **Importing relevant libraries.**

In [6]:
# Importing libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### **Loading dataset.**

In [7]:
# Loading data
df=pd.read_csv("movie_descriptions.csv")

#Outputing first 5 rows
df.head()

Unnamed: 0,id,title,genre,desc
0,48477,Burning Man (2007),documentary,Every year during the week of Labor Day Weeke...
1,45172,Questioning Faith: Confessions of a Seminaria...,documentary,"When Macky Alston's fellow seminarian, Alan S..."
2,423,The Young Doctors (1961),drama,Two pathologists -- a veteran department head...
3,9588,Casanova (2015),drama,"Exiled from his beloved Venice, Giacomo Casan..."
4,38150,"""Front of House"" (2014)",comedy,"Rob, Hillary and Liam are three friends whose..."


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      5000 non-null   int64 
 1   title   5000 non-null   object
 2   genre   5000 non-null   object
 3   desc    5000 non-null   object
dtypes: int64(1), object(3)
memory usage: 156.4+ KB


In [10]:
df.describe()

Unnamed: 0,id
count,5000.0
mean,27188.6762
std,15680.129994
min,1.0
25%,13691.25
50%,26974.0
75%,41004.5
max,54198.0


In [11]:
df.columns

Index(['id', 'title', 'genre', 'desc'], dtype='object')

### **Text processing.**

In [14]:
# Use TF-IDF Vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Assuming the 'description' column contains the movie descriptions
df['desc'] = df['desc'].fillna('')  # Fill missing descriptions with empty strings
tfidf_matrix = tfidf.fit_transform(df['desc'])

In [15]:
# Compute the cosine similarity matrix based on TF-IDF vectors
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [16]:
def get_recommendations(title, cosine_sim=cosine_sim):
    # Get the index of the movie that matches the title
    idx = df[df['title'] == title].index[0]
    
    # Get the pairwise similarity scores for all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get the scores of the 5 most similar movies
    sim_scores = sim_scores[1:6]  # Exclude the first movie (itself)
    
    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]
    
    # Return the top 5 most similar movies
    return df['title'].iloc[movie_indices]

In [17]:
recommendations = get_recommendations('The Matrix')
print(recommendations)

IndexError: index 0 is out of bounds for axis 0 with size 0