<a href="https://colab.research.google.com/github/SanjyotAmritkar/Movie_Recommendation_LUMAA/blob/main/recommend_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd

import numpy as np
import nltk
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords

# Download NLTK stopwords (only needed once)
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
# Load the dataset
df = pd.read_csv("/IMDB Top 250 Movies.csv")  # Ensure correct file path

# Display the first few rows
df.head()

#check the names of the columns
print(df.columns)



Index(['rank', 'name', 'year', 'rating', 'genre', 'certificate', 'run_time',
       'tagline', 'budget', 'box_office', 'casts', 'directors', 'writers'],
      dtype='object')


In [20]:
# Fill missing values with empty strings and join tagline and genre
df['description'] = df[['tagline', 'genre']].fillna('').agg(' '.join, axis=1)

# Check the new 'description' column to ensure it has been created correctly
df[['tagline', 'genre', 'description']].head()


Unnamed: 0,tagline,genre,description
0,Fear can hold you prisoner. Hope can set you f...,Drama,Fear can hold you prisoner. Hope can set you f...
1,An offer you can't refuse.,"Crime,Drama","An offer you can't refuse. Crime,Drama"
2,Why So Serious?,"Action,Crime,Drama","Why So Serious? Action,Crime,Drama"
3,All the power on earth can't change destiny.,"Crime,Drama",All the power on earth can't change destiny. C...
4,Life Is In Their Hands -- Death Is On Their Mi...,"Crime,Drama",Life Is In Their Hands -- Death Is On Their Mi...


In [22]:
# Function to clean text
def clean_text(text):
    if pd.isna(text):  # Handle NaN values
        return ""
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = text.strip()  # Remove whitespace
    text = " ".join([word for word in text.split() if word not in stopwords.words('english')])  # Remove stopwords
    return text

# Apply text cleaning to the "Description" column
df['cleaned_description'] = df['description'].apply(clean_text)

# Check cleaned descriptions
df[['name', 'cleaned_description']].head()



Unnamed: 0,name,cleaned_description
0,The Shawshank Redemption,fear hold prisoner hope set free drama
1,The Godfather,offer cant refuse crimedrama
2,The Dark Knight,serious actioncrimedrama
3,The Godfather Part II,power earth cant change destiny crimedrama
4,12 Angry Men,life hands death minds crimedrama


In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TF-IDF vectorizer
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'cleaned_description' column
tfidf_matrix = tfidf.fit_transform(df['cleaned_description'])

# Check the shape of the matrix to verify
print(tfidf_matrix.shape)


(250, 952)


In [24]:
from sklearn.metrics.pairwise import cosine_similarity

# Define a function to recommend movies based on the user's input
def recommend_movies(query, top_n=5):
    query_tfidf = tfidf.transform([clean_text(query)])  # Transform the user input to a TF-IDF vector
    similarity_scores = cosine_similarity(query_tfidf, tfidf_matrix)  # Compute similarity

    # Get the indices of the top N most similar movies
    top_indices = similarity_scores[0].argsort()[-top_n:][::-1]

    # Return the top N movie names with similarity scores
    return df.iloc[top_indices][['name', 'rating']].reset_index(drop=True)

# Test the function
user_query = "I love thrilling action movies set in space, with a comedic twist."
recommendations = recommend_movies(user_query, top_n=5)
print(recommendations)


                       name  rating
0                     Alien     8.5
1  The Shawshank Redemption     9.3
2           Cinema Paradiso     8.5
3        Lawrence of Arabia     8.3
4              Finding Nemo     8.2
