# Loading Dependencies

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import spacy
!python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and print the full path of every file found
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing and Cleaning Dataset

In [None]:
# Location of the movie CSV inside the Kaggle input directory
MOVIE_DATA_CSV = '/kaggle/input/tmdb-15000-movies-dataset-with-credits/movie_data.csv'

# Read the file, treating "\n" as the character that ends each record
df = pd.read_csv(MOVIE_DATA_CSV, lineterminator="\n")

In [None]:
# First rows of the frame
print(df.head())

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
df.info()

# Summary statistics of the numeric columns
print(df.describe())

In [None]:
# Count the missing values in each column and show the totals
missing_per_column = df.isna().sum()
print(missing_per_column)


# Remove rows that are exact duplicates of an earlier row (modifies df in place)
df.drop_duplicates(inplace=True)

In [None]:
# Drop every row that still has a missing value, then renumber the rows
# 0..n-1. The similarity matrices built later are addressed by row position,
# so the index labels must not keep the gaps left by the removed rows.
df = df.dropna().reset_index(drop=True)

# dropna(inplace=True) returns None, so printing its result only showed "None";
# report the remaining number of rows and columns instead.
print(df.shape)

In [None]:
# Drop columns that are not needed for the analysis. errors='ignore' keeps the
# cell re-runnable: on a second run these columns are already gone.
df = df.drop(['Unnamed: 0', 'backdrop_path', 'poster_path', 'video'], axis=1, errors='ignore')

# Convert release_date to datetime; values that cannot be parsed become NaT
df['release_date'] = pd.to_datetime(df['release_date'], errors='coerce')

# Replace missing values with empty strings in every column except
# release_date: filling NaT with '' would turn the freshly converted datetime
# column back into a mixed object column of timestamps and strings.
df = df.fillna({column: '' for column in df.columns if column != 'release_date'})

# TF-IDF Vectorization (with Stop-Word Removal) and Cosine Similarity

In [None]:
# Vectorizer that ignores scikit-learn's built-in English stop-word list
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the overviews and turn each overview into a TF-IDF row
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Pairwise cosine similarity between all overviews
# (with a single argument the matrix is compared against itself)
cosine_sim = cosine_similarity(tfidf_matrix)

In [None]:
def get_recommendations(title, cosine_sim=cosine_sim, df=df):
    """Return the titles of the 10 movies most similar to `title`.

    The default `cosine_sim` and `df` are the objects bound to those global
    names at the moment this cell is run.

    Args:
        title: exact value to look up in df['title']; when several rows share
            it, the first one (in row order) is used.
        cosine_sim: square similarity matrix whose i-th row/column refers to
            the i-th row (by position) of `df`.
        df: DataFrame with a 'title' column, in the same row order that was
            used to build `cosine_sim`.

    Returns:
        pandas Series with up to 10 titles, most similar first.

    Raises:
        IndexError: if no row of `df` has the requested title.
    """
    # Look the movie up by row POSITION, not by index label: cosine_sim is a
    # plain positional matrix, while df.index may keep gaps from rows removed
    # by drop_duplicates()/dropna(), so a label can point at a different row.
    matching_positions = np.flatnonzero((df['title'] == title).to_numpy())
    if matching_positions.size == 0:
        raise IndexError(f"title {title!r} not found in df['title']")
    idx = matching_positions[0]

    # Similarity of the selected movie to every movie, ranked high to low.
    # A stable sort keeps equal scores in their original row order.
    scores = np.asarray(cosine_sim[idx])
    ranked_positions = np.argsort(-scores, kind='stable')

    # Exclude the query movie itself explicitly instead of assuming it is
    # ranked first (another row can tie with it at the top score).
    movie_indices = ranked_positions[ranked_positions != idx][:10]
    return df['title'].iloc[movie_indices]


# Recommendation

In [None]:
# Top-10 titles most similar to 'The Dark Knight', using the default
# cosine_sim matrix and df bound in get_recommendations' signature
get_recommendations('The Dark Knight')

# Improving Recommendation

In [None]:
# spaCy is already imported and the `en_core_web_md` pipeline already loaded
# into `nlp` by the dependencies cell at the top of the notebook, so it is
# reused here instead of being imported and loaded a second time.

def _join_content_lemmas(doc):
    """Return the space-joined lemmas of a parsed doc's content tokens.

    Tokens flagged as stop words (`is_stop`) or that are not purely
    alphabetic (`is_alpha` is false) are left out.
    """
    return ' '.join(token.lemma_ for token in doc if not token.is_stop and token.is_alpha)


# Define a function to preprocess text data using spaCy
def preprocess(text):
    """Lemmatize one string with spaCy, dropping stop words and non-alphabetic tokens.

    Args:
        text: raw text to process.

    Returns:
        A single string of lemmas separated by spaces.
    """
    return _join_content_lemmas(nlp(text))


# Preprocess the overview column in batches. nlp.pipe() streams the texts
# through the pipeline instead of calling nlp() once per row; the parser and
# NER components are switched off because only lemmas and the lexical
# is_stop / is_alpha flags are read.
df['overview_processed'] = [
    _join_content_lemmas(doc)
    for doc in nlp.pipe(df['overview'], disable=['parser', 'ner'])
]

# Second vectorizer, fitted on the lemmatized overviews instead of the raw text
tfidf_processed = TfidfVectorizer(stop_words='english')
tfidf_matrix_processed = tfidf_processed.fit_transform(df['overview_processed'])

# Pairwise cosine similarity between all lemmatized overviews
# (with a single argument the matrix is compared against itself)
cosine_sim_processed = cosine_similarity(tfidf_matrix_processed)

# Recommendations for the same movie, now ranked with the new similarity matrix
get_recommendations('The Dark Knight', cosine_sim=cosine_sim_processed, df=df)