In [3]:
# Importing Libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
warnings.simplefilter(action='ignore', category=FutureWarning)

In [4]:
data = pd.read_csv(r"C:\Users\revan\Downloads\disney_movies.csv")

In [5]:
# Display the first few rows to understand its structure
data.head()

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
0,Snow White and the Seven Dwarfs,1937-12-21,Musical,G,184925485,5228953251
1,Pinocchio,1940-02-09,Adventure,G,84300000,2188229052
2,Fantasia,1940-11-13,Musical,G,83320000,2187090808
3,Song of the South,1946-11-12,Adventure,G,65000000,1078510579
4,Cinderella,1950-02-15,Drama,G,85000000,920608730


In [102]:
data.tail()

Unnamed: 0,movie_title,release_date,genre,mpaa_rating,total_gross,inflation_adjusted_gross
574,The Light Between Oceans,2016-09-02,Drama,PG-13,12545979,12545979
575,Queen of Katwe,2016-09-23,Drama,PG,8874389,8874389
576,Doctor Strange,2016-11-04,Adventure,PG-13,232532923,232532923
577,Moana,2016-11-23,Adventure,PG,246082029,246082029
578,Rogue One: A Star Wars Story,2016-12-16,Adventure,PG-13,529483936,529483936


In [104]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   movie_title               579 non-null    object
 1   release_date              579 non-null    object
 2   genre                     562 non-null    object
 3   mpaa_rating               523 non-null    object
 4   total_gross               579 non-null    int64 
 5   inflation_adjusted_gross  579 non-null    int64 
dtypes: int64(2), object(4)
memory usage: 27.3+ KB


In [9]:
data.shape

(579, 6)

In [11]:
data.describe()

Unnamed: 0,total_gross,inflation_adjusted_gross
count,579.0,579.0
mean,64701790.0,118762500.0
std,93013010.0,286085300.0
min,0.0,0.0
25%,12788860.0,22741230.0
50%,30702450.0,55159780.0
75%,75709030.0,119202000.0
max,936662200.0,5228953000.0


In [13]:
data.columns

Index(['movie_title', 'release_date', 'genre', 'mpaa_rating', 'total_gross',
       'inflation_adjusted_gross'],
      dtype='object')

In [15]:
data["release_date"].value_counts()

release_date
1997-12-25    3
1998-12-25    2
2013-11-22    2
2000-11-22    2
2002-01-01    2
             ..
1994-09-02    1
1994-08-19    1
1994-08-12    1
1994-07-15    1
2016-12-16    1
Name: count, Length: 553, dtype: int64

In [106]:
# Preprocessing: Handle missing values and format the data
data['genre'] = data['genre'].fillna('Unknown')
data['mpaa_rating'] = data['mpaa_rating'].fillna('Not Rated')
data['release_date'] = pd.to_datetime(data['release_date'])

In [108]:
# Create a combined feature for recommendations
data['combined_features'] = (
   data['genre'] + " " + data['mpaa_rating']
)

In [110]:

# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())


Missing Values:
movie_title                 0
release_date                0
genre                       0
mpaa_rating                 0
total_gross                 0
inflation_adjusted_gross    0
combined_features           0
dtype: int64


In [112]:
# Data summary
print("\nDataset Information:")
print(data.info())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 579 entries, 0 to 578
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype         
---  ------                    --------------  -----         
 0   movie_title               579 non-null    object        
 1   release_date              579 non-null    datetime64[ns]
 2   genre                     579 non-null    object        
 3   mpaa_rating               579 non-null    object        
 4   total_gross               579 non-null    int64         
 5   inflation_adjusted_gross  579 non-null    int64         
 6   combined_features         579 non-null    object        
dtypes: datetime64[ns](1), int64(2), object(4)
memory usage: 31.8+ KB
None


In [114]:
# Preprocessing
# Handle missing values in 'genre' and 'mpaa_rating'
data['genre'] = data['genre'].fillna('Unknown')
data['mpaa_rating'] = data['mpaa_rating'].fillna('Not Rated')

In [116]:
# Create a combined features column
data['combined_features'] = data['genre'] + " " + data['mpaa_rating']

In [118]:
# Display the first few rows of the modified dataset
print("\nModified Dataset Preview:")
print(data[['movie_title', 'combined_features']].head())


Modified Dataset Preview:
                       movie_title combined_features
0  Snow White and the Seven Dwarfs         Musical G
1                        Pinocchio       Adventure G
2                         Fantasia         Musical G
3                Song of the South       Adventure G
4                       Cinderella           Drama G


In [120]:
# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(stop_words='english')

In [122]:
# Check the columns in the DataFrame
print("Available Columns in DataFrame:")
print(data.columns)

# Ensure combined_features column is created correctly
if 'combined_features' not in data.columns:
    print("\nCreating the 'combined_features' column...")
    data['combined_features'] = data['genre'] + " " + data['mpaa_rating']
    print("Column 'combined_features' created successfully!")

# Verify the creation of combined_features
print("\nPreview of combined_features:")
print(data[['movie_title', 'combined_features']].head())


Available Columns in DataFrame:
Index(['movie_title', 'release_date', 'genre', 'mpaa_rating', 'total_gross',
       'inflation_adjusted_gross', 'combined_features'],
      dtype='object')

Preview of combined_features:
                       movie_title combined_features
0  Snow White and the Seven Dwarfs         Musical G
1                        Pinocchio       Adventure G
2                         Fantasia         Musical G
3                Song of the South       Adventure G
4                       Cinderella           Drama G


In [124]:
# Handle missing values for 'genre' and 'mpaa_rating'
data['genre'] = data['genre'].fillna('Unknown')
data['mpaa_rating'] = data['mpaa_rating'].fillna('Not Rated')

# Recreate combined_features column
data['combined_features'] = data['genre'] + " " + data['mpaa_rating']

# Check for NaN values in combined_features
print("\nChecking for missing values in 'combined_features':")
print(data['combined_features'].isnull().sum())

# Fill any remaining NaN values in combined_features (safety step)
data['combined_features'] = data['combined_features'].fillna('')

# Apply TF-IDF vectorization
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(data['combined_features'])

print("\nTF-IDF Matrix created successfully!")
print("Matrix shape:", tfidf_matrix.shape)



Checking for missing values in 'combined_features':
0

TF-IDF Matrix created successfully!
Matrix shape: (579, 18)


In [126]:
# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [128]:
# Define a function for movie recommendations
def recommend_movies(title, cosine_sim=cosine_sim, data=data, top_n=5):
    # Map movie titles to indices
    indices = pd.Series(data.index, index=data['movie_title']).drop_duplicates()

In [130]:
def find_movie_index(title, indices):
    # Get the index of the movie that matches the title
    idx = indices.get(title, None)
    if idx is None:
        return f"Movie '{title}' not found in the dataset."
    else:
        return f"Movie '{title}' is found at index {idx}."

In [132]:
def get_movie_recommendations(title, indices, cosine_sim, data):
    # Get the index of the movie that matches the title
    idx = indices.get(title, None)
    if idx is None:
        return f"Movie '{title}' not found in the dataset."
        # Get the similarity scores of all movies with the given movie
    sim_scores = list(enumerate(cosine_sim[idx]))

    # Sort the movies based on similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the indices of the 10 most similar movies (excluding the first one, which is the movie itself)
    movie_indices = [i[0] for i in sim_scores[1:11]]

    # Return the titles of the 10 most similar movies
    return data.iloc[movie_indices]['title'].tolist()

In [134]:
# Example: Recommend movies similar to "Cinderella"
movie_to_search = "Cinderella"
recommendations = recommend_movies(movie_to_search)
print(f"\nRecommendations for '{movie_to_search}':")
print(recommendations)


Recommendations for 'Cinderella':
None


In [136]:
# Feature Importances (Not applicable here but added to mimic format)
print("\nFeature Importances:")
print("This concept does not directly apply to recommendation systems like content-based filtering.")


Feature Importances:
This concept does not directly apply to recommendation systems like content-based filtering.
