# 🎬 Movie Recommendation System with EDA

This Jupyter notebook contains Exploratory Data Analysis (EDA) and a content-based movie recommendation system built with TF-IDF and cosine similarity. Before running it, upload your `movies.csv` to `/content/` in Colab, or place it in the notebook's folder when running locally (adjust the path in the load cell if needed).

In [None]:
# 1. Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Use seaborn's white-grid theme for every plot drawn in this notebook.
sns.set_style(style='whitegrid')
print("Libraries imported successfully.")

In [None]:
# 2. Load dataset
# Update the candidate paths if your CSV is located elsewhere.
from pathlib import Path

# Locations tried in order: the Colab upload directory first, then the
# current working directory (the notebook's folder when running locally).
csv_candidates = ['/content/movies.csv', 'movies.csv']  # change if needed
csv_path = next((path for path in csv_candidates if Path(path).is_file()), None)
if csv_path is None:
    # Fail with an actionable message instead of pandas' bare path error.
    raise FileNotFoundError(
        "movies.csv not found. Looked in: " + ", ".join(csv_candidates)
    )

movies_data = pd.read_csv(csv_path)
print("Dataset loaded from:", csv_path)
print("Dataset shape:", movies_data.shape)
display(movies_data.head())

In [None]:
# 3. Basic info and missing values
print('--- DataFrame Info ---')
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in display() would only add a stray "None" to the cell output.
movies_data.info()
print('\n--- Missing values per column ---')
# Per-column count of null entries.
display(movies_data.isnull().sum())

In [None]:
# 4. Missing values heatmap
# Boolean mask: True wherever a cell is null.
missing_mask = movies_data.isnull()
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(missing_mask, cbar=False, cmap='coolwarm', ax=ax)
ax.set_title('Missing Values Heatmap')
plt.show()

In [None]:
# 5. Basic statistics and duplicates
# Number of distinct values in the 'title' column.
unique_title_count = movies_data['title'].nunique()
print('Unique movie titles:', unique_title_count)

# Rows that are exact copies of an earlier row.
duplicate_row_count = movies_data.duplicated().sum()
print('Duplicate rows count:', duplicate_row_count)

# Summary statistics for every column, transposed so each column is a row.
summary_table = movies_data.describe(include='all').T
display(summary_table)

In [None]:
# 6. Top genres and directors (visual)
def _plot_top_values(column, title, xlabel, label, top_n=10):
    """Draw a bar chart of the `top_n` most frequent values in `column`.

    Parameters:
        column: name of the `movies_data` column to count.
        title:  chart title.
        xlabel: x-axis label.
        label:  word used in the "Could not plot ..." message.
        top_n:  how many of the most frequent values to show.

    Prints a message instead of plotting when the column is absent or the
    plot fails; never raises for those cases.
    """
    if column not in movies_data.columns:
        print(f"'{column}' column not found.")
        return

    # Create the figure only once we know there is something to draw, so a
    # missing column does not leave an empty figure behind.
    fig = plt.figure(figsize=(10, 5))
    try:
        # NOTE(review): value_counts() counts whole cell values; if a cell
        # holds several genres, combinations are counted, not single genres.
        movies_data[column].value_counts().head(top_n).plot(kind='bar')
        plt.title(title)
        plt.xlabel(xlabel)
        plt.ylabel('Count')
        plt.show()
    except Exception as e:
        # Best-effort EDA: report the problem and discard the unused figure.
        plt.close(fig)
        print(f'Could not plot {label}:', e)


_plot_top_values('genres', 'Top 10 Most Common Genres', 'Genre', 'genres')
_plot_top_values('director', 'Top 10 Directors with Most Movies', 'Director', 'directors')

In [None]:
# 7. Tagline availability & text length analysis
def _count_items(value, sep=None):
    """Return how many non-empty items `value` splits into on `sep`.

    Missing values (NaN/None) count as 0; without this check `str(NaN)`
    becomes the text 'nan' and would be counted as one item. Blank pieces
    (e.g. from an empty string split on ',') are ignored as well.
    `sep=None` splits on runs of whitespace, like `str.split()`.
    """
    if pd.isna(value):
        return 0
    return sum(1 for item in str(value).split(sep) if item.strip())


if 'tagline' in movies_data.columns:
    # 1 when the tagline holds non-blank text, 0 when it is missing or blank.
    has_tagline = movies_data['tagline'].apply(lambda x: 0 if pd.isna(x) or str(x).strip()=='' else 1)
    plt.figure(figsize=(5,4))
    sns.countplot(x=has_tagline)
    plt.title('Movies with / without Tagline (1=yes, 0=no)')
    plt.show()
else:
    print("No 'tagline' column present.")

# Keywords and cast length (if present)
if 'keywords' in movies_data.columns:
    # Whitespace-separated token count per movie.
    movies_data['keywords_length'] = movies_data['keywords'].apply(_count_items)
else:
    movies_data['keywords_length'] = 0

if 'cast' in movies_data.columns:
    # NOTE(review): assumes cast names are comma-separated - confirm against the CSV.
    movies_data['cast_length'] = movies_data['cast'].apply(lambda x: _count_items(x, ','))
else:
    movies_data['cast_length'] = 0

plt.figure(figsize=(10,4))
sns.histplot(movies_data['cast_length'], bins=20, kde=True)
plt.title('Distribution of Cast Size per Movie')
plt.xlabel('Number of Cast Members')
plt.show()

## EDA Summary

- Record your observations about the dataset shape, missing values, and the most common genres and directors.
- If you ran the cells above, note the printed results (unique title count, columns with missing values, top genres).

Proceed to cleaning and feature preparation for the recommender system.

In [None]:
# 8. Data cleaning & feature preparation
# Text columns that feed the content-based recommender
selected_features = ['genres','keywords','tagline','cast','director']

# Every selected column must exist and must not contain nulls:
# absent columns are created empty, null cells become empty strings.
for feature in selected_features:
    if feature not in movies_data.columns:
        movies_data[feature] = ''
    movies_data[feature] = movies_data[feature].fillna('')

# Build one space-separated text per movie, in the order listed above
combined = movies_data[selected_features[0]].astype(str)
for feature in selected_features[1:]:
    combined = combined + ' ' + movies_data[feature].astype(str)
movies_data['combined_features'] = combined

print('Combined features column added. Example:')
preview = movies_data[['title','combined_features']].head()
display(preview)

In [None]:
# 9. TF-IDF Vectorization and Cosine Similarity
# One document per movie: the combined text built in the previous step.
corpus = movies_data['combined_features']
vectorizer = TfidfVectorizer(stop_words='english')
feature_vectors = vectorizer.fit_transform(corpus)
print('Feature vectors shape:', feature_vectors.shape)

# Pairwise movie-to-movie similarity. The result is a full square matrix,
# so for very big datasets consider sampling or approximate methods.
similarity = cosine_similarity(feature_vectors)
print('Similarity matrix shape:', similarity.shape)

In [None]:
# 10. Recommendation function
def recommend_movies(movie_name, top_n=10):
    '''
    Print the top_n movies most similar to the one named movie_name.

    The name is matched case-insensitively against the 'title' column of
    the global `movies_data` using difflib (closest match, cutoff 0.4).
    Scores come from the row of the global `similarity` matrix at the
    matched movie's position.

    Parameters:
        movie_name: title (or approximate title) to look up.
        top_n: maximum number of recommendations to print; values <= 0
            print only the header line.

    Returns None. Prints a message and returns early when the dataset has
    no 'title' column or no title is close enough to movie_name.
    '''
    if 'title' not in movies_data.columns:
        print("Dataset has no 'title' column.")
        return

    # Lower-cased title -> position of its first occurrence. Non-string
    # titles (e.g. NaN from missing CSV cells) are skipped so that .lower()
    # cannot fail on them.
    first_position = {}
    for position, title in enumerate(movies_data['title']):
        if isinstance(title, str):
            first_position.setdefault(title.lower(), position)

    movie_name = movie_name.lower()
    matches = difflib.get_close_matches(movie_name, list(first_position), n=1, cutoff=0.4)
    if not matches:
        print('No close match found for:', movie_name)
        return
    idx = first_position[matches[0]]

    # Exclude the query movie by position instead of assuming it sorts
    # first: with tied scores (duplicate feature text, or all-zero rows)
    # another movie can take rank 0, and slicing [1:] would then drop that
    # movie and recommend the query movie to itself.
    ranked = sorted(
        ((other, score) for other, score in enumerate(similarity[idx]) if other != idx),
        key=lambda pair: pair[1],
        reverse=True,
    )

    print(f"Movies similar to: {movies_data.iloc[idx]['title']}")
    for i, (movie_idx, score) in enumerate(ranked[:max(top_n, 0)], start=1):
        print(f"{i}. {movies_data.iloc[movie_idx]['title']} (score: {score:.4f})")

In [None]:
# 11. Try the recommender
# Type any title at the prompt; the closest match in the dataset is used.
movie_input = input('Enter your favourite movie name: ')
recommend_movies(movie_name=movie_input, top_n=15)

### Notebook created programmatically

If you'd like modifications (e.g., add Streamlit app cell, save vectors to disk, or include more visualizations), ask me and I will update the notebook.