# Content-based filtering movie recommendation system

## Importing Packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os

In [None]:
movies = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_movies.csv') # 'r' for raw string, o/w use / or \\
credits = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_credits.csv')

In [None]:
movies.head(2)

In [None]:
credits.head(2)

In [None]:
movies.shape

In [None]:
credits.shape

### Merge movie, credits into one dataframe

In [None]:
# Join on the numeric movie id instead of the title: titles are not guaranteed
# to be unique, and a title join pairs every film with the credits of each
# namesake, producing extra, mismatched rows.
# The credits copy of `title` is dropped so a single unsuffixed `title` column
# survives the merge alongside `movie_id`.
# NOTE(review): assumes the movies CSV exposes its id column as `id` — confirm.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)
print(movies.shape)
movies.head(2)

In [None]:
print(movies.iloc[0])

In [None]:
movies.iloc[0]['genres']

In [None]:
movies['original_language'].value_counts()

## Let's choose the columns

In [None]:
movies.columns

In [None]:
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']] 
movies.head(2)

In [None]:
movies.shape

## Overview & data cleaning

In [None]:
movies.isnull().sum()

In [None]:
# Drop every row that contains at least one NaN, then renumber the rows 0..n-1.
# Without the renumbering the index keeps gaps where rows were removed, and the
# recommender later uses index labels as row positions into the similarity matrix.
movies = movies.dropna().reset_index(drop=True)

In [None]:
movies.isnull().sum() # no of nulls

In [None]:
movies.shape

In [None]:
movies.duplicated().sum()

In [None]:
movies.iloc[0]['genres']

In [None]:
import ast # to convert string into list

def convert(text):
    """Extract the ``name`` of every record in a stringified list of dicts.

    A value that is already a list is returned untouched; any other
    non-string value yields an empty list.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    # literal_eval parses the text as a Python literal without executing code,
    # e.g. '[{"id": 28, "name": "Action"}]' -> [{"id": 28, "name": "Action"}]
    return [record['name'] for record in ast.literal_eval(text)]

In [None]:
import ast # to convert string into list

def convert_cast(text):
    """Return the names of the first three entries of a stringified cast list.

    A value that is already a list is returned untouched (it is not
    truncated); any other non-string value yields an empty list.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    top_billed = []
    for position, member in enumerate(ast.literal_eval(text)):
        if position == 3:  # only the first three entries are kept
            break
        top_billed.append(member['name'])
    return top_billed

In [None]:
import ast
ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')

In [None]:
movies['genres'] = movies['genres'].apply(convert) # runs the convert function on every row in the genres column

In [None]:
movies.head(2)

In [None]:
movies.iloc[0]['keywords']

In [None]:
movies.iloc[0]['cast']

In [None]:
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_cast)

movies.head(2)

In [None]:
movies.iloc[0]['crew']

In [None]:
# fetch only the crew member whose job is "Director"
# {"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"},

def fetch_director(text):
    """Return a one-element list holding the first crew entry whose job is 'Director'.

    A value that is already a list is returned untouched; any other
    non-string value, or a crew list without a director, yields ``[]``.
    """
    if isinstance(text, list):
        return text
    if not isinstance(text, str):
        return []

    for member in ast.literal_eval(text):
        if member['job'] == 'Director':
            # Stop at the first director found
            return [member['name']]
    return []

In [None]:
movies['crew'] = movies['crew'].apply(fetch_director)
movies.head(2)

In [None]:
movies.iloc[0]['overview']

In [None]:
movies['overview'] = movies['overview'].apply(lambda x : x.split()) # separate each word and put the words into a list
movies.head(2)

In [None]:
# Sam Worthington
# SamWorthington

# data is the list stored in one row of the column being processed
def remove_spaces(data):
    """Return a new list with every space removed from each string in ``data``."""
    return [entry.replace(" ", "") for entry in data]

In [None]:
movies['cast'] = movies['cast'].apply(remove_spaces)
movies['crew'] = movies['crew'].apply(remove_spaces)
movies['keywords'] = movies['keywords'].apply(remove_spaces)
movies['genres'] = movies['genres'].apply(remove_spaces)

In [None]:
movies.head(2)

## Creating the new `tags` column

In [None]:
# all those columns are lists
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
movies.head()

In [None]:
movies.iloc[0]['tags']

In [None]:
new_df = movies[['movie_id', 'title', 'tags']].copy() # Keep original movies table intact

In [None]:
new_df.head()

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x : " ".join(x))

In [None]:
new_df.head()

In [None]:
new_df.iloc[0]['tags']

In [None]:
new_df['tags'] = new_df['tags'].apply(lambda x : x.lower())
new_df.head()

## What is Stemming?
    Reduce words to their root/base form.
    It’s a rule-based chopping method (not always linguistically correct).

    Stemming :
        Fast, rule-based, may produce non-words.
        societies → societi

    Lemmatization (better for NLP):
        Uses vocabulary + grammar rules.
        Produces real words.
        societies → society

In [None]:
import nltk
from nltk.stem import PorterStemmer

In [None]:
ps = PorterStemmer()

In [None]:
# 
# dispatched ---> dispatch, following ---> follow , ....


def stem(text):
    """Stem every whitespace-separated word of ``text`` and re-join with single spaces."""
    # `ps` is the PorterStemmer instance created in an earlier cell
    return " ".join(ps.stem(word) for word in text.split())

In [None]:
new_df['tags'] = new_df['tags'].apply(stem)

In [None]:
new_df.iloc[0]['tags']

## Scikit learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
vector = vectorizer.fit_transform(new_df['tags']).toarray()

In [None]:
vector

In [None]:
vector.shape

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vector)

In [None]:
similarity

In [None]:
similarity.shape

In [None]:
new_df[new_df['title'] == 'Spider-Man'].index[0]

In [None]:
from difflib import get_close_matches

def reccommend_movie(movie, top_n=5):
    """Print the titles most similar to the catalogue entry closest to ``movie``.

    Args:
        movie: Free-text title. It is fuzzy-matched (case-insensitively)
            against ``new_df['title']`` so typos are tolerated.
        top_n: How many recommendations to print (defaults to 5).

    Returns:
        None. The recommendations, or a "no match" message, are printed.
    """
    titles = new_df['title'].tolist()

    # Compare lower-cased text so a difference in capitalisation is not
    # scored as a typo by the fuzzy matcher.
    lowered = [title.lower() for title in titles]
    matches = get_close_matches(movie.lower(), lowered, n=1, cutoff=0.6)

    if not matches:
        print(f"No close match found for '{movie}'")
        return

    # Use the row *position*, not the DataFrame index label: `similarity` is a
    # plain array built from the rows of `new_df` in order, and the labels
    # stop matching positions once rows have been dropped.
    index = lowered.index(matches[0])
    best_match = titles[index]

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded by position rather than by assuming it sorts first, which is
    # not guaranteed when another row ties with it.
    scores = similarity[index]
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != index),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    print(f"\nResults for: {best_match}\n")
    for pos in ranked[:top_n]:
        print(titles[pos])

In [None]:
reccommend_movie('the drk night risen')

In [None]:
import pickle

# Create the output folder if needed, then write each artifact through a
# context manager so the file handle is flushed and closed even on error.
os.makedirs('pkls', exist_ok=True)
with open('pkls/movie_list.pkl', 'wb') as movie_file:
    pickle.dump(new_df, movie_file)
with open('pkls/similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)

## Consolidated code

In [82]:
# ===============================
# Movie Recommendation System
# ===============================

import os
import ast
import pickle
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from difflib import get_close_matches
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# ===============================
# Load Data
# ===============================
movies = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_movies.csv')
credits = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_credits.csv')

# Merge datasets on the movie id rather than the title: titles are not
# guaranteed to be unique, and a title join pairs every film with the credits
# of each namesake. The credits copy of `title` is dropped so only one
# unsuffixed `title` column remains.
# NOTE(review): assumes the movies CSV exposes its id column as `id` — confirm.
movies = movies.merge(
    credits.drop(columns='title'),
    left_on='id',
    right_on='movie_id',
)

# Keep only useful columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values and renumber the rows 0..n-1, so that index
# labels coincide with row positions in the similarity matrix built below.
movies = movies.dropna().reset_index(drop=True)


# ===============================
# Data Cleaning Functions
# ===============================
def convert(text):
    """Return the ``name`` of each record in a stringified list of dicts.

    Any non-string input (including an already-parsed list) yields ``[]``.
    """
    if isinstance(text, str):
        records = ast.literal_eval(text)
        return [record['name'] for record in records]
    return []


def convert_cast(text):
    """Return the names of the first three entries of a stringified cast list.

    Any non-string input yields ``[]``.
    """
    if isinstance(text, str):
        members = ast.literal_eval(text)
        # zip against range(3) stops after at most three entries
        return [member['name'] for _, member in zip(range(3), members)]
    return []


def fetch_director(text):
    """Return ``[name]`` for the first crew entry whose job is 'Director'.

    Yields ``[]`` when the input is not a string or no director is listed.
    """
    director = []
    if isinstance(text, str):
        for member in ast.literal_eval(text):
            if member['job'] == 'Director':
                director.append(member['name'])
                break  # only the first director is kept
    return director


def remove_spaces(names):
    """Strip every space from each string, e.g. "Sam Worthington" -> "SamWorthington"."""
    squeezed = []
    for entry in names:
        squeezed.append(entry.replace(" ", ""))
    return squeezed


# ===============================
# Apply Cleaning
# ===============================
# Parse each stringified column into a plain list of names
for col, parser in [
    ('genres', convert),
    ('keywords', convert),
    ('cast', convert_cast),
    ('crew', fetch_director),
]:
    movies[col] = movies[col].apply(parser)

# Break the overview text into a list of whitespace-separated words
movies['overview'] = movies['overview'].apply(lambda synopsis: synopsis.split())

# Remove spaces in multi-word names
for col in ['cast', 'crew', 'keywords', 'genres']:
    movies[col] = movies[col].apply(remove_spaces)

# Create tags column: every source column holds a list, so `+` concatenates them
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# New dataframe with essential info; tags become one lowercase string per movie
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda words: " ".join(words).lower())


# ===============================
# Text Preprocessing (Stemming)
# ===============================
ps = PorterStemmer()

def stem(text):
    """Stem each whitespace-separated word of ``text`` and join them back with spaces."""
    stemmed = []
    for word in text.split():
        stemmed.append(ps.stem(word))
    return " ".join(stemmed)

new_df['tags'] = new_df['tags'].apply(stem)


# ===============================
# Vectorization & Similarity
# ===============================
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
vector = vectorizer.fit_transform(new_df['tags']).toarray()

similarity = cosine_similarity(vector)


# ===============================
# Recommendation Function
# ===============================
def recommend_movie(movie, top_n=5):
    """Print the titles most similar to the catalogue entry closest to ``movie``.

    Args:
        movie: Free-text title. It is fuzzy-matched (case-insensitively)
            against ``new_df['title']`` so typos are tolerated.
        top_n: How many recommendations to print (defaults to 5).

    Returns:
        None. The recommendations, or a "no match" message, are printed.
    """
    titles = new_df['title'].tolist()

    # Compare lower-cased text so a difference in capitalisation is not
    # scored as a typo by the fuzzy matcher.
    lowered = [title.lower() for title in titles]
    matches = get_close_matches(movie.lower(), lowered, n=1, cutoff=0.6)

    if not matches:
        print(f"No close match found for '{movie}'")
        return

    # Use the row *position*, not the DataFrame index label: `similarity` is a
    # plain array built from the rows of `new_df` in order, and the labels
    # stop matching positions once rows have been dropped.
    index = lowered.index(matches[0])
    best_match = titles[index]

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded by position rather than by assuming it sorts first, which is
    # not guaranteed when another row ties with it.
    scores = similarity[index]
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != index),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    print(f"\nRecommendations for: {best_match}\n")
    for pos in ranked[:top_n]:
        print(titles[pos])


# ===============================
# Example Run
# ===============================
recommend_movie('the drk night risen')


# ===============================
# Save Model
# ===============================
# os.makedirs('pkls', exist_ok=True)
# pickle.dump(new_df, open('pkls/movie_list.pkl', 'wb'))
# pickle.dump(similarity, open('pkls/similarity.pkl', 'wb'))



Recommendations for: The Dark Knight Rises

The Dark Knight
Batman Returns
Batman
Batman Forever
Batman Begins


In [84]:
recommend_movie('spidr man')


Recommendations for: Spider-Man

Spider-Man 3
Spider-Man 2
The Amazing Spider-Man 2
Arachnophobia
Kick-Ass


## ===============================

## Movie Recommendation System

## ===============================

```python
import os
import ast
import pickle
import numpy as np
import pandas as pd
from nltk.stem import PorterStemmer
from difflib import get_close_matches
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
```

**Explanation:**

* `os` → used to interact with the file system, e.g., create folders (`os.makedirs`).
* `ast` → safely converts strings that look like Python lists/dictionaries into actual Python objects.

  * Example: `'[{"id": 28, "name": "Action"}]'` → `[{"id":28,"name":"Action"}]`
* `pickle` → used to save Python objects to a file and load them later (like your dataframe and similarity matrix).
* `numpy` → used for numerical operations (arrays, matrices).
* `pandas` → used to handle tabular data (dataframes).
* `PorterStemmer` → reduces words to their root form: `"running"` → `"run"`, `"loved"` → `"love"`. Helps matching similar words.
* `get_close_matches` → finds closest strings from a list (helps with typos in movie names).
* `CountVectorizer` → converts text into a numerical matrix (count of words).
* `cosine_similarity` → measures similarity between vectors (used to find similar movies).

---

## ===============================

## Load Data

## ===============================

```python
movies = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_movies.csv')
credits = pd.read_csv(r'D:\Ai Projects\Movie Rec System - Content based\datasets\tmdb_5000_credits.csv')
```

**Explanation:**

* Reads CSV files containing **movies information** and **credits (cast/crew)** into pandas dataframes.
* `r` before the string means *raw string*, so Windows paths don’t need double `\\`.

```python
movies = movies.merge(credits, on='title')
```

* Combines the two datasets on the **title** column so each row has movie info + credits.

```python
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.dropna(inplace=True)
```

* Keeps only the useful columns for recommendation.
* Removes rows with missing data (`dropna`).

---

## ===============================

## Data Cleaning Functions

## ===============================

```python
def convert(text):
    """Convert JSON string into list of names."""
    if not isinstance(text, str):
        return []
    return [i['name'] for i in ast.literal_eval(text)]
```

* Takes columns like `genres` or `keywords` which are **strings that look like lists of dictionaries**, and converts them into a Python list of names.
* Example: `'[{"id":28,"name":"Action"}]'` → `["Action"]`.

```python
def convert_cast(text):
    """Keep top 3 cast members."""
    if not isinstance(text, str):
        return []
    cast = []
    for idx, i in enumerate(ast.literal_eval(text)):
        if idx >= 3:
            break
        cast.append(i['name'])
    return cast
```

* Keeps only the first **3 actors** from the cast (to reduce noise).

```python
def fetch_director(text):
    """Fetch director from crew data."""
    if not isinstance(text, str):
        return []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return [i['name']]
    return []
```

* From `crew`, finds **only the director**.

```python
def remove_spaces(names):
    """Remove spaces from names (e.g., Sam Worthington → SamWorthington)."""
    return [name.replace(" ", "") for name in names]
```

* Converts `"Sam Worthington"` → `"SamWorthington"` so multi-word names don’t break token matching later.

---

## ===============================

## Apply Cleaning

## ===============================

```python
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(fetch_director)
```

* Applies all the cleaning functions on the respective columns.
* Each row now has clean **lists** of words or names.

```python
movies['overview'] = movies['overview'].apply(lambda x: x.split())
```

* Splits the movie overview into a **list of words** instead of one big string.
* Example: `"A hero saves the world"` → `["A", "hero", "saves", "the", "world"]`.

```python
for col in ['cast', 'crew', 'keywords', 'genres']:
    movies[col] = movies[col].apply(remove_spaces)
```

* Ensures all multi-word names in these columns have no spaces.

```python
movies['tags'] = (
    movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
)
```

* Combines all features (`overview`, `genres`, `keywords`, `cast`, `crew`) into **one big list called `tags`**.
* This will be the main data we use for similarity.

```python
new_df = movies[['movie_id', 'title', 'tags']].copy()
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())
```

* Converts the lists of words into a **single lowercase string**.
* Example: `["hero", "Action", "SamWorthington"]` → `"hero action samworthington"`

---

## ===============================

## Text Preprocessing (Stemming)

## ===============================

```python
ps = PorterStemmer()

def stem(text):
    return " ".join(ps.stem(word) for word in text.split())

new_df['tags'] = new_df['tags'].apply(stem)
```

* Stemming reduces words to root forms so similar words match.
* Example: `"loved loving love"` → `"love love love"`

---

## ===============================

## Vectorization & Similarity

## ===============================

```python
vectorizer = CountVectorizer(max_features=5000, stop_words='english')
vector = vectorizer.fit_transform(new_df['tags']).toarray()
```

* Converts each movie’s `tags` into a **numerical vector** based on word counts.
* `max_features=5000` → only the 5000 most frequent words are kept as features.
* `stop_words='english'` → ignores common words like "the", "is", etc.

```python
similarity = cosine_similarity(vector)
```

* Computes similarity between all movies using **cosine similarity** (values 0-1).
* Example: if Movie A and Movie B share many words in `tags`, similarity ~1; if unrelated, ~0.

---

## ===============================

## Recommendation Function

## ===============================

```python
def recommend_movie(movie):
    titles = new_df['title'].tolist()
    matches = get_close_matches(movie, titles, n=1, cutoff=0.6)
```

* Converts all movie titles into a list.
* Uses `get_close_matches` to **handle typos**, e.g., `"Spider-Men"` → `"Spider-Man"`.

```python
    if not matches:
        print(f"No close match found for '{movie}'")
        return
```

* If no match is found, it prints a message and stops.

```python
    best_match = matches[0]
    index = new_df[new_df['title'] == best_match].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
```

* Finds the index of the best matching movie.
* Sorts all other movies by similarity in **descending order**.

```python
    print(f"\nRecommendations for: {best_match}\n")
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)
```

* Prints the **top 5 recommended movies**.
* `distances[1:6]` → skip the first one because it’s the movie itself.

---

## ===============================

## Example Run

## ===============================

```python
recommend_movie('the drk night risen')
```

* Tests your system with a typo in the movie name.
* Returns closest match + top 5 recommendations.

---

## ===============================

## Save Model

## ===============================

```python
os.makedirs('pkls', exist_ok=True)
pickle.dump(new_df, open('pkls/movie_list.pkl', 'wb'))
pickle.dump(similarity, open('pkls/similarity.pkl', 'wb'))
```

* Creates folder `pkls` if it doesn’t exist.
* Saves `new_df` and `similarity` matrix as `.pkl` files.
* Later, you can **load these files** and serve recommendations without rebuilding everything.

---

✅ That’s a full **line-by-line explanation**.

* The pipeline is: **Load → Clean → Combine Features → Preprocess → Vectorize → Compute Similarity → Recommend → Save**
* Each step is **modular**, so you can later add more features (like director rating, release year, etc.).

---


## Actual purpose of the pkl files: how to serve recommendations

In [85]:
# ===============================
# Load saved recommender
# ===============================

import pickle
from difflib import get_close_matches

# Step 1: Load the saved files
# Read each artifact through a context manager so the file handle is closed.
# NOTE: unpickling can execute arbitrary code — only load files that this
# project wrote itself, never pickles from an untrusted source.
with open('pkls/movie_list.pkl', 'rb') as movie_file:
    new_df = pickle.load(movie_file)
with open('pkls/similarity.pkl', 'rb') as similarity_file:
    similarity = pickle.load(similarity_file)

# Step 2: Define the recommendation function
def recommend_movie(movie, top_n=5):
    """Print the titles most similar to the catalogue entry closest to ``movie``.

    Args:
        movie: Free-text title. It is fuzzy-matched (case-insensitively)
            against ``new_df['title']`` so typos are tolerated.
        top_n: How many recommendations to print (defaults to 5).

    Returns:
        None. The recommendations, or a "no match" message, are printed.
    """
    titles = new_df['title'].tolist()

    # Handle typos; compare lower-cased text so a difference in
    # capitalisation is not scored as a typo.
    lowered = [title.lower() for title in titles]
    matches = get_close_matches(movie.lower(), lowered, n=1, cutoff=0.6)
    if not matches:
        print(f"No close match found for '{movie}'")
        return

    # `similarity` is addressed by row position, whereas `.index[0]` would
    # return an index label; the two differ whenever the loaded frame's index
    # is not exactly 0..n-1, so the position in the title list is used instead.
    index = lowered.index(matches[0])
    best_match = titles[index]

    # Rank every other movie by similarity, highest first. The query movie is
    # excluded by position rather than by assuming it sorts first.
    scores = similarity[index]
    ranked = sorted(
        (pos for pos in range(len(scores)) if pos != index),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    print(f"\nRecommendations for: {best_match}")
    for pos in ranked[:top_n]:  # top recommendations
        print("-", titles[pos])

# Step 3: Make a prediction / get recommendations
recommend_movie("the drk night risen")  # Handles typos



Recommendations for: The Dark Knight Rises
- The Dark Knight
- Batman Returns
- Batman
- Batman Forever
- Batman Begins


In [88]:
recommend_movie('spidr man')


Recommendations for: Spider-Man
- Spider-Man 3
- Spider-Man 2
- The Amazing Spider-Man 2
- Arachnophobia
- Kick-Ass


# Vectorization & Similarity