In [5]:
import difflib
from typing import Optional

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def load_data(filepath: str) -> pd.DataFrame:
    """Read the movies CSV into a DataFrame.

    Args:
        filepath: Location of the CSV file, passed straight to
            ``pandas.read_csv`` with its default options.

    Returns:
        The parsed table, exactly as ``pandas.read_csv`` produces it.
    """
    movies = pd.read_csv(filepath)
    return movies


def preprocess_data(df: pd.DataFrame, selected_features: list) -> pd.DataFrame:
    """Fill nulls and combine the selected features into one text column.

    Each selected column has its missing values replaced by an empty string,
    then the columns are joined row by row, separated by single spaces, into
    a new ``combined_features`` column.

    Args:
        df: Table holding the feature columns. It is modified in place: the
            selected columns are overwritten with their null-filled versions
            and ``combined_features`` is added.
        selected_features: Names of the columns to combine, in join order.

    Returns:
        The same ``df`` object that was passed in, for call chaining.

    Raises:
        KeyError: If a selected column is not present in ``df``.
    """
    # A tuple passed to df[...] would be read as a single column key.
    columns = list(selected_features)

    for feature in columns:
        df[feature] = df[feature].fillna('')

    # str.join only accepts strings, so stringify first; this keeps numeric
    # features (e.g. a release year) from raising TypeError during the join.
    text_columns = df[columns].astype(str)
    df['combined_features'] = text_columns.agg(' '.join, axis=1)
    return df


def compute_similarity_matrix(combined_text: pd.Series) -> np.ndarray:
    """Build the pairwise cosine-similarity matrix of the given documents.

    Args:
        combined_text: One text document per entry.

    Returns:
        The result of ``cosine_similarity`` applied to the TF-IDF vectors of
        the documents: entry ``[i, j]`` scores document ``i`` against
        document ``j``.
    """
    # Vectorise with a default-configured TF-IDF model, then compare every
    # document vector against every other one.
    tfidf_vectors = TfidfVectorizer().fit_transform(combined_text)
    return cosine_similarity(tfidf_vectors)


def find_closest_title(input_title: str, title_list: list) -> Optional[str]:
    """Find the dataset title that best matches the user's input.

    Matching is done with ``difflib.get_close_matches`` using its default
    similarity cutoff.

    Args:
        input_title: Title as typed by the user.
        title_list: Candidate titles. Entries that are not strings (for
            example NaN or None from missing values) are ignored.

    Returns:
        The best-matching title, or ``None`` when no candidate is close
        enough.
    """
    # difflib compares sequences, so a NaN/None candidate would raise
    # TypeError; drop anything that is not a string before matching.
    candidates = [title for title in title_list if isinstance(title, str)]

    # Only the single best match is used, so ask difflib for just that one.
    matches = difflib.get_close_matches(input_title, candidates, n=1)
    return matches[0] if matches else None


def _rank_similar_titles(movie_title, df, similarity_matrix, top_n):
    """Return the titles of the ``top_n`` rows most similar to ``movie_title``."""
    # Rows of the similarity matrix are addressed by row position, so the
    # movie is located by position as well rather than through an 'index'
    # column or the DataFrame's index labels, which may not match positions.
    is_match = (df['title'] == movie_title).to_numpy(dtype=bool, na_value=False)
    positions = np.flatnonzero(is_match)
    if positions.size == 0:
        raise IndexError(f"Movie title not found in the dataset: {movie_title!r}")

    scores = np.asarray(similarity_matrix[positions[0]], dtype=float)
    # A stable sort on the negated scores gives descending order while
    # keeping equally-scored movies in their original row order.
    ranked_positions = np.argsort(-scores, kind='stable')[:max(top_n, 0)]

    titles = df['title'].tolist()
    return [titles[position] for position in ranked_positions]


def get_recommendations(movie_title: str, df: pd.DataFrame, similarity_matrix: np.ndarray, top_n: int = 30):
    """Print the top N movies most similar to ``movie_title``.

    Args:
        movie_title: Exact title to look up in ``df['title']``. If several
            rows share the title, the first one is used.
        df: Movie table with a ``title`` column; its rows must be in the same
            order as the rows of ``similarity_matrix``.
        similarity_matrix: Square matrix whose row ``i`` holds the similarity
            of movie ``i`` to every movie.
        top_n: Maximum number of titles to print. Values of zero or less
            print only the header.

    Returns:
        None. The ranked list is written to standard output, best match
        first. The queried movie is not excluded from the list.

    Raises:
        IndexError: If ``movie_title`` does not occur in ``df['title']``.
    """
    # Rank before printing so a failed lookup produces no partial output.
    recommended_titles = _rank_similar_titles(movie_title, df, similarity_matrix, top_n)

    print("\n🎬 Movies recommended for you:\n")
    for rank, recommended_title in enumerate(recommended_titles, start=1):
        print(f"{rank}. {recommended_title}")


def main(filepath: str = './movies.csv'):
    """Run the interactive movie recommender.

    Loads the movie table, builds the similarity matrix from the selected
    text features, asks the user for a favourite movie, and prints
    recommendations for the closest matching title.

    Args:
        filepath: Path of the movies CSV file to load.
    """
    selected_features = ['genres', 'keywords', 'tagline', 'cast', 'director']

    # Load and preprocess data
    df = load_data(filepath)

    # Expose each row's position as an 'index' column when the file does not
    # already provide one, for downstream lookups.
    if 'index' not in df.columns:
        df = df.reset_index()

    df = preprocess_data(df, selected_features)
    similarity_matrix = compute_similarity_matrix(df['combined_features'])

    # Get user input; stray surrounding whitespace would only hurt matching.
    movie_name = input("🎥 Enter your favourite movie name: ").strip()

    # Find closest match and recommend
    title_list = df['title'].tolist()
    closest_title = find_closest_title(movie_name, title_list)

    if closest_title:
        print(f"\n✅ Found closest match: {closest_title}")
        get_recommendations(closest_title, df, similarity_matrix)
    else:
        print("❌ Sorry, no close match found for that movie.")


# Start the interactive recommender only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()


🎥 Enter your favourite movie name: Iron man

✅ Found closest match: Iron Man

🎬 Movies recommended for you:

1. Iron Man
2. Iron Man 2
3. Iron Man 3
4. Avengers: Age of Ultron
5. The Avengers
6. Captain America: Civil War
7. Captain America: The Winter Soldier
8. Ant-Man
9. X-Men
10. Made
11. X-Men: Apocalypse
12. X2
13. The Incredible Hulk
14. The Helix... Loaded
15. X-Men: First Class
16. X-Men: Days of Future Past
17. Captain America: The First Avenger
18. Kick-Ass 2
19. Guardians of the Galaxy
20. Deadpool
21. Thor: The Dark World
22. G-Force
23. X-Men: The Last Stand
24. Duets
25. Mortdecai
26. The Last Airbender
27. Southland Tales
28. Zathura: A Space Adventure
29. Sky Captain and the World of Tomorrow
30. The Amazing Spider-Man 2
