In [1]:
import sys
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# Load dataset from a CSV file
def load_data(csv_path: str) -> pd.DataFrame:
    """Read the item catalogue and keep only rows that have keywords.

    Args:
        csv_path: Location of the CSV file; it must contain a ``keywords``
            column.

    Returns:
        The parsed frame without the rows whose ``keywords`` value is
        missing. Surviving rows keep their original index labels.
    """
    # Chain the filter onto the read so no half-cleaned frame is left around.
    return pd.read_csv(csv_path).dropna(subset=['keywords'])

# Build a TF-IDF matrix for a list of descriptions
# Returns fitted TfidfVectorizer and resulting matrix
def build_tfidf_matrix(descriptions):
    """Fit a TF-IDF model on the given texts.

    Args:
        descriptions: Iterable of text documents, one per item.

    Returns:
        A ``(vectorizer, matrix)`` pair: the fitted ``TfidfVectorizer``
        (English stop words removed) and the matrix produced by fitting and
        transforming ``descriptions`` with it.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    # fit_transform learns the vocabulary and vectorises in a single pass.
    return tfidf, tfidf.fit_transform(descriptions)

# Given a user description, compute similarity with each item's keywords and return the top N similar items.
def get_N_similar_items(query: str,
                        vectorizer: TfidfVectorizer,
                        tfidf_matrix,
                        df: pd.DataFrame,
                        N: int = 5):
    """Rank catalogue items by cosine similarity to a free-text query.

    Args:
        query: The user's description of what they are looking for.
        vectorizer: The fitted vectorizer that produced ``tfidf_matrix``.
        tfidf_matrix: TF-IDF matrix of the items; row ``i`` is looked up as
            ``df.iloc[i]``, so it must be aligned with ``df`` by position.
        df: Item table providing the ``title`` column.
        N: Maximum number of items to return.

    Returns:
        A list of ``{'title', 'similarity'}`` dicts ordered from the highest
        score to the lowest, with each score rounded to three decimals.
    """
    # Project the query into the fitted TF-IDF space and score every item.
    scores = cosine_similarity(vectorizer.transform([query]), tfidf_matrix).flatten()

    # argsort is ascending; reverse it and keep the leading N positions.
    best_positions = scores.argsort()[::-1][:N]

    return [
        {
            'title': df.iloc[pos]['title'],
            'similarity': round(float(scores[pos]), 3)
        }
        for pos in best_positions
    ]

In [22]:
# Free-text description of the user's preferences; main() reads this
# module-level name as the query to find recommendations for.
user_query = 'I like action movies set in space'

In [23]:
def main(query=None, data_path="data/movies.csv", top_n=5):
    """Print the best-matching movie titles for a free-text query.

    Args:
        query: Preference description to search for. When omitted it is
            resolved from the environment: inside a Jupyter kernel the
            notebook-level ``user_query`` is used, otherwise the first
            command-line argument.
        data_path: CSV file handed to ``load_data``.
        top_n: Number of recommendations to print.

    Raises:
        SystemExit: With status 1, after printing a usage message, when run
            as a script without a query argument.
    """
    if query is None:
        if 'ipykernel' in sys.modules:
            # Under a Jupyter kernel sys.argv holds the kernel's own launch
            # flags rather than user input, so it cannot supply the query;
            # use the notebook variable instead.
            query = user_query
        elif len(sys.argv) >= 2:
            # Script usage: honour the argument the usage message asks for.
            query = sys.argv[1]
        else:
            print("Usage: python recommend.py 'Your description of preferences'")
            sys.exit(1)

    # Load data
    df = load_data(data_path)

    # Build TF-IDF matrix
    vectorizer, tfidf_matrix = build_tfidf_matrix(df['keywords'])

    # Get top similar items
    recommendations = get_N_similar_items(query, vectorizer, tfidf_matrix, df, N=top_n)

    # Print recommendations
    print(f"\nUser Query: {query}")
    print("Top Recommendations:")
    for i, rec in enumerate(recommendations, start=1):
        print(f"{i}. {rec['title']} (Similarity: {rec['similarity']})")

# Entry point: run main() when this code is executed directly rather than
# imported. A notebook cell also runs with __name__ == "__main__", so
# executing this cell triggers the recommendation run.
if __name__ == "__main__":
    main()



User Query: I like action movies set in space
Top Recommendations:
1. King's Ransom (Similarity: 0.413)
2. Gravity (Similarity: 0.326)
3. Space Dogs (Similarity: 0.322)
4. Sinister (Similarity: 0.309)
5. Space Chimps (Similarity: 0.291)
