In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie table. The `soup` column is the text that gets vectorized
# below; the other required columns are used when reporting recommendations.
df = pd.read_csv("master_dataset_final.csv")

# Fail fast if the CSV lacks a needed column, and say which one(s), so the
# problem is obvious without inspecting the file by hand.
required_columns = {'title', 'main_director', 'release_date', 'soup'}
missing_columns = required_columns.difference(df.columns)
if missing_columns:
    raise ValueError(
        "Missing required columns in the CSV. Needed: title, main_director, release_date, soup"
        f" (missing: {', '.join(sorted(missing_columns))})"
    )

# Bag-of-words counts over unigrams and bigrams; terms that occur in fewer
# than two documents are dropped (min_df=2).
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)
# CountVectorizer rejects NaN documents, so a movie with a missing soup is
# vectorized as empty text instead of aborting the whole run.
count_matrix = vectorizer.fit_transform(df['soup'].fillna('').astype(str))


print("Count Matrix Shape:", count_matrix.shape)


# Pairwise cosine similarity between all rows of the count matrix.
cosine_sim = cosine_similarity(count_matrix)


# Label both axes with the movie titles (same row order as `df`) and persist
# the full matrix so it can be inspected or reused outside the notebook.
cosine_df = pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])
cosine_df.to_csv("cosine_similarity_matrix.csv")


# Path separators and characters that common file systems reject in file
# names; each one is replaced by "_" when building the output file name.
_UNSAFE_FILENAME_CHARS = str.maketrans({ch: '_' for ch in ' \\/:*?"<>|'})


def recommend_movies(movie_title, num_recommendations=5, *, similarity=None,
                     movies=None, save_csv=True):
    """Return, print and optionally save the movies most similar to a title.

    Args:
        movie_title: Title to look up among the columns of the similarity
            matrix. If the title occurs more than once, its first occurrence
            is used as the query.
        num_recommendations: Maximum number of movies to return. Zero or a
            negative value yields no recommendations.
        similarity: Square DataFrame of pairwise similarity scores whose
            index and columns are movie titles. Defaults to the notebook-level
            ``cosine_df``.
        movies: DataFrame with ``title``, ``main_director`` and
            ``release_date`` columns, one row per movie in the same order as
            ``similarity``. Defaults to the notebook-level ``df``.
        save_csv: When true (the default), write the recommendations to
            ``recommendations_for_<title>.csv`` in the working directory.

    Returns:
        A list of dicts with the keys ``'Movie Title'``, ``'Director'`` and
        ``'Release Date'``, ordered from most to least similar. The list is
        empty if the title is not found.

    Raises:
        ValueError: If ``movies`` and ``similarity`` do not have the same
            number of rows, so rows cannot be matched by position.
    """
    sim = cosine_df if similarity is None else similarity
    catalog = df if movies is None else movies

    if movie_title not in sim.columns:
        print(f"Movie '{movie_title}' not found in the dataset.")
        return []

    if len(catalog) != len(sim):
        raise ValueError(
            "The similarity matrix and the movie table must contain the same "
            "movies in the same order."
        )

    # Work with row positions instead of title labels: titles can repeat
    # (remakes, re-releases), and label-based lookups would then return
    # several columns or the wrong movie's metadata.
    query_pos = int((sim.columns == movie_title).argmax())
    scores = sim.iloc[:, query_pos].reset_index(drop=True)

    # Drop the query movie explicitly. Assuming it sorts first is wrong when
    # another movie ties with it or when its own similarity is zero.
    scores = scores.drop(index=query_pos)

    # A stable sort keeps tied movies in dataset order, so results are
    # reproducible; clamping avoids negative counts slicing from the end.
    limit = max(num_recommendations, 0)
    ranked = scores.sort_values(ascending=False, kind='mergesort')
    top_positions = list(ranked.index[:limit])

    picked = catalog.iloc[top_positions]
    recommendations = [
        {'Movie Title': title, 'Director': director, 'Release Date': released}
        for title, director, released in zip(
            picked['title'], picked['main_director'], picked['release_date']
        )
    ]

    if save_csv:
        # Explicit columns keep the CSV header even when nothing was found.
        rec_df = pd.DataFrame(
            recommendations,
            columns=['Movie Title', 'Director', 'Release Date'],
        )
        safe_filename = (
            f"recommendations_for_{movie_title.translate(_UNSAFE_FILENAME_CHARS)}.csv"
        )
        rec_df.to_csv(safe_filename, index=False)

    # `!s` converts each value to text first so missing values (NaN/None)
    # cannot break the column alignment format.
    print("\n{:<60} | {:<20} | {}".format("Movie Title", "Director", "Release Date"))
    print("-" * 100)
    for row in recommendations:
        print("{!s:<60} | {!s:<20} | {}".format(
            row['Movie Title'], row['Director'], row['Release Date']))

    return recommendations


# Demo: print, save and return the ten titles most similar to "Avatar".
recommend_movies("Avatar", num_recommendations=10)


✅ Count Matrix Shape: (2500, 6908)
✅ Cosine similarity matrix saved to 'cosine_similarity_matrix.csv'

Movie Title                                                  | Director             | Release Date
----------------------------------------------------------------------------------------------------
Aliens                                                       | James Cameron        | 1986-07-18
Titanic                                                      | James Cameron        | 1997-11-18
True Lies                                                    | James Cameron        | 1994-07-14
The Terminator                                               | James Cameron        | 1984-10-26
Terminator 2: Judgment Day                                   | James Cameron        | 1991-07-01
Star Trek Into Darkness                                      | J.J. Abrams          | 2013-05-05
Battle Royale                                                | Kinji Fukasaku       | 2000-12-16
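Home                                                         | Tim Johnson          | 2015-03-18
Meet Dave                                                    | Brian Robbins        | 2008-07-08
PK                                                           | Rajkumar Hirani      | 2014-12-18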

[{'Movie Title': 'Aliens',
  'Director': 'James Cameron',
  'Release Date': '1986-07-18'},
 {'Movie Title': 'Titanic',
  'Director': 'James Cameron',
  'Release Date': '1997-11-18'},
 {'Movie Title': 'True Lies',
  'Director': 'James Cameron',
  'Release Date': '1994-07-14'},
 {'Movie Title': 'The Terminator',
  'Director': 'James Cameron',
  'Release Date': '1984-10-26'},
 {'Movie Title': 'Terminator 2: Judgment Day',
  'Director': 'James Cameron',
  'Release Date': '1991-07-01'},
 {'Movie Title': 'Star Trek Into Darkness',
  'Director': 'J.J. Abrams',
  'Release Date': '2013-05-05'},
 {'Movie Title': 'Battle Royale',
  'Director': 'Kinji Fukasaku',
  'Release Date': '2000-12-16'},
 {'Movie Title': 'Home',
  'Director': 'Tim Johnson',
  'Release Date': '2015-03-18'},
 {'Movie Title': 'Meet Dave',
  'Director': 'Brian Robbins',
  'Release Date': '2008-07-08'},
 {'Movie Title': 'PK',
  'Director': 'Rajkumar Hirani',
  'Release Date': '2014-12-18'}]

In [7]:
# Re-print the shape of the CountVectorizer output built from df['soup'].
print("Count Matrix Shape:", count_matrix.shape)


Count Matrix Shape: (2500, 6908)
