In [151]:
import pandas as pd
import numpy as np

In [152]:
movies=pd.read_csv('dataset.csv')

In [153]:
# Simulate user_id and ratings
import random

# Assume 100 unique users
num_users = 100
movies['user_id'] = [random.randint(1, num_users) for _ in range(len(movies))]
movies['rating'] = [random.randint(1, 5) for _ in range(len(movies))]

# Create the user-item matrix
user_item_matrix = movies.pivot_table(index='user_id', columns='title', values='rating')
user_item_matrix.fillna(0, inplace=True)


In [154]:
# Select relevant numeric features for similarity calculation
features = movies[['vote_average', 'popularity', 'vote_count']].fillna(0)

# Calculate cosine similarity
from sklearn.metrics.pairwise import cosine_similarity
item_similarity = cosine_similarity(features)


In [155]:
def recommend_item_based(movie_title, num_recommendations=5):
    # Find the index of the movie
    if movie_title not in movies['title'].values:
        return "Movie not found in the dataset."

    movie_index = movies[movies['title'] == movie_title].index[0]

    # Get similarity scores for the movie
    similar_movies = list(enumerate(item_similarity[movie_index]))
    similar_movies = sorted(similar_movies, key=lambda x: x[1], reverse=True)

    # Retrieve top recommendations
    recommendations = []
    for i in similar_movies[1:num_recommendations + 1]:  # Skip the first (itself)
        recommendations.append(movies.iloc[i[0]].title)

    return recommendations


In [156]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,user_id,rating
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,96,3
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,78,3
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,21,2
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,31,5
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,92,4


In [157]:
movies.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count', 'user_id', 'rating'],
      dtype='object')

In [158]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
 9   user_id            10000 non-null  int64  
 10  rating             10000 non-null  int64  
dtypes: float64(2), int64(4), object(5)
memory usage: 859.5+ KB


In [159]:
movies['tags']=movies['genre']+movies['overview']

In [160]:
movies.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count,user_id,rating,tags
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862,96,3,"Drama,CrimeFramed in the 1940s for the double ..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731,78,3,"Comedy,Drama,RomanceRaj is a rich, carefree, h..."
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280,21,2,"Drama,CrimeSpanning the years 1945 to 1955, a ..."
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959,31,5,"Drama,History,WarThe true story of how busines..."
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811,92,4,"Drama,CrimeIn the continuing saga of the Corle..."


In [161]:
new_df = movies[['id', 'title', 'genre', 'overview', 'tags']]
new_df = new_df.drop(columns=['overview'])  # Only drop 'overview', keep 'genre'


In [162]:
movies['tags'] = movies['genre'].fillna('') + ' ' + movies['overview'].fillna('')
new_df = movies[['id', 'title', 'tags']]


In [163]:
new_df

Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,"Drama,Crime Framed in the 1940s for the double..."
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance Raj is a rich, carefree, ..."
2,238,The Godfather,"Drama,Crime Spanning the years 1945 to 1955, a..."
3,424,Schindler's List,"Drama,History,War The true story of how busine..."
4,240,The Godfather: Part II,"Drama,Crime In the continuing saga of the Corl..."
...,...,...,...
9995,10196,The Last Airbender,"Action,Adventure,Fantasy The story follows the..."
9996,331446,Sharknado 3: Oh Hell No!,"Action,TV Movie,Science Fiction,Comedy,Adventu..."
9997,13995,Captain America,"Action,Science Fiction,War During World War II..."
9998,2312,In the Name of the King: A Dungeon Siege Tale,"Adventure,Fantasy,Action,Drama A man named Far..."


In [164]:
# Ensure 'genre' is included in new_df
new_df = movies[['id', 'title', 'genre', 'tags']]

def recommend(movies, genre=None, release_year=None, min_rating=None):
    try:
        index = new_df[new_df['title'] == movies].index[0]
    except IndexError:
        return "Movie not found in the dataset."

    distance = sorted(list(enumerate(sim[index])), reverse=True, key=lambda vec: vec[1])

    recommendations = []
    for i in distance[1:]:
        movie = new_df.iloc[i[0]]
        if genre and genre.lower() not in movie['genre'].lower():
            continue
        recommendations.append(movie['title'])
        if len(recommendations) == 5:
            break

    return recommendations

In [165]:
from sklearn.feature_extraction.text import CountVectorizer

In [166]:
cv=CountVectorizer(max_features=10000,stop_words='english')

In [167]:
cv

In [168]:
vec=cv.fit_transform(new_df['tags'].values.astype('U')).toarray()

In [169]:
vec

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [170]:
vec.shape

(10000, 10000)

In [171]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
sim=cosine_similarity(vec)

In [None]:
sim

In [None]:
new_df[new_df['title']=='The Shawshank Redemption']

In [None]:
dist=sorted(list(enumerate(sim[0])),reverse=True, key=lambda vec:vec[1])

In [None]:
dist

In [None]:
for i in dist[0:5]:
  print(new_df.iloc[i[0]].title)

In [None]:
new_df.columns

In [None]:
def cli_interface():
    print("Welcome to the Movie Recommendation System!")
    movie_name = input("Enter a movie you like: ")

    # Optional genre filter
    genre = input("Enter a genre to filter by (or leave blank): ")

    # Call the recommend function
    recommendations = recommend(movie_name, genre)

    # Display results
    if isinstance(recommendations, str):  # If it's an error message
        print(recommendations)
    else:
        if recommendations:
            print("\nHere are some movies you might enjoy:")
            for idx, movie in enumerate(recommendations, 1):
                print(f"{idx}. {movie}")
        else:
            print("Sorry, no movies match your criteria.")

# Run the CLI
cli_interface()
