In [33]:
!pip install scikit-surprise



In [34]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from surprise import Dataset, Reader, SVD
from surprise.model_selection import cross_validate



In [35]:
cleaned_df = pd.read_csv('../data/cleaned_dataset.csv')

In [36]:
# Ensure 'combined_features' column is created correctly
def combine_features(row):
    # Combine genres, cast, and language as an example
    return ' '.join(row['genres']) + ' ' + ' '.join(row['cast']) + ' ' + row['original_language']

cleaned_df['combined_features'] = cleaned_df.apply(combine_features, axis=1)
# Check that 'combined_features' has been created successfully
print(cleaned_df['combined_features'].isnull().sum())  # Should output 0 if no null values
print(cleaned_df['combined_features'].head())  # View some sample rows

# Replace any remaining empty strings with NaN and drop rows with NaN in 'combined_features'
cleaned_df = cleaned_df.replace({'combined_features': ''}, np.nan)
cleaned_df = cleaned_df.dropna(subset=['combined_features'])
count_vectorizer = CountVectorizer(stop_words='english')
count_matrix = count_vectorizer.fit_transform(cleaned_df['combined_features'])
cosine_sim = cosine_similarity(count_matrix)
print(count_matrix.shape)
print(cosine_sim.shape)

# Function to get movie recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = cleaned_df[cleaned_df['title_x'] == title].index[0]  # Adjust column name as needed
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Get top 10 most similar movies
    movie_indices = [i[0] for i in sim_scores]
    return cleaned_df['title_x'].iloc[movie_indices]

# Test the recommendation system
print(get_recommendations('Avatar'))


0
0    [ ' A c t i o n ' ,   ' A d v e n t u r e ' , ...
1    [ ' A d v e n t u r e ' ,   ' F a n t a s y ' ...
2    [ ' A c t i o n ' ,   ' A d v e n t u r e ' , ...
3    [ ' A c t i o n ' ,   ' C r i m e ' ,   ' D r ...
4    [ ' A c t i o n ' ,   ' A d v e n t u r e ' , ...
Name: combined_features, dtype: object
(4803, 32)
(4803, 4803)
1     Pirates of the Caribbean: At World's End
2                                      Spectre
3                        The Dark Knight Rises
4                                  John Carter
5                                 Spider-Man 3
6                                      Tangled
7                      Avengers: Age of Ultron
8       Harry Potter and the Half-Blood Prince
9           Batman v Superman: Dawn of Justice
10                            Superman Returns
Name: title_x, dtype: object


In [32]:
# Load data into Surprise format for collaborative filtering
ratings_df = pd.DataFrame({
    'user_id': np.random.randint(1, 500, size=len(cleaned_df)),  # Dummy user data
    'movie_id': cleaned_df['movie_id'],
    'rating': cleaned_df['vote_average']
})

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(ratings_df[['user_id', 'movie_id', 'rating']], reader)

# Use Singular Value Decomposition (SVD) for collaborative filtering
algo = SVD()

# Cross-validation to evaluate the model
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Train the model
trainset = data.build_full_trainset()
algo.fit(trainset)

# Recommend movies to a user
def recommend_for_user(user_id):
    movie_ids = cleaned_df['movie_id'].values
    ratings = [algo.predict(user_id, movie_id).est for movie_id in movie_ids]
    movie_recommendations = pd.DataFrame({
        'title': cleaned_df['title_x'],
        'rating': ratings
    }).sort_values(by='rating', ascending=False)
    return movie_recommendations.head(10)

# Test the recommendation function
print(recommend_for_user(1))

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.2819  1.2029  1.2316  1.0772  1.2439  1.2075  0.0699  
MAE (testset)     0.8887  0.8418  0.8443  0.7919  0.8406  0.8414  0.0307  
Fit time          0.23    0.35    0.26    0.13    0.17    0.23    0.07    
Test time         0.01    0.01    0.02    0.01    0.01    0.01    0.00    
                                              title    rating
3238                           Little Miss Sunshine  6.544355
4639                                           Cube  6.537046
3228                        Yeh Jawaani Hai Deewani  6.505054
3894                             A Room with a View  6.494881
4678                   The Business of Fancydancing  6.487124
3813                             Gone with the Wind  6.483067
2386                                 One Man's Hero  6.481842
287                                Django Unchained  6.477962
329   The Lor