<a href="https://colab.research.google.com/github/Saramohamed188/DEPI/blob/main/movies_depii.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
pip install scikit-surprise



In [18]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import ast
from surprise import SVD, Dataset, Reader , accuracy
from surprise.model_selection import train_test_split as surprise_train_test_split


In [19]:
movies = pd.read_csv('movies_metadata.csv', engine='python', on_bad_lines='skip')

irrelevant_columns = ['homepage', 'imdb_id', 'adult', 'video', 'poster_path', 'status', 'tagline', 'overview']
movies = movies.drop(irrelevant_columns, axis=1)

print(movies.head())


                               belongs_to_collection    budget  \
0  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000   
1                                                NaN  65000000   
2  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0   
3                                                NaN  16000000   
4  {'id': 96871, 'name': 'Father of the Bride Col...         0   

                                              genres     id original_language  \
0  [{'id': 16, 'name': 'Animation'}, {'id': 35, '...    862                en   
1  [{'id': 12, 'name': 'Adventure'}, {'id': 14, '...   8844                en   
2  [{'id': 10749, 'name': 'Romance'}, {'id': 35, ...  15602                en   
3  [{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...  31357                en   
4                     [{'id': 35, 'name': 'Comedy'}]  11862                en   

                original_title  popularity  \
0                    Toy Story   21.946943   
1                      J

In [20]:
movies = movies.dropna(subset=['title', 'genres', 'release_date', 'vote_average', 'vote_count'])

movies = movies.drop_duplicates()

movies['release_date'] = pd.to_datetime(movies['release_date'], errors='coerce')

In [21]:
movies = movies[movies['vote_count'] > 0]

In [22]:
def clean_genres(genres):
    try:
        genres = ast.literal_eval(genres)
        return [genre['name'] for genre in genres]
    except:
        return []

movies['genres'] = movies['genres'].apply(clean_genres)

In [23]:
movies = movies.join(movies['genres'].str.join('|').str.get_dummies())

movies = movies.drop(['genres', 'production_companies', 'production_countries', 'spoken_languages'], axis=1)

print("Cleaned dataset preview:\n", movies.head())
print("Cleaned dataset info:\n", movies.info())

movies.to_csv('cleaned_movies_metadata.csv', index=False)

Cleaned dataset preview:
                                belongs_to_collection    budget     id  \
0  {'id': 10194, 'name': 'Toy Story Collection', ...  30000000    862   
1                                                NaN  65000000   8844   
2  {'id': 119050, 'name': 'Grumpy Old Men Collect...         0  15602   
3                                                NaN  16000000  31357   
4  {'id': 96871, 'name': 'Father of the Bride Col...         0  11862   

  original_language               original_title  popularity release_date  \
0                en                    Toy Story   21.946943   1995-10-30   
1                en                      Jumanji   17.015539   1995-12-15   
2                en             Grumpier Old Men   11.712900   1995-12-22   
3                en            Waiting to Exhale    3.859495   1995-12-22   
4                en  Father of the Bride Part II    8.387519   1995-02-10   

     revenue  runtime                        title  ...  History  Horror

In [24]:
movies['id'] = pd.to_numeric(movies['id'], errors='coerce')
movies = movies.dropna(subset=['id'])
movies['id'] = movies['id'].astype(int)

In [25]:
ratings = pd.read_csv('ratings.csv')
movie_data = pd.merge(ratings, movies, left_on='movieId', right_on='id')
train_data, test_data = train_test_split(movie_data, test_size=0.2)

In [26]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movie_data[['userId', 'movieId', 'rating']], reader)
trainset, testset = surprise_train_test_split(data, test_size=0.2)

In [27]:
svd = SVD()


In [28]:
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7a73bcc48280>

In [29]:
predictions = svd.test(testset)
accuracy.rmse(predictions)

RMSE: 0.8849


0.8848918356477918

In [30]:
from sklearn.metrics.pairwise import cosine_similarity
metadata_features = movies.drop(['id', 'title', 'vote_average', 'vote_count', 'release_date', 'belongs_to_collection'], axis=1, errors='ignore')

metadata_features = metadata_features.apply(pd.to_numeric, errors='coerce')

metadata_features = metadata_features.fillna(0)

cosine_sim = cosine_similarity(metadata_features, metadata_features)



In [31]:
print(metadata_features.dtypes)

budget                 int64
original_language    float64
original_title       float64
popularity           float64
revenue                int64
runtime              float64
Action                 int64
Adventure              int64
Animation              int64
Comedy                 int64
Crime                  int64
Documentary            int64
Drama                  int64
Family                 int64
Fantasy                int64
Foreign                int64
History                int64
Horror                 int64
Music                  int64
Mystery                int64
Romance                int64
Science Fiction        int64
TV Movie               int64
Thriller               int64
War                    int64
Western                int64
dtype: object


In [32]:
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = movies[movies['title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return movies['title'].iloc[movie_indices]


In [33]:
recommendations = get_recommendations('The Godfather')
print("\nRecommendations for 'The Godfather':\n", recommendations)


Recommendations for 'The Godfather':
 84             Angels and Insects
5115                     Spetters
7414                The Soft Skin
1266           Female Perversions
6551    The Other Side of the Bed
640                 Moll Flanders
403                     8 Seconds
2368               My Name Is Joe
7337          Pride and Prejudice
3329              The Good Mother
Name: title, dtype: object


In [34]:
with open('recommendation_evaluation.txt', 'w') as f:
    f.write(f"RMSE of collaborative filtering: {accuracy.rmse(predictions)}")

RMSE: 0.8849
