# RS ASSIGNMENT 01


## NAME: KUMAIL HAIDER
## ROLL: 20K-0455

### Question 02 (Part A)

In [23]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# NOTE:
# For given active user which I take (i.e: userID=2) the output of the top 5 recommended movies mostly remains the same 
# for any input movie name, it is because we are always recommending movies to the same active user 
# based on their similarity to other users who have rated the movies. 
# The recommendations would be different for different active users because the similarity between users would be different.

try:
    movies_file = pd.read_csv('C:\\Users\mt\\movies.csv')
    ratings_file = pd.read_csv('C:\\Users\mt\\ratings.csv')

    merged_file = pd.merge(ratings_file, movies_file, on='movieId')
    user_item_matrix = merged_file.pivot_table(index='userId', columns='title', values='rating')

#   COMPUTE COSINE SIMILARITY 
    active_user_similarity = cosine_similarity(user_item_matrix.fillna(0))

#   Function to predict ratings
    def predict_rating(user_item_matrix, active_user_similarity, active_user_id, movie_title):
        active_user_idx = user_item_matrix.index.get_loc(active_user_id)
        movie_idx = user_item_matrix.columns.get_loc(movie_title)
    
        active_user_sim_scores = active_user_similarity[active_user_idx]
        active_user_ratings = user_item_matrix.iloc[active_user_idx]
    
    # As data is big so we take k=10 nearest users to the active user
        similar_user_idxs = np.argsort(active_user_sim_scores)[::-1][1:11]
    
    # COMPUTE MEAN CENTERED RATING
        active_user_mean_rating = active_user_ratings.mean()
        movie_mean_rating = user_item_matrix[movie_title].mean()
        mc_rating = user_item_matrix.iloc[active_user_idx, movie_idx] - active_user_mean_rating - movie_mean_rating
    
        sim_scores = active_user_sim_scores[similar_user_idxs]
        sim_ratings = user_item_matrix.iloc[similar_user_idxs, movie_idx] - user_item_matrix.iloc[similar_user_idxs].mean(axis=1)
        weighted_sum = np.sum(sim_scores * sim_ratings)
    
    #PREDICTION FUNCTION (MEAN CENTERED)
        pred_rating = active_user_mean_rating + movie_mean_rating + (weighted_sum / np.sum(np.abs(sim_scores)))
        return pred_rating

    print("*** WELCOME TO USER BASED RECOMMENDER SYSTEM ***")
    movie_name = input("\nInput a movie name: ")
    movie_id = movies_file[movies_file['title'] == movie_name]['movieId'].iloc[0]

    #SUPPOSE WE TAKE ACTIVE USER=2
    active_user_id = 2

    pred_ratings = []
    for movie_title in user_item_matrix.columns:
        if movie_title == movie_name:
            continue
        pred_rating = predict_rating(user_item_matrix, active_user_similarity, active_user_id, movie_title)
        pred_ratings.append((movie_title, pred_rating))

    top_movies = sorted(pred_ratings, key=lambda x: x[1], reverse=True)[:5]

    print("Top 5 Recommended Movies by User-Based RS: ")
    for movie in top_movies:
        print(movie[0])
    
except Exception as e:
    print(e)

*** WELCOME TO USER BASED RECOMMENDER SYSTEM ***

Input a movie name: 20 Million Miles to Earth (1957)
Top 5 Recommended Movies by User-Based RS: 
'Salem's Lot (2004)
12 Angry Men (1997)
12 Chairs (1976)
61* (2001)
7 Faces of Dr. Lao (1964)


### Question 02 (Part B)

In [25]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

try:
    movies_file = pd.read_csv('C:\\Users\mt\\movies.csv')
    ratings_file = pd.read_csv('C:\\Users\mt\\ratings.csv')

    merged_file = pd.merge(ratings_file, movies_file, on='movieId')

    movie_stats = merged_file.groupby('title').agg({'rating': [np.mean, np.size]})

    popular_movies = movie_stats['rating']['size'] >= 50
    movie_stats = movie_stats[popular_movies].sort_values([('rating', 'mean')], ascending=False)

    movies_file['genre_list'] = movies_file['genres'].str.split('|')

    movies_file['genre_str'] = movies_file['genre_list'].apply(lambda x: ' '.join(x))

# Genre strings vectorization using the library functionTfidfVectorizer
    tf_idf = TfidfVectorizer(stop_words='english')
    tf_idf_matrix = tf_idf.fit_transform(movies_file['genre_str'])

# FINDING COSINE SIMILARITY BETWEEN MOVIES
    cosine_sim = cosine_similarity(tf_idf_matrix)

    def recommend_movies(movie_name):
        movie_id = movies_file[movies_file['title'] == movie_name]['movieId'].iloc[0]
        movie_idx = movies_file.index[movies_file['movieId'] == movie_id][0]
    
    # Finding cosine similarity score
        sim_scores = list(enumerate(cosine_sim[movie_idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Select top 3 most similar movies wrt given movie
        top_movies = [(movies_file.iloc[idx]['title'], sim_scores[idx][1]) for idx in [i[0] for i in sim_scores[1:4]]]
        return top_movies

    print("*** WELCOME TO CONTENT BASED RECOMMENDER SYSTEM ***")
    movie_name = input("Input a movie name: ")
    recommended_movies = recommend_movies(movie_name)

    print("\nTop 3 Recommended Movies by Content-Based RS: ")
    for movie in recommended_movies:
        print(movie[0])
    
except Exception as e:
    print(e)

*** WELCOME TO CONTENT BASED RECOMMENDER SYSTEM ***
Input a movie name: 20 Million Miles to Earth (1957)

Top 3 Recommended Movies by Content-Based RS: 
Meteor (1979)
It Came from Outer Space (1953)
Earth vs. the Flying Saucers (1956)
