<a href="https://colab.research.google.com/github/ShaliniR8/movie-recommendation-1/blob/feat%2Fexp/mov_rec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System Analysis and Training
This notebook performs data analysis, feature extraction, correlation analysis, and training for a movie recommendation system.

## Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import TruncatedSVD
from joblib import dump
import requests

In [6]:
# TMDb API token
API_TOKEN = "eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiI0OWUwOWFlY2QxNmY3Y2I5NjE3YzE0NGU2ZjVhOGIxNyIsIm5iZiI6MTczMzc4ODM5OS4zOSwic3ViIjoiNjc1NzgyZWY4MTM4YmU1NWQ5YTEyYzViIiwic2NvcGVzIjpbImFwaV9yZWFkIl0sInZlcnNpb24iOjF9.feZ8NoJAdpVBBW3n2N5FAs20bVEcAmoJnyx6uns7lHc"  # Replace with your own bearer token

HEADERS = {
    "Authorization": f"Bearer {API_TOKEN}"
}

url = f"https://api.themoviedb.org/3/movie/2"
response = requests.get(url, headers=HEADERS)

In [7]:
response.json()

{'adult': False,
 'backdrop_path': '/hQ4pYsIbP22TMXOUdSfC2mjWrO0.jpg',
 'belongs_to_collection': {'id': 1382526,
  'name': "Kaurismäki's Proletariat Trilogy",
  'poster_path': None,
  'backdrop_path': None},
 'budget': 0,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 18, 'name': 'Drama'},
  {'id': 10749, 'name': 'Romance'},
  {'id': 80, 'name': 'Crime'}],
 'homepage': '',
 'id': 2,
 'imdb_id': 'tt0094675',
 'origin_country': ['FI'],
 'original_language': 'fi',
 'original_title': 'Ariel',
 'overview': 'After the coal mine he works at closes and his father commits suicide, a Finnish man leaves for the city to make a living but there, he is framed and imprisoned for various crimes.',
 'popularity': 26.244,
 'poster_path': '/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg',
 'production_companies': [{'id': 2303,
   'logo_path': None,
   'name': 'Villealfa Filmproductions',
   'origin_country': 'FI'}],
 'production_countries': [{'iso_3166_1': 'FI', 'name': 'Finland'}],
 'release_date': '1988-10-21',
 

In [None]:


def fetch_movie_details(movie_id):
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    response = requests.get(url, headers=HEADERS)
    if response.status_code == 200:
        data = response.json()
        budget = data.get("budget", 0)
        genres = ", ".join(genre['name'] for genre in data.get("genres", [])[:2])  # Top 2 genres
        origin_country = ", ".join(country['name'] for country in data.get("production_countries", []))
        production_companies = ", ".join(company['name'] for company in data.get("production_companies", []))
        overview = data.get("overview", "No overview available")
        img = data.get("poster_path", "")
        return budget, genres, origin_country, production_companies, overview, img
    else:
        return 0, "Unknown", "Unknown", "Unknown", "Unknown"

movie_data = pd.read_csv("Movie_Id_Titles.csv", header=None, names=["movie_id", "movie_name"])

# New columns
movie_data["budget"] = 0
movie_data["genres"] = ""
movie_data["origin_country"] = ""
movie_data["production_companies"] = ""
movie_data["overview"] = ""
movie_data["poster_path"] = ""

# Fetch details and populate the columns
for index, row in movie_data.iterrows():
    budget, genres, origin_country, production_companies, overview, poster_path = fetch_movie_details(row["movie_id"])
    movie_data.at[index, "budget"] = budget
    movie_data.at[index, "genres"] = genres
    movie_data.at[index, "origin_country"] = origin_country
    movie_data.at[index, "production_companies"] = production_companies
    movie_data.at[index, "overview"] = overview
    movie_data.at[index, "poster_path"] = poster_path
    print(movie_data.loc[index])

output_path = "Movie_Id_Titles_Enhanced.csv"
movie_data.to_csv(output_path, index=False, header=False)

print(f"Updated file saved to {output_path}")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
movie_name                                                War, The (1994)
budget                                                            7500000
genres                                                     Romance, Drama
origin_country                                   United States of America
production_companies    Orion Pictures, Breathless Associates, Miko Pr...
overview                Jesse, a small-time criminal, high-tails it to...
Name: 1058, dtype: object
movie_id                                                             1059
movie_name              Don't Be a Menace to South Central While Drink...
budget                                                                  0
genres                                                      Drama, Action
origin_country                                                      Japan
production_companies                                                 TOHO
overview             

ValueError: too many values to unpack (expected 2)

## Data Loading

In [None]:
file_tsv_path = "file.tsv"
movie_details_path = "Movie_Id_Titles_Enhanced.csv"

ratings_df = pd.read_csv(file_tsv_path, sep="\t", header=None, names=["user_id", "item_id", "rating", "timestamp"])
movies_df = pd.read_csv(movie_details_path, header=None, names=["movie_id", "movie_name", "budget", "genres", "origin_country", "production_companies", "overview"])

ratings_df.head(), movies_df.head()


(   user_id  item_id  rating  timestamp
 0        0       50       5  881250949
 1        0      172       5  881250949
 2        0      133       1  881250949
 3      196      242       3  881250949
 4      186      302       3  891717742,
   movie_id         movie_name  budget         genres origin_country  \
 0  item_id              title       0        Unknown        Unknown   
 1        1   Toy Story (1995)       0        Unknown        Unknown   
 2        2   GoldenEye (1995)       0  Comedy, Drama        Finland   
 3        3  Four Rooms (1995)       0  Comedy, Drama        Finland   
 4        4  Get Shorty (1995)       0        Unknown        Unknown   
 
         production_companies  \
 0                    Unknown   
 1                    Unknown   
 2  Villealfa Filmproductions   
 3  Villealfa Filmproductions   
 4                    Unknown   
 
                                             overview  
 0                                            Unknown  
 1           

## Clean and preprocess data

In [None]:
movies_df = movies_df[movies_df.movie_id != "item_id"]
ratings_df['item_id'] = ratings_df['item_id'].astype(str)
movies_df['movie_id'] = movies_df['movie_id'].astype(str)
ratings_with_movies = pd.merge(ratings_df, movies_df, left_on='item_id', right_on='movie_id', how='inner')

ratings_summary = ratings_with_movies.groupby('movie_name')['rating'].agg(['count', 'mean']).reset_index()
ratings_summary = ratings_summary.rename(columns={"count": "num_ratings", "mean": "avg_rating"}).sort_values(by="num_ratings", ascending=False)

ratings_with_movies.head(), ratings_summary.head()


(   user_id item_id  rating  timestamp movie_id  \
 0        0      50       5  881250949       50   
 1        0     172       5  881250949      172   
 2        0     133       1  881250949      133   
 3      196     242       3  881250949      242   
 4      186     302       3  891717742      302   
 
                         movie_name    budget                   genres  \
 0                 Star Wars (1977)         0                  Unknown   
 1  Empire Strikes Back, The (1980)  30000000  Science Fiction, Action   
 2        Gone with the Wind (1939)         0              Documentary   
 3                     Kolya (1996)  54000000             Crime, Drama   
 4         L.A. Confidential (1997)   7800000          Crime, Thriller   
 
              origin_country  \
 0                   Unknown   
 1  United States of America   
 2  United States of America   
 3  United States of America   
 4    France, United Kingdom   
 
                                 production_companie

## Create pivot table for collaborative filtering

In [None]:
movie_user_matrix = ratings_with_movies.pivot_table(index='movie_name', columns='user_id', values='rating', fill_value=0)
movie_user_sparse = csr_matrix(movie_user_matrix.values)
movie_similarity = cosine_similarity(movie_user_sparse)
movie_similarity_df = pd.DataFrame(movie_similarity, index=movie_user_matrix.index, columns=movie_user_matrix.index)
user_movie_matrix = ratings_with_movies.pivot_table(index='user_id', columns='movie_name', values='rating', fill_value=0)
user_movie_sparse = csr_matrix(user_movie_matrix.values)
user_similarity = cosine_similarity(user_movie_sparse)
user_similarity_df = pd.DataFrame(user_similarity, index=user_movie_matrix.index, columns=user_movie_matrix.index)

## Recommendation functions

In [None]:


def predict_rating(movie_name):
    """Predict the average rating of a movie."""
    if movie_name in ratings_summary['movie_name'].values:
        return ratings_summary.loc[ratings_summary['movie_name'] == movie_name, 'avg_rating'].values[0]
    return "Movie not found"

def recommend_similar_movies(movie_name, top_n=5):
    """Recommend similar movies to a given movie."""
    if movie_name in movie_similarity_df.index:
        similar_movies = movie_similarity_df[movie_name].sort_values(ascending=False).iloc[1:top_n + 1]
        return similar_movies.index.tolist()
    return "Movie not found"

def top_users_for_movie(movie_name, top_n=10):
    """Find top users who rated a movie highly."""
    movie_ratings = ratings_with_movies[ratings_with_movies['movie_name'] == movie_name]
    top_users = movie_ratings.sort_values(by='rating', ascending=False).head(top_n)
    return top_users['user_id'].tolist()

def recommend_movies_for_user(user_id, top_n=5):
    """Recommend movies for a user based on similar users."""
    if user_id in user_similarity_df.index:
        similar_users = user_similarity_df[user_id].sort_values(ascending=False).iloc[1:top_n + 1]
        similar_user_ids = similar_users.index
        recommendations = ratings_with_movies[ratings_with_movies['user_id'].isin(similar_user_ids)]
        top_recommendations = recommendations.groupby('movie_name')['rating'].mean().sort_values(ascending=False).head(top_n)
        return top_recommendations.index.tolist()
    return "User not found"

predicted_rating_star_wars = predict_rating("Star Wars (1977)")
similar_movies_star_wars = recommend_similar_movies("Star Wars (1977)")
top_users_star_wars = top_users_for_movie("Star Wars (1977)")
user_224_recommendations = recommend_movies_for_user(224)

predicted_rating_star_wars, similar_movies_star_wars, top_users_star_wars, user_224_recommendations


(4.359589041095891,
 ['Return of the Jedi (1983)',
  'Raiders of the Lost Ark (1981)',
  'Empire Strikes Back, The (1980)',
  'Toy Story (1995)',
  'Godfather, The (1972)'],
 [0, 268, 748, 864, 337, 419, 392, 85, 791, 862],
 ['Killing Fields, The (1984)',
  'Dead Man Walking (1995)',
  'Waiting for Guffman (1996)',
  'Cool Hand Luke (1967)',
  'To Catch a Thief (1955)'])

In [None]:
# Save the dataframes as CSVs
movie_similarity_csv_path = "movie_similarity.csv"
user_similarity_csv_path = "user_similarity.csv"
ratings_summary_csv_path = "ratings_summary.csv"

movie_similarity_df.to_csv(movie_similarity_csv_path)
user_similarity_df.to_csv(user_similarity_csv_path)
ratings_summary.to_csv(ratings_summary_csv_path)

# Paths for download
[movie_similarity_csv_path, user_similarity_csv_path, ratings_summary_csv_path]


['movie_similarity.csv', 'user_similarity.csv', 'ratings_summary.csv']