In [163]:
# Mount Google Drive inside the Colab runtime so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [164]:
import math
import os
import random
from collections import Counter

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt


def loading_data(data_path):
    """Load the four dataset CSV files found in ``data_path``.

    Parameters
    ----------
    data_path : str
        Directory expected to contain ``ratings.csv``, ``movies.csv``,
        ``links.csv`` and ``movies_metadata.csv``.

    Returns
    -------
    tuple
        ``(ratings, movies, links, metadata, True)`` with one DataFrame per
        file when every file is present, otherwise ``([], [], [], [], False)``.
    """
    file_names = ("ratings.csv", "movies.csv", "links.csv", "movies_metadata.csv")
    paths = [os.path.join(data_path, name) for name in file_names]

    # Every file must exist: checking only ratings.csv would let a partially
    # downloaded dataset fail with FileNotFoundError instead of status=False.
    if not all(os.path.isfile(path) for path in paths):
        return [], [], [], [], False

    ratings, movies, links, metadata = (pd.read_csv(path) for path in paths)
    return ratings, movies, links, metadata, True


def analysis(ratings, movie_data):
    """Print a short summary of the ratings and movies tables.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain ``userId`` and ``movieId`` columns.
    movie_data : pandas.DataFrame
        Must contain ``movieId`` and ``title`` columns.

    Prints the number of distinct users and movies, the titles of the five
    most-rated movies, and the five users with the most ratings.
    Returns ``None``.
    """
    print("Information about the data : \n")
    print("*********************************************************")
    print("Number of Users :", len(np.unique(ratings["userId"])))
    print("Number of movies :", len(np.unique(movie_data["movieId"])))

    print("\n*********************************************************")
    print("\nMovies with highest number of user ratings :\n")

    # Titles must be looked up by movieId. Indexing the title column with a
    # movieId treats it as a row label, which returns an unrelated movie (or
    # raises KeyError) because row labels and movieIds are different things.
    title_by_id = dict(zip(movie_data["movieId"], movie_data["title"]))
    for movie_id, _n_ratings in Counter(ratings["movieId"]).most_common(5):
        print(title_by_id.get(movie_id, "Unknown title (movieId {})".format(movie_id)))

    print("\n*********************************************************")
    print("\nUser who gave more ratings  :")
    print(
        ratings.groupby("userId")
        .count()
        .sort_values(["movieId"], ascending=False)["movieId"]
        .head(5)
    )

In [165]:
# Load the four dataset tables; `status` is the boolean flag returned by loading_data.
ratings, movies_data, links, metadata, status = loading_data(r"/content/drive/MyDrive/Colab Notebooks/Movie/Movie-Recommendation/dataset")
# movies_data["Overview"] = "Nan"

  if self.run_code(code, result):


In [166]:
# (rows, columns) of each loaded table.
ratings.shape, movies_data.shape, links.shape, metadata.shape

((100836, 4), (9742, 3), (9742, 3), (45466, 24))

In [167]:
# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [168]:
# Preview the first rows of the movies table.
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [169]:
import requests # to make TMDB API calls
import locale # to format currency as USD
locale.setlocale( locale.LC_ALL, '' )
import json
import pandas as pd

In [170]:
# TMDB API key. Read it from the TMDB_API_KEY environment variable so the real
# secret is never stored in the notebook; the masked placeholder is the fallback.
API_key = os.environ.get('TMDB_API_KEY', '########################')

def get_data(API_key, Movie_ID):
    """Fetch one movie record from the TMDB API.

    Parameters
    ----------
    API_key : str
        TMDB API key sent as the ``api_key`` query parameter.
    Movie_ID : int or str
        TMDB movie identifier appended to the request path.

    Returns
    -------
    str
        The response body re-serialised as a JSON string when the request
        succeeds with status 200, otherwise the literal string ``"error"``.
    """
    url = 'https://api.themoviedb.org/3/movie/' + str(Movie_ID)
    try:
        # `params` lets requests URL-encode the query string, and the timeout
        # keeps a stalled connection from hanging a bulk download forever.
        response = requests.get(
            url,
            params={'api_key': API_key, 'language': 'en-US'},
            timeout=10,
        )
    except requests.RequestException:
        # Network failures are reported through the same "error" sentinel
        # as non-200 responses, so callers need only one failure path.
        return "error"

    if response.status_code != 200:
        return "error"

    try:
        return json.dumps(response.json())
    except ValueError:
        # The body was not valid JSON.
        return "error"

def write_file(data, api_key=None, fetch=None):
    """Fill an ``Overview`` column by querying one record per ``tmdbId``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``tmdbId`` column. The ``Overview`` column is created
        (or overwritten) on this frame.
    api_key : str, optional
        Key passed to ``fetch``; defaults to the module-level ``API_key``.
    fetch : callable, optional
        ``fetch(api_key, tmdb_id)`` returning a JSON string or ``"error"``;
        defaults to ``get_data``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object with the ``Overview`` column set. Rows whose
        lookup failed, or whose record has no ``overview`` key, get the
        string ``"None"``.
    """
    if api_key is None:
        api_key = API_key
    if fetch is None:
        fetch = get_data

    overviews = []
    for tmdb_id in data["tmdbId"]:
        result = fetch(api_key, tmdb_id)
        overview = "None"
        if result != "error":
            try:
                overview = json.loads(result)['overview']
            except (ValueError, KeyError, TypeError):
                # Undecodable payload or a record without an overview.
                overview = "None"
        overviews.append(overview)

    # Assign the whole column at once: element-wise
    # `data["Overview"].iloc[i] = ...` is chained assignment (it may write to
    # a temporary) and raises KeyError when the column does not exist yet.
    data["Overview"] = overviews
    return data

In [171]:
def using_API_call(movies_data):
  """Join the movies table with the global ``links`` table and fetch overviews.

  The two tables are merged on ``movieId``, rows containing any missing value
  are discarded, ``tmdbId`` is cast to int32, and the resulting frame is handed
  to ``write_file``, whose return value is returned unchanged.
  """
  linked = links.merge(movies_data, on="movieId")

  complete_rows = linked.dropna()
  # The cast to int32 happens only after the dropna above has removed rows with missing values.
  complete_rows["tmdbId"] = complete_rows["tmdbId"].astype("int32")

  return write_file(complete_rows)

def using_metadata(metadata, movies_data, links=None):
  """Combine the metadata table with the movies table.

  Parameters
  ----------
  metadata : pandas.DataFrame
      Metadata table with an ``id`` column (or an already renamed ``movieId``).
      It is not modified.
  movies_data : pandas.DataFrame
      Movies table with ``movieId``, ``title`` and ``genres``. It is not
      modified.
  links : pandas.DataFrame, optional
      Table with ``movieId`` and ``tmdbId`` columns. When given, metadata ids
      are treated as ``tmdbId`` values and translated to ``movieId`` through
      this table before joining. When omitted, metadata ids are joined
      directly against ``movieId``, as before.
      NOTE(review): the separate ``tmdbId`` column in ``links`` suggests the
      metadata ids and ``movieId`` are different id spaces, in which case the
      direct join pairs unrelated rows. Confirm and pass ``links`` if so.

  Returns
  -------
  pandas.DataFrame
      Joined rows with the unused metadata columns removed and every row
      containing a missing value dropped.
  """
  # Rename on copies: in-place renames would alter the caller's frames and
  # make the cell behave differently on a second run.
  meta = metadata.rename(columns={"id": "movieId", "genres": "movie_genres"})
  movies = movies_data.rename(columns={"title": "movie_title"})

  # Discard rows whose id is not numeric instead of dropping hard-coded row
  # labels, which only fit one specific version of the file.
  numeric_ids = pd.to_numeric(meta["movieId"], errors="coerce")
  has_id = numeric_ids.notna()
  meta = meta.loc[has_id].assign(
      movieId=numeric_ids.loc[has_id].astype("int32").to_numpy()
  )

  if links is not None:
    id_map = links[["movieId", "tmdbId"]].dropna().astype({"tmdbId": "int32"})
    meta = (
        meta.rename(columns={"movieId": "tmdbId"})
        .merge(id_map, on="tmdbId")
        .drop(columns="tmdbId")
    )

  unused_columns = [
      'adult', 'belongs_to_collection', 'budget', 'homepage', 'movie_genres',
      'movie_title', 'imdb_id', 'original_language', 'original_title',
      'popularity', 'poster_path', 'production_companies',
      'production_countries', 'release_date', 'revenue', 'runtime',
      'spoken_languages', 'status', 'tagline', 'video',
      'vote_average', 'vote_count',
  ]
  merged = pd.merge(meta, movies, on="movieId")
  # errors="ignore" keeps the function usable on frames lacking some of these columns.
  merged = merged.drop(columns=unused_columns, errors="ignore")

  return merged.dropna()

In [172]:
# List the column names of the metadata table.
metadata.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [173]:
# def using_metadata_and_API(metadata, movies_data, links):
  
#   movies_data = pd.merge(links, movies_data, on="movieId")
#   movies_data = movies_data.dropna()

#   for i in range(movies_data.shape[0]):
    
#     try:
#       temp = metadata[metadata["id"] == movies_data["movieId"].iloc[i]]
#       if len(temp) > 0:
#            overview = temp["Overview"].iloc[0]
#       else:
#           overview = "None"
#           # result = get_data(API_key, movies_data["tmdbId"].iloc[i])
          
#           # if result == "error":
#           #     overview = "None"
#           # else:
#           #     dataset = json.loads(result)
#           #     try:
#           #         overview = dataset['overview']
#           #     except:
#           #         overview = "None"
#       movies_data["Overview"].iloc[i] = overview
#     except Exception as w:
#       print(w)

#   return movies_data

In [174]:
# Build the modelling table. This rebinds `movies_data`, so re-running the
# cell passes the already-merged frame back into using_metadata.
movies_data = using_metadata(metadata, movies_data)
movies_data.shape

(2776, 4)

In [175]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over word 1- to 3-grams with English stop words removed and accents
# stripped; terms that occur in fewer than 3 documents are ignored.
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')

In [176]:
# Fit the vectorizer on the overview text and build the TF-IDF matrix
# (one row per entry of movies_data['overview']).
tfv_matrix = tfv.fit_transform(movies_data['overview'])
print(tfv_matrix.shape)

(2776, 6407)


In [177]:
# Display the sparse-matrix summary of the TF-IDF matrix.
tfv_matrix

<2776x6407 sparse matrix of type '<class 'numpy.float64'>'
	with 65610 stored elements in Compressed Sparse Row format>

In [178]:
# movies_data.to_csv(r"/content/drive/MyDrive/Colab Notebooks/Movie/Movie-Recommendation/dataset/processed_dataset")

In [179]:
from sklearn.metrics.pairwise import sigmoid_kernel

# Compute the sigmoid kernel between every pair of rows of the TF-IDF matrix;
# sig[i][j] is the score between row i and row j.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
# Print the scores of the first row against all rows.
print(sig[0])

[0.7616597  0.76159805 0.76159416 ... 0.76159416 0.76159416 0.76159416]


In [184]:
def give_recomendations(title, sig=None, movies=None):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in the ``title`` column.
    sig : array-like, optional
        Square similarity matrix whose row/column ``k`` corresponds to the
        ``k``-th row (by position) of the movies table. Defaults to the
        module-level ``sig``, resolved at call time so a recomputed matrix
        is picked up without redefining this function.
    movies : pandas.DataFrame, optional
        Table with a ``title`` column; defaults to the module-level
        ``movies_data``.

    Returns
    -------
    pandas.Series or None
        Up to 10 titles ordered by decreasing similarity, excluding the
        queried movie itself. ``None`` (after printing a message) when the
        title is missing or matches more than one row.
    """
    if sig is None:
        sig = globals()["sig"]
    if movies is None:
        movies = movies_data

    # Work with row positions, not index labels: the similarity matrix is
    # positional, and the table's labels have gaps after rows were dropped.
    positions = np.flatnonzero((movies["title"] == title).to_numpy())
    if len(positions) > 1:
        print("More than one movie is register with this name!!!!!!!!!")
        return None
    if len(positions) == 0:
        print("Movie is not registered with us!!!!!!!!!!")
        return None

    position = int(positions[0])
    scores = np.asarray(sig[position]).ravel()

    # Stable descending order, so ties keep their original row order.
    ranking = np.argsort(-scores, kind="stable")
    # Remove the queried movie explicitly; it is not guaranteed to rank first
    # (for example when another row has an equal or higher score).
    ranking = ranking[ranking != position][:10]

    return movies["title"].iloc[ranking]

In [187]:
# Example query: recommendations for the title "Casino".
give_recomendations("Casino",sig)

2368           The Grasshopper
2120       The Las Vegas Story
809             Ocean's Eleven
1692                 Lucky You
8            Leaving Las Vegas
1070              Over the Top
218     The Godfather: Part II
1142            Viva Las Vegas
810             Ocean's Eleven
292              Fools Rush In
Name: title, dtype: object