# Preprocessing and cleaning of api request calls

In [1]:
# Importing libraries 
import os 
import pandas as pd 
import requests

from dotenv import load_dotenv

## Retrieving csv files from movies lens

In [2]:
# Loading access keys 
load_dotenv()

API_KEY = os.getenv("API_KEY")

In [3]:
path1 = "../ml-latest-small/ratings.csv"
path2 = "../ml-latest-small/movies.csv"
path3 = "../ml-latest-small/links.csv"

In [4]:
rating_df = pd.read_csv(path1)

In [5]:
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
movies_df = pd.read_csv(path2)
movies_df.head(10)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [None]:
links_df =  pd.read_csv(path3)

links_df.head(10)

In [None]:
# API url parameters
url = "http://www.omdbapi.com/?i="
movie_id = "tt075314"
key = "apikey=" + API_KEY + "&"


In [None]:
# Checking API response
requests.get("http://www.omdbapi.com/")

In [None]:
# Testing api call with toy story imdbid from links_df
url1 = url + movie_id + "&" + key

In [None]:
# Print json results
print(requests.get(url1).json())

In [12]:
# Clean up the imdb id to allow it to be used in the api call 

for i in range(0,len(links_df["imdbId"])):
    if len(str(links_df["imdbId"][i])) == 6:
        links_df["imdbId"][i] = "tt0" + str(links_df["imdbId"][i])
    elif len(str(links_df["imdbId"][i])) == 5:
        links_df["imdbId"][i] = "tt00" + str(links_df["imdbId"][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  links_df["imdbId"][i] = "tt0" + str(links_df["imdbId"][i])


In [13]:
links_df["imdbId"].head()

0    tt0114709
1    tt0113497
2    tt0113228
3    tt0114885
4    tt0113041
Name: imdbId, dtype: object

In [14]:
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,tt0114709,862.0
1,2,tt0113497,8844.0
2,3,tt0113228,15602.0
3,4,tt0114885,31357.0
4,5,tt0113041,11862.0


In [15]:
# Cut movies down to 200, for testing purposes 
movies_cut_200 = movies_df[0:200]

In [16]:
movies_cut_200.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [17]:
# Combine imbdId into the movie_cut_200 df 
movies_data = movies_cut_200.merge(links_df, on = "movieId")

In [18]:
movies_data = movies_df.merge(links_df, on = "movieId")

In [19]:
movies_data.head()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709,862.0
1,2,Jumanji (1995),Adventure|Children|Fantasy,tt0113497,8844.0
2,3,Grumpier Old Men (1995),Comedy|Romance,tt0113228,15602.0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885,31357.0
4,5,Father of the Bride Part II (1995),Comedy,tt0113041,11862.0


In [20]:
len(movies_data)

9742

In [21]:
Movie_plots = []
Genres = []
Poster = []
Rated = []

for i in range(0,len(movies_data)):
    movie_imdbID = str(movies_data["imdbId"][i])

    url2 = url + movie_imdbID + "&" + key
    
    response = requests.get(url2).json()
    plot = response["Plot"]
    genre = response["Genre"]
    poster = response["Poster"]
    rated = response["Rated"]
    
    # Add the plot to a list
    Movie_plots.append(plot)
    Genres.append(genre)
    Poster.append(poster)
    Rated.append(rated)
    
    

KeyError: 'Plot'

In [None]:
movies_data[""]

In [None]:
# Converting the list of elements into a dictionary then to a dataframe
movie_metadata_dict = {"Plot":Movie_plots, "Genre_imdb":Genres, "Poster":Poster,"Rated":Rated}

test_df = pd.DataFrame(movie_metadata_dict)

test_df.head()

In [None]:
# Joining the movie_data and the new elements into one dataframe
test = pd.concat([movies_data,test_df], axis=1)
test.head()

In [None]:
test["title"]

In [None]:
# Dropping and renaming of columns
movie_metadata = test[["movieId", "imdbId","title","Rated","Genre_imdb","Plot","Poster"]]
movie_metadata = movie_metadata.rename(columns = {"movieId":"MovieID","imdbId":"ImdbID","title":"Title"})

In [None]:
movie_metadata.head()

In [None]:
# Check metadata info
movie_metadata.info()

In [None]:
print( "ImdbID duplicates:" + str(movie_metadata["ImdbID"].duplicated().any()))
print( "Title duplicates:" + str(movie_metadata["Title"].duplicated().any()))

In [None]:
outname = 'movie_metadata'

outdir = '../Resources'
if not os.path.exists(outdir):
    os.mkdir(outdir)

fullname = os.path.join(outdir, outname)    

movie_metadata.to_csv(fullname, index=False)

## Cleaning rating_df

In [None]:
rating_df.info()

In [None]:
rating_df.head()

In [None]:
a = rating_df.groupby(["userId","movieId"]).count()

a.sort_values("rating", ascending=False)

In [None]:
outname = 'user_ratings'

outdir = '../Resources'
if not os.path.exists(outdir):
    os.mkdir(outdir)

fullname = os.path.join(outdir, outname)    

rating_df.to_csv(fullname, index= False)

# Using TMBD api 

In [None]:
import requests

url = "https://api.themoviedb.org/3/movie/862/recommendations?language=en-US&page=1"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIzZTdjNTZkZTFkNjQxZWIyOGVhODRiNWRkODgzOTUxMCIsInN1YiI6IjY1Y2IyNzRmOGMzMTU5MDE3YzM5MGFlNSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.i4FruSg4IDDl--CWmoNT3SUKtiqCgZm4O54CFZS2uzs"
}

response = requests.get(url, headers=headers)

print(response)

In [None]:
response.json()

In [None]:
# Fing movie info

url = "https://api.themoviedb.org/3/find/tt0114709?external_source=imdb_id"

headers = {
    "accept": "application/json",
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIzZTdjNTZkZTFkNjQxZWIyOGVhODRiNWRkODgzOTUxMCIsInN1YiI6IjY1Y2IyNzRmOGMzMTU5MDE3YzM5MGFlNSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.i4FruSg4IDDl--CWmoNT3SUKtiqCgZm4O54CFZS2uzs"
}

response = requests.get(url, headers=headers).json()

print(response)

In [None]:
response

In [None]:
response['movie_results'][0]