# Preprocessing and cleaning of API request data

In [1]:
# Importing libraries 
import os 
import pandas as pd 
import requests

from dotenv import load_dotenv

## Retrieving CSV files from MovieLens

In [2]:
# Loading access keys
# load_dotenv() reads a local .env file (if present) into the process environment.
load_dotenv()

API_KEY = os.getenv("API_KEY")

# Fail fast: os.getenv returns None when the variable is missing, which would
# otherwise only surface later as a TypeError while building the request URL.
if not API_KEY:
    raise RuntimeError(
        "API_KEY is not set; define it in the environment or in a .env file."
    )

In [3]:
# Relative locations of the three source CSV files (ratings, movies, links)
path1, path2, path3 = (
    "../ml-latest-small/ratings.csv",
    "../ml-latest-small/movies.csv",
    "../ml-latest-small/links.csv",
)

In [4]:
# Load the user ratings table from the ratings CSV
rating_df = pd.read_csv(filepath_or_buffer=path1)

In [5]:
# Preview the first five rating rows
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
# Load the movies table (movieId, title, genres) and preview the first rows
movies_df = pd.read_csv(filepath_or_buffer=path2)
movies_df.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
# Load the links table, which maps each movieId to its imdbId and tmdbId,
# and preview the first rows
links_df = pd.read_csv(filepath_or_buffer=path3)
links_df.head(n=5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [8]:
# API url parameters
# HTTPS is used so the API key carried in the query string is not sent in
# clear text.
url = "https://www.omdbapi.com/?i="
# NOTE(review): this test ID has only six digits after "tt"; the request built
# from it is answered with "Incorrect IMDb ID.", which is what motivates the
# imdbId clean-up step further down.
movie_id = "tt075314"
# Query-string fragment carrying the key; the trailing "&" is kept because the
# URL-building cells append this fragment as-is.
key = "apikey=" + API_KEY + "&"


In [9]:
# Checking API response
# HTTPS plus an explicit timeout: without a timeout, requests can block the
# kernel indefinitely when the service does not answer.
requests.get("https://www.omdbapi.com/", timeout=10)

<Response [200]>

In [10]:
# Build the full request URL for the single test movie ID
url1 = f"{url}{movie_id}&{key}"

In [11]:
# Print json results
# The timeout stops this cell from hanging if the API does not respond.
print(requests.get(url1, timeout=10).json())

{'Response': 'False', 'Error': 'Incorrect IMDb ID.'}


In [15]:
# Clean up the imdb id to allow it to be used in the api call.
#
# The API accepts IDs such as "tt0114709": the prefix "tt" followed by the
# numeric id zero-padded to seven digits. The whole column is converted in one
# vectorised step, which
#   * pads ids of ANY length (not only 5- and 6-digit ones) and prefixes ids
#     that already have seven or more digits,
#   * avoids chained-indexing assignment (links_df["imdbId"][i] = ...), which
#     triggers SettingWithCopyWarning and is not guaranteed to write through,
#   * is safe to re-run: an existing "tt" prefix is stripped before padding.
imdb_digits = links_df["imdbId"].astype(str).str.replace(r"^tt", "", regex=True)
links_df["imdbId"] = "tt" + imdb_digits.str.zfill(7)

In [16]:
# Preview the first five cleaned IMDb ids
links_df.loc[:, "imdbId"].head(n=5)

0    tt0114709
1    tt0113497
2    tt0113228
3    tt0114885
4    tt0113041
Name: imdbId, dtype: object

In [17]:
# tmdbId is not needed for the OMDb lookups, so drop it.
# errors="ignore" keeps the cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
links_df = links_df.drop(columns="tmdbId", errors="ignore")

In [18]:
# Preview the links table after the clean-up
links_df.head(n=5)

Unnamed: 0,movieId,imdbId
0,1,tt0114709
1,2,tt0113497
2,3,tt0113228
3,4,tt0114885
4,5,tt0113041


In [19]:
# Keep only the first 200 movies (by position) so test runs stay small
movies_cut_200 = movies_df.iloc[:200]

In [20]:
# Preview the reduced movies table
movies_cut_200.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
# Attach each movie's imdbId by joining the 200-movie subset with links_df
# on their shared movieId column
movies_data = pd.merge(movies_cut_200, links_df, on="movieId")

In [22]:
# Preview the merged movies + imdbId table
movies_data.head(n=5)

Unnamed: 0,movieId,title,genres,imdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709
1,2,Jumanji (1995),Adventure|Children|Fantasy,tt0113497
2,3,Grumpier Old Men (1995),Comedy|Romance,tt0113228
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885
4,5,Father of the Bride Part II (1995),Comedy,tt0113041


In [32]:
# Fetch plot, genre, poster and rating for every movie in movies_data.
#
# The four lists stay aligned with the rows of movies_data: exactly one entry
# is appended per movie, and a field the API did not return is stored as None
# so that a single bad ID does not abort the whole run with a KeyError.
Movie_plots = []
Genres = []
Poster = []
Rated = []

# Iterate over the values directly, not movies_data["imdbId"][i], so the
# loop does not depend on the frame having a 0..n-1 index.
for movie_imdbID in movies_data["imdbId"].astype(str):
    url2 = url + movie_imdbID + "&" + key

    # The timeout keeps one stalled request from hanging the notebook;
    # raise_for_status surfaces HTTP-level failures (for example a rejected
    # API key) here instead of as a confusing error further down.
    http_response = requests.get(url2, timeout=10)
    http_response.raise_for_status()
    response = http_response.json()

    # Error payloads look like {'Response': 'False', 'Error': ...} and carry
    # none of the metadata keys, hence .get() and not [] indexing.
    Movie_plots.append(response.get("Plot"))
    Genres.append(response.get("Genre"))
    Poster.append(response.get("Poster"))
    Rated.append(response.get("Rated"))
    
    

In [24]:
# Preview the first five fetched plots.
# (The previous expression, movies_data[""], raises KeyError on a fresh run
# because no column has an empty name; the stored output is this list slice.)
Movie_plots[:5]

["A cowboy doll is profoundly threatened and jealous when a new spaceman action figure supplants him as top toy in a boy's bedroom.",
 'When two kids find and play a magical board game, they release a man trapped in it for decades - and a host of dangers that can only be stopped by finishing the game.',
 "John and Max resolve to save their beloved bait shop from turning into an Italian restaurant, just as its new female owner catches Max's attention.",
 "Based on Terry McMillan's novel, this film follows four very different African-American women and their relationships with men.",
 "George Banks must deal not only with his daughter's pregnancy, but also with his wife's."]

In [36]:
# Collect the four fetched lists under their column names, then build a
# DataFrame with one column per metadata field
movie_metadata_dict = dict(
    Plot=Movie_plots,
    Genre_imdb=Genres,
    Poster=Poster,
    Rated=Rated,
)

test_df = pd.DataFrame(data=movie_metadata_dict)

test_df.head(n=5)

Unnamed: 0,Plot,Genre_imdb,Poster,Rated
0,A cowboy doll is profoundly threatened and jea...,"Animation, Adventure, Comedy",https://m.media-amazon.com/images/M/MV5BMDU2ZW...,G
1,When two kids find and play a magical board ga...,"Adventure, Comedy, Family",https://m.media-amazon.com/images/M/MV5BZTk2Zm...,PG
2,John and Max resolve to save their beloved bai...,"Comedy, Romance",https://m.media-amazon.com/images/M/MV5BMjQxM2...,PG-13
3,"Based on Terry McMillan's novel, this film fol...","Comedy, Drama, Romance",https://m.media-amazon.com/images/M/MV5BYzcyMD...,R
4,George Banks must deal not only with his daugh...,"Comedy, Family, Romance",https://m.media-amazon.com/images/M/MV5BOTEyNz...,PG


In [41]:
# Place the fetched metadata columns next to the movie columns; the two
# frames are lined up on their row index
test = pd.concat(objs=[movies_data, test_df], axis="columns")
test.head(n=5)

Unnamed: 0,movieId,title,genres,imdbId,Plot,Genre_imdb,Poster,Rated
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,tt0114709,A cowboy doll is profoundly threatened and jea...,"Animation, Adventure, Comedy",https://m.media-amazon.com/images/M/MV5BMDU2ZW...,G
1,2,Jumanji (1995),Adventure|Children|Fantasy,tt0113497,When two kids find and play a magical board ga...,"Adventure, Comedy, Family",https://m.media-amazon.com/images/M/MV5BZTk2Zm...,PG
2,3,Grumpier Old Men (1995),Comedy|Romance,tt0113228,John and Max resolve to save their beloved bai...,"Comedy, Romance",https://m.media-amazon.com/images/M/MV5BMjQxM2...,PG-13
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,tt0114885,"Based on Terry McMillan's novel, this film fol...","Comedy, Drama, Romance",https://m.media-amazon.com/images/M/MV5BYzcyMD...,R
4,5,Father of the Bride Part II (1995),Comedy,tt0113041,George Banks must deal not only with his daugh...,"Comedy, Family, Romance",https://m.media-amazon.com/images/M/MV5BOTEyNz...,PG


In [56]:
# Keep only the columns needed downstream (in their final order) and give the
# identifier/title columns their output names
movie_metadata = (
    test.loc[:, ["movieId", "imdbId", "title", "Rated", "Genre_imdb", "Plot", "Poster"]]
    .rename(columns={"movieId": "MovieID", "imdbId": "ImdbID", "title": "Title"})
)

In [57]:
# Preview the final metadata table
movie_metadata.head(n=5)

Unnamed: 0,MovieID,ImdbID,Title,Rated,Genre_imdb,Plot,Poster
0,1,tt0114709,Toy Story (1995),G,"Animation, Adventure, Comedy",A cowboy doll is profoundly threatened and jea...,https://m.media-amazon.com/images/M/MV5BMDU2ZW...
1,2,tt0113497,Jumanji (1995),PG,"Adventure, Comedy, Family",When two kids find and play a magical board ga...,https://m.media-amazon.com/images/M/MV5BZTk2Zm...
2,3,tt0113228,Grumpier Old Men (1995),PG-13,"Comedy, Romance",John and Max resolve to save their beloved bai...,https://m.media-amazon.com/images/M/MV5BMjQxM2...
3,4,tt0114885,Waiting to Exhale (1995),R,"Comedy, Drama, Romance","Based on Terry McMillan's novel, this film fol...",https://m.media-amazon.com/images/M/MV5BYzcyMD...
4,5,tt0113041,Father of the Bride Part II (1995),PG,"Comedy, Family, Romance",George Banks must deal not only with his daugh...,https://m.media-amazon.com/images/M/MV5BOTEyNz...


In [61]:
# Check metadata info: column dtypes and non-null counts, to confirm that no
# field is missing for any of the movies
movie_metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   MovieID     200 non-null    int64 
 1   ImdbID      200 non-null    object
 2   Title       200 non-null    object
 3   Rated       200 non-null    object
 4   Genre_imdb  200 non-null    object
 5   Plot        200 non-null    object
 6   Poster      200 non-null    object
dtypes: int64(1), object(6)
memory usage: 11.1+ KB


In [66]:
# Report whether any IMDb id or title occurs more than once in the metadata
has_dup_imdb = movie_metadata["ImdbID"].duplicated().any()
has_dup_title = movie_metadata["Title"].duplicated().any()
print(f"ImdbID duplicates:{has_dup_imdb}")
print(f"Title duplicates:{has_dup_title}")

ImdbID duplicates:False
Title duplicates:False


In [92]:
# Write the movie metadata table to the shared Resources folder.
# NOTE(review): the file name has no ".csv" extension; it is left unchanged
# here because other code may read this exact path — confirm before renaming.
outname = 'movie_metadata'

outdir = '../Resources'
# makedirs(exist_ok=True) creates the folder (and any missing parents) and is
# a no-op when it already exists, replacing the check-then-mkdir sequence.
os.makedirs(outdir, exist_ok=True)

fullname = os.path.join(outdir, outname)

# index=False: the row index carries no information, so it is not written
movie_metadata.to_csv(fullname, index=False)

## Cleaning rating_df

In [67]:
# Summarise the ratings table (column dtypes and non-null counts) before cleaning
rating_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100836 entries, 0 to 100835
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   userId     100836 non-null  int64  
 1   movieId    100836 non-null  int64  
 2   rating     100836 non-null  float64
 3   timestamp  100836 non-null  int64  
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


In [68]:
# Preview the first five rating rows
rating_df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [89]:
# Count rows per (userId, movieId) pair and list the largest counts first;
# any count above 1 would mean the same user rated the same movie repeatedly
a = rating_df.groupby(by=["userId", "movieId"]).count()

a.sort_values(by="rating", ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,1,1
434,4993,1,1
434,4963,1,1
434,4896,1,1
434,4886,1,1
...,...,...,...
227,58303,1,1
227,56782,1,1
227,56367,1,1
227,55820,1,1


In [91]:
# Write the ratings table to the shared Resources folder.
# NOTE(review): the file name has no ".csv" extension; it is left unchanged
# here because other code may read this exact path — confirm before renaming.
outname = 'user_ratings'

outdir = '../Resources'
# makedirs(exist_ok=True) creates the folder (and any missing parents) and is
# a no-op when it already exists, replacing the check-then-mkdir sequence.
os.makedirs(outdir, exist_ok=True)

fullname = os.path.join(outdir, outname)

# index=False: the row index carries no information, so it is not written
rating_df.to_csv(fullname, index=False)