In [44]:
import pandas as pd
import ast
import re
from tabulate import tabulate
from IPython.display import Image, HTML

import os
os.getcwd()

movies = pd.read_csv("/Users/carolinechen/Documents/Python Programing/Groupwork/tmdb_5000_movies.csv", delimiter= ",")
allmovies = pd.read_csv("/Users/carolinechen/Documents/Python Programing/Groupwork/movies_metadata.csv", delimiter= ",")


In [45]:
%matplotlib inline
import json
import datetime
import matplotlib.pyplot as plt
import numpy as np

In [46]:
#####################
#                   #
# CLEANING          #
#                   #
#####################

#converting the columns that are into json format into a simple list thanks to library ast
movies['genres'] = [list(set([y['name'] for y in x])) for x in movies['genres'].apply(ast.literal_eval)]
movies['id'] = movies['id'].astype('float')
allmovies = allmovies[~allmovies.id.str.contains("-")] #drop rows where wrogn id format (like date!)
allmovies['id'] = allmovies['id'].astype('float')
movies = pd.merge(movies, allmovies[['id','poster_path']],on='id', how='left')

#Get list of all the genres possible for a movi
genre_possible = set(x for l in movies["genres"] for x in l)
#print(genre_possible)


In [47]:
#Create a column "decade" on your dataframe
movies["release_date"] = pd.to_datetime(movies["release_date"])
movies['year'] = movies['release_date'].dt.year

In [48]:
def release_decade(c):
    if c["year"] >1999 and c["year"] < 2010:
        return 2000
    elif c["year"] >= 2010:
        return 2010
    else:
        return (c["year"] - ((c["year"]//100)*100))//10*10

In [49]:
#movies["decade"].fillna("2010", inplace=True)
#movies.loc[movies['decade'].isnull()] #empty df so there is no longer a problem
movies['decade'] = movies.apply(release_decade, axis=1)

In [50]:
#####################
#                   #
# MODEL             #
#                   #
#####################

#Loop to get what genres the user is interested in
genres_choisi=[]
while True:    # infinite loop
    try:
        element = input("\nWhat movie genre interests you (Press Q to stop)? ")
        if element.upper() == "Q":
            break  # stops the loop
        elif element.lower().capitalize() in genres_choisi: #remove duplicates
            print("This genre is already specified!")
            pass
        elif element.lower().capitalize() not in genre_possible: #Check if a chosen genre does exist in our database, if not remove it from our list and ask the user to input a new one or no
            print(element, " is not a valid genre, please choose from", *genre_possible)
            ajouter = input("Do you want to input a new genre? (Y/N) ")
            if ajouter.upper() == "Y":
                genres_choisi.append(input("\nWhat genre exactly?").lower().capitalize())
            else:
                break
        else:
            genres_choisi.append(element.lower().capitalize())
    finally:
        print("You're interested in ", *genres_choisi, sep = "\n")
    if len(genres_choisi) > 5:
        print("Too many genres! We'll keep", *genres_choisi)
        break

#Loop to get what decades the user is interested in
decade_choisi = []
while True:    # infinite loop
    try:
        element = int(input("\nWhich decade interests you (Press 0 to stop)? "))
        if element == 0:
            break  # stops the loop
        elif element in decade_choisi: #remove duplicates
            print("This decade is already specified!")
            pass
        else:
            decade_choisi.append(element)
    except ValueError:
        print("Input a number (10,20,..., 2000, 2010) please!")
    finally:
        print("You're interested in movies from era(s):", *decade_choisi, sep = "\n")
    if len(decade_choisi) > 5:
        print("You've already chosen 5 decades! We'll keep", *decade_choisi)
        break


What movie genre interests you (Press Q to stop)? action
You're interested in 
Action

What movie genre interests you (Press Q to stop)? 
  is not a valid genre, please choose from Drama Thriller Western History Action Family Animation Crime War Music Horror Science Fiction Documentary Fantasy Adventure Foreign Comedy Mystery Romance TV Movie
Do you want to input a new genre? (Y/N) N
You're interested in 
Action

Which decade interests you (Press 0 to stop)? 90
You're interested in movies from era(s):
90

Which decade interests you (Press 0 to stop)? 0
You're interested in movies from era(s):
90


In [51]:
decade_choisi = [float(x) for x in decade_choisi]

#get the poster image
base_poster_url = 'http://image.tmdb.org/t/p/w185/'
movies['poster_path'] = "<img src='" + base_poster_url + movies['poster_path'] + "' style='height:100px;'>"

#Dataframe with only movies in selected decade
df1 = movies[movies['decade'].isin(decade_choisi)]

#Générer une dataframe correspondant aux genres voulues 
df2 = df1[[all(x in y for x in genres_choisi) for y in df1["genres"].values]]

#Sort movies based on score calculated above and print a beautiful table of the TOP10
df2 = df2.sort_values('vote_average', ascending=False)

print("Out of", str(len(df1.index)), "movies meeting your criteria, here's the top 10..." )
print(tabulate(df2[['title', "year", 'vote_average', 'genres']].head(10), headers=["Movie", "Year","Average score", "Genre"], showindex="never", tablefmt="fancy_grid"))

Out of 778 movies meeting your criteria, here's the top 10...
╒════════════════════════════╤════════╤═════════════════╤═══════════════════════════════════════════════════════════════════╕
│ Movie                      │   Year │   Average score │ Genre                                                             │
╞════════════════════════════╪════════╪═════════════════╪═══════════════════════════════════════════════════════════════════╡
│ One Man's Hero             │   1999 │             9.3 │ ['History', 'Drama', 'Western', 'Action']                         │
├────────────────────────────┼────────┼─────────────────┼───────────────────────────────────────────────────────────────────┤
│ The Matrix                 │   1999 │             7.9 │ ['Science Fiction', 'Action']                                     │
├────────────────────────────┼────────┼─────────────────┼───────────────────────────────────────────────────────────────────┤
│ Bound by Honor             │   1993 │             7.7 

In [55]:
#Parfois les images ne s'affichent pas, réessayez. 
#df2.columns = ["Poster", "Movie title","Average score", "Genre"]
df2 = df2[["poster_path","title", "vote_average", "genres"]].head(20)
display(HTML(df2.to_html(escape = False, index = False)))

poster_path,title,vote_average,genres
,9.3,"[History, Drama, Western, Action]",
,Bound by Honor,7.7,"[Crime, Drama, Thriller, Action]"
,Terminator 2: Judgment Day,7.7,"[Thriller, Science Fiction, Action]"
,Tombstone,7.4,"[Drama, Adventure, Western, History, Action]"
,The Crow,7.3,"[Thriller, Fantasy, Action]"
,The Boondock Saints,7.2,"[Thriller, Crime, Action]"
,Menace II Society,7.2,"[Crime, Drama, Action]"
,The Fugitive,7.2,"[Crime, Adventure, Mystery, Action, Thriller]"
,Total Recall,7.1,"[Adventure, Science Fiction, Action]"
,Star Trek: First Contact,7.0,"[Thriller, Adventure, Science Fiction, Action]"
