### Just like preprocessing 3, we'll scrape movie data for 2022 using wikipead and TMDB and add it to "movie_data_upto_2021.csv". 

In [32]:
import pandas as pd
import numpy as np

#### Extracting some features of 2022 movies from Wikipedia
#### NOTE: We won't scrape the entire data for 2022 as the year hasn't ended yet. We'll scrape data of only those movies, which have been relesed.

In [33]:
# extracting tables from wikipedia page
link = "https://en.wikipedia.org/wiki/List_of_American_films_of_2022"
#  the argument 2,3,4,5 here indicates the table number
# We are adding movie data only upto march as the rest of the movie hasn't released yet
t1_2022 = pd.read_html(link, header=0)[3]

In [34]:
table_2022 = t1_2022[["Title", "Cast and crew"]]

In [35]:
from tmdbv3api import TMDb
import json
import requests
tmdb = TMDb()
tmdb.api_key = "4bfa18d4dd4f5d041a3d88cfb6728654"

In [36]:
from tmdbv3api import Movie
tmdb_movie = Movie()
def extract_genres(x):
    genres = []
    result = tmdb_movie.search(x)
    if not result:
        # This "if not result" condition is specifically added for 2020 data as for some movie we don't get result on TMBb
        return np.NaN
    else:
        movie_id = result[0].id
        response = requests.get('https://api.themoviedb.org/3/movie/{}?api_key={}'.format(movie_id,tmdb.api_key))
        # extracting data in json format
        data_json = response.json()
        if data_json['genres']:
        # Actually there's a key "genres" which basically holds a list of dictionaries and each dictionary has one genre
        # paired to key "name". So we here loop thorugh each dictionary and extract each & every genre of a given movie.    
            genre_str = " " 
            for i in range(0,len(data_json['genres'])):
                genres.append(data_json['genres'][i]['name'])
            return genre_str.join(genres)
        else:
            return np.NaN

In [37]:
table_2022['genres'] = table_2022['Title'].map(lambda x: extract_genres(str(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table_2022['genres'] = table_2022['Title'].map(lambda x: extract_genres(str(x)))


In [38]:
table_2022.head()

Unnamed: 0,Title,Cast and crew,genres
0,The 355,Simon Kinberg (director/screenplay); Theresa R...,Action Thriller
1,The Legend of La Llorona,Patricia Harris Seeley (director/screenplay); ...,Horror Thriller
2,The Commando,Asif Akbar (director); Koji Steven Sakai (scre...,Action Crime Thriller
3,Scream,"Matt Bettinelli-Olpin, Tyler Gillett (director...",Horror Mystery Thriller
4,Hotel Transylvania: Transformania,"Jennifer Kluska, Derek Drymon (directors); Amo...",Animation Family Fantasy Comedy Adventure


In [39]:
movie_data_2022_march = table_2022

In [40]:
def extract_director_name(x):
    if " (director)" in x:
        return x.split(" (director)")[0]
    elif " (director/screenplay)" in x:
        return x.split(" (director/screenplay)")[0]
    # This " (directors/screenplay)" & " (directors)" condition is specifically added for 2022 data.
    elif " (directors/screenplay)" in x:
        directors = x.split(" (directors/screenplay)")[0]
        directors_list = directors.split(", ")
        return " ".join(directors_list)
    elif " (directors)" in x:
        directors = x.split(" (directors)")[0]
        directors_list = directors.split(", ")
        return " ".join(directors_list)
    else:
        return x.split(" (directors)")[0]

In [41]:
movie_data_2022_march["director_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_director_name(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_2022_march["director_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_director_name(x))


In [42]:
def extract_actor_names(x, actor_num):
    # here we are separating actor names from rest of the crew.    
    cast = x.split("screenplay); ")[-1]
    if actor_num == 0:
        return cast.split(", ")[actor_num]
    elif actor_num == 1:
        if len(cast.split(", ")) < 2:
            return np.NaN
        else:
            return cast.split(", ")[actor_num]
    elif actor_num == 2:
        if len(cast.split(", ")) < 3:
            return np.NaN
        else:
            return cast.split(", ")[actor_num]
    

In [43]:
movie_data_2022_march["actor_1_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_actor_names(x, 0))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movie_data_2022_march["actor_1_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_actor_names(x, 0))


In [44]:
movie_data_2022_march["actor_2_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_actor_names(x, 1))

In [45]:
movie_data_2022_march["actor_3_name"] = movie_data_2022_march["Cast and crew"].map(lambda x: extract_actor_names(x, 2))

In [46]:
movie_data_2022_march

Unnamed: 0,Title,Cast and crew,genres,director_name,actor_1_name,actor_2_name,actor_3_name
0,The 355,Simon Kinberg (director/screenplay); Theresa R...,Action Thriller,Simon Kinberg,Jessica Chastain,Lupita Nyong'o,Penélope Cruz
1,The Legend of La Llorona,Patricia Harris Seeley (director/screenplay); ...,Horror Thriller,Patricia Harris Seeley,Autumn Reeser,Antonio Cupo,Danny Trejo
2,The Commando,Asif Akbar (director); Koji Steven Sakai (scre...,Action Crime Thriller,Asif Akbar,Mickey Rourke,Michael Jai White,
3,Scream,"Matt Bettinelli-Olpin, Tyler Gillett (director...",Horror Mystery Thriller,"Matt Bettinelli-Olpin, Tyler Gillett",Melissa Barrera,Mason Gooding,Jenna Ortega
4,Hotel Transylvania: Transformania,"Jennifer Kluska, Derek Drymon (directors); Amo...",Animation Family Fantasy Comedy Adventure,Jennifer Kluska Derek Drymon,Andy Samberg,Selena Gomez,Kathryn Hahn
...,...,...,...,...,...,...,...
56,The Lost City,"Aaron Nee, Adam Nee (directors/screenplay); Or...",Action Adventure Comedy Romance,Aaron Nee Adam Nee,Sandra Bullock,Channing Tatum,Daniel Radcliffe
57,Everything Everywhere All at Once,"Daniel Kwan, Daniel Scheinert (directors/scree...",Action Science Fiction Comedy,Daniel Kwan Daniel Scheinert,Michelle Yeoh,Stephanie Hsu,Ke Huy Quan
58,Infinite Storm,"Małgorzata Szumowska, Michał Englert (director...",Drama Thriller,Małgorzata Szumowska Michał Englert,Naomi Watts,Billy Howle,Sophie Okonedo
59,7 Days,Roshan Sethi (director/screenplay); Karan Soni...,Thriller Drama Crime,Roshan Sethi,Karan Soni,Geraldine Viswanathan,Mark Duplass


In [47]:
movie_data_2022_march = movie_data_2022_march.rename(columns={"Title": "movie_title"})

In [48]:
new_movie_data_2022_march = movie_data_2022_march.loc[:, ["movie_title", "genres", "director_name", "actor_1_name", "actor_2_name", "actor_3_name"]]

In [49]:
new_movie_data_2022_march["actor_2_name"] = new_movie_data_2022_march["actor_2_name"].replace(np.NaN, "unknown")
new_movie_data_2022_march["actor_3_name"] = new_movie_data_2022_march["actor_3_name"].replace(np.NaN, "unknown")

In [50]:
new_movie_data_2022_march["movie_title"] = new_movie_data_2022_march["movie_title"].str.lower()

In [51]:
new_movie_data_2022_march["combine"] = new_movie_data_2022_march["genres"] +" "+ new_movie_data_2022_march["director_name"] +" "+ new_movie_data_2022_march["actor_1_name"] +" "+ new_movie_data_2022_march["actor_2_name"] +" "+new_movie_data_2022_march["actor_3_name"]

In [52]:
new_movie_data_2022_march

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combine
0,the 355,Action Thriller,Simon Kinberg,Jessica Chastain,Lupita Nyong'o,Penélope Cruz,Action Thriller Simon Kinberg Jessica Chastain...
1,the legend of la llorona,Horror Thriller,Patricia Harris Seeley,Autumn Reeser,Antonio Cupo,Danny Trejo,Horror Thriller Patricia Harris Seeley Autumn ...
2,the commando,Action Crime Thriller,Asif Akbar,Mickey Rourke,Michael Jai White,unknown,Action Crime Thriller Asif Akbar Mickey Rourke...
3,scream,Horror Mystery Thriller,"Matt Bettinelli-Olpin, Tyler Gillett",Melissa Barrera,Mason Gooding,Jenna Ortega,"Horror Mystery Thriller Matt Bettinelli-Olpin,..."
4,hotel transylvania: transformania,Animation Family Fantasy Comedy Adventure,Jennifer Kluska Derek Drymon,Andy Samberg,Selena Gomez,Kathryn Hahn,Animation Family Fantasy Comedy Adventure Jenn...
...,...,...,...,...,...,...,...
56,the lost city,Action Adventure Comedy Romance,Aaron Nee Adam Nee,Sandra Bullock,Channing Tatum,Daniel Radcliffe,Action Adventure Comedy Romance Aaron Nee Adam...
57,everything everywhere all at once,Action Science Fiction Comedy,Daniel Kwan Daniel Scheinert,Michelle Yeoh,Stephanie Hsu,Ke Huy Quan,Action Science Fiction Comedy Daniel Kwan Dani...
58,infinite storm,Drama Thriller,Małgorzata Szumowska Michał Englert,Naomi Watts,Billy Howle,Sophie Okonedo,Drama Thriller Małgorzata Szumowska Michał Eng...
59,7 days,Thriller Drama Crime,Roshan Sethi,Karan Soni,Geraldine Viswanathan,Mark Duplass,Thriller Drama Crime Roshan Sethi Karan Soni G...


In [53]:
movie_data_upto_2021 = pd.read_csv("movie_data_upto_2021.csv")

In [54]:
movie_data_upto_2021

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combine
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...
...,...,...,...,...,...,...,...
6500,the king's man,Action Adventure Thriller War Mystery,Matthew Vaughn,Ralph Fiennes,Gemma Arterton,Rhys Ifans,Action Adventure Thriller War Mystery Matthew ...
6501,the tragedy of macbeth,Drama War,Joel Coen,Denzel Washington,Frances McDormand,Bertie Carvel,Drama War Joel Coen Denzel Washington Frances ...
6502,a journal for jordan,Drama Romance,Denzel Washington,Michael B. Jordan,Chanté Adams,Jalon Christian,Drama Romance Denzel Washington Michael B. Jor...
6503,american underdog,Drama,Erwin brothers,Zachary Levi,Anna Paquin,Dennis Quaid,Drama Erwin brothers Zachary Levi Anna Paquin ...


In [55]:
movie_data_upto_2022_march = movie_data_upto_2021.append(new_movie_data_2022_march, ignore_index=True)

In [56]:
movie_data_upto_2022_march

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combine
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...
...,...,...,...,...,...,...,...
6561,the lost city,Action Adventure Comedy Romance,Aaron Nee Adam Nee,Sandra Bullock,Channing Tatum,Daniel Radcliffe,Action Adventure Comedy Romance Aaron Nee Adam...
6562,everything everywhere all at once,Action Science Fiction Comedy,Daniel Kwan Daniel Scheinert,Michelle Yeoh,Stephanie Hsu,Ke Huy Quan,Action Science Fiction Comedy Daniel Kwan Dani...
6563,infinite storm,Drama Thriller,Małgorzata Szumowska Michał Englert,Naomi Watts,Billy Howle,Sophie Okonedo,Drama Thriller Małgorzata Szumowska Michał Eng...
6564,7 days,Thriller Drama Crime,Roshan Sethi,Karan Soni,Geraldine Viswanathan,Mark Duplass,Thriller Drama Crime Roshan Sethi Karan Soni G...


In [57]:
movie_data_upto_2022_march.isna().sum()

movie_title      0
genres           0
director_name    0
actor_1_name     0
actor_2_name     0
actor_3_name     0
combine          0
dtype: int64

In [58]:
movie_data_upto_2022_march.movie_title = movie_data_upto_2022_march.movie_title.str.strip()
# removing "leading and trailing space" and also the "null terminationg character(\xa0)" of the column

In [59]:
movie_data_upto_2022_march.to_csv("movie_data_upto_2022_march.csv", index= False)