# Import des librairies 

In [1]:
import pandas as pd 
from IPython.display import clear_output
import requests
from bs4 import BeautifulSoup
from urllib.request import Request, urlopen
import re
import csv
import numpy as np
import re
from tqdm import tqdm
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from concurrent import futures
import dateparser

import warnings
# Silence every warning for the whole notebook (this also hides library deprecation notices).
warnings.filterwarnings('ignore')

data_path="Data/"#Folder where the CSV files of this notebook are read from and written to

In [2]:
def check_nan(df):
    """Print, for each column of ``df``, the number of missing (NaN) values it holds."""
    for column in df.columns.tolist():
        missing_count = df[column].isna().sum()
        print("Valeurs nan dans ", column, " : ", missing_count, sep="")
        
def check_unique(df):
    """Print, for each column of ``df``, the number of distinct non-null values it holds."""
    for column in df.columns.tolist():
        distinct_count = df[column].nunique()
        print("Valeurs uniques dans ", column, " : ", distinct_count, sep="")

# Chargement des données déjà existantes

In [3]:
# Load the already existing title/id table; its "id" column is what the scraping functions iterate over.
ids_csv_path = data_path+"allocine_titre_id.csv"
df_ids = pd.read_csv(ids_csv_path)
df_ids.head()

Unnamed: 0,titre,id
0,avatar : la voie de l'eau,178014
1,les banshees d'inisherin,281293
2,tempête,289305
3,m3gan,266320
4,le tourbillon de la vie,288544


# Scraping des données de la page casting pour chaque film 

## Fonctions 

In [4]:
def first_connexion(headless=False) :
    """Start a Chrome driver, open the Allociné home page and return the driver.

    Parameters
    ----------
    headless : bool, default False
        When true, Chrome is started headless with a 1920x1080 window and the
        cookie banner is left untouched. Otherwise the banner button is clicked
        when it is present on the page.

    Returns
    -------
    The started Selenium Chrome driver. The caller is responsible for shutting
    it down.

    Raises
    ------
    Any exception raised while loading the page or clicking the banner is
    re-raised after the browser has been shut down, so a failed start does not
    leave a Chrome process behind.
    """
    options = webdriver.ChromeOptions()
    if headless :
        options.add_argument('-headless')
        options.add_argument("--window-size=1920,1080")

    driver = webdriver.Chrome(ChromeDriverManager().install(),options=options)
    try :
        driver.get("https://www.allocine.fr/")
        time.sleep(1)
        if not headless :
            # find_elements returns an empty list (instead of raising) when the banner is absent
            cookies_boxes = driver.find_elements(By.XPATH, '//*[@id="cmp-main"]/button[2]')
            if cookies_boxes :
                cookies_boxes[0].click()
        time.sleep(5)
    except BaseException :
        # Do not leak the browser when the set-up fails or is interrupted
        driver.quit()
        raise
    return driver

In [5]:
def init_save_files(number) :
    """Create (or reset) the per-driver CSV files that receive the scraped data.

    For every index ``i`` in ``range(number)`` two header-only files are written
    under ``data_path + "allocine_casting/"``: ``cast_movies_{i}.csv`` and
    ``people_{i}.csv``. Files that already exist under these names are overwritten.
    The output folder is created when it does not exist yet.

    Parameters
    ----------
    number : int
        Number of drivers, i.e. number of file pairs to initialise.
    """
    import os  # local import so the function does not depend on an extra notebook-level import

    # to_csv does not create missing folders; without this a fresh checkout fails here
    os.makedirs(data_path+"allocine_casting", exist_ok=True)

    # The empty frames only carry the column headers and are the same for every driver
    df_movies_init = pd.DataFrame(columns= ['movie_id','realisateurs','scenaristes','acteurs'])
    df_people_init = pd.DataFrame(columns=["id","nom"])

    for i in range(number) :
        df_movies_init.to_csv(data_path+f"allocine_casting/cast_movies_{i}.csv",index=False)
        df_people_init.to_csv(data_path+f"allocine_casting/people_{i}.csv",index=False)

In [6]:
def init_movies_by_driver(nb_drivers, len_data) :
    """Split ``len_data`` row positions into ``nb_drivers`` consecutive ranges.

    Returns a dict mapping each driver index to ``{"start": ..., "end": ...}``.
    Every range spans ``ceil(len_data / nb_drivers)`` positions, so the last
    ``end`` can be larger than ``len_data``.
    """
    chunk_size = np.ceil(len_data/nb_drivers)
    return {
        driver_index : {
            "start" : int(driver_index * chunk_size),
            "end" : int((driver_index + 1) * chunk_size),
        }
        for driver_index in range(nb_drivers)
    }

In [7]:
def workload_driver(info_driver) :
    """Scrape one batch of films with a dedicated headless driver and append it to that driver's CSV files.

    Parameters
    ----------
    info_driver : dict
        ``"id"`` selects the driver's output files; ``"index_start"`` and
        ``"index_end"`` bound the rows of the global ``df_ids`` assigned to it.
        At most ``nb_films_by_iter`` (global) films are processed per call.

    Returns
    -------
    None. The scraped rows are appended to ``cast_movies_{id}.csv`` and
    ``people_{id}.csv`` under ``data_path + "allocine_casting/"``.
    """
    data_processed = df_ids.iloc[info_driver["index_start"]:info_driver["index_end"]].iloc[:nb_films_by_iter]
    if data_processed.empty :
        return  # nothing left for this driver: do not start a browser for an empty batch

    new_data_movie = []
    new_data_people= []
    driver= first_connexion(headless=True)
    try :
        for index, movie_id in enumerate(data_processed["id"]) :
            new_data = get_data_casting(movie_id, driver)
            new_data_movie.append(new_data["movie"])
            new_data_people +=new_data["people"]
            if index % 20 == 0 :
                if info_driver["id"] ==0 :
                    clear_output()
                print(f"Driver n°{info_driver['id']} : {index} films")
    finally :
        # quit() (unlike close()) also ends the chromedriver process, and it now runs even when scraping fails
        driver.quit()

    # Append the new rows to what previous iterations already saved for this driver
    movies_path = data_path+f"allocine_casting/cast_movies_{info_driver['id']}.csv"
    people_path = data_path+f"allocine_casting/people_{info_driver['id']}.csv"

    temp_movies = pd.read_csv(movies_path)
    temp_people = pd.read_csv(people_path)

    pd.concat([temp_movies, pd.DataFrame(new_data_movie)]).to_csv(movies_path,index=False)
    pd.concat([temp_people, pd.DataFrame(new_data_people)]).to_csv(people_path,index=False)

In [8]:
def _collect_people(section_soup, link_attrs, people) :
    """Append every person linked in ``section_soup`` to ``people`` and return their ids in page order."""
    person_ids = []
    for link in section_soup.find_all("a", link_attrs) :
        href = link.get("href")
        if not href :
            continue  # an anchor without target carries no person id
        person_id = re.sub(r"\D", "", href)  # the id is made of the digits of the link
        people.append({"id":person_id, "nom":link.text})
        person_ids.append(person_id)
    return person_ids


def get_data_casting(movie_id, driver) :
    """Scrape the casting page of one film with the given driver.

    Parameters
    ----------
    movie_id
        Film identifier inserted in the casting page URL.
    driver
        Selenium driver used to load the page.

    Returns
    -------
    dict
        ``{"movie": {...}, "people": [...]}``. ``"movie"`` always holds
        ``"movie_id"`` and, only when the matching section yields at least one
        person, ``"realisateurs"``, ``"scenaristes"`` and ``"acteurs"`` (lists of
        id strings). ``"people"`` is a list of ``{"id", "nom"}`` dicts, one per
        link found, in the order directors, screenwriters, actors.
    """
    url=f"https://www.allocine.fr/film/fichefilm-{movie_id}/casting/"
    data={"movie":{"movie_id": movie_id}, "people":[]}

    driver.get(url)
    time.sleep(0.3)

    title_link = {"class" : "meta-title-link"}

    # Directors: only the first matching section is read.
    # find_elements(By.CSS_SELECTOR, ...) replaces find_elements_by_class_name, removed in Selenium 4.3.
    sections = driver.find_elements(By.CSS_SELECTOR, ".section.casting-director")
    if sections :
        section = BeautifulSoup(sections[0].get_attribute("innerHTML"), 'html.parser')
        directors_id = _collect_people(section, title_link, data["people"])
        if directors_id :
            data["movie"]["realisateurs"]= directors_id

    # Screenwriters: look for the generic list section whose title is "scénaristes"
    for raw_section in driver.find_elements(By.CSS_SELECTOR, ".section.casting-list-gql") :
        section = BeautifulSoup(raw_section.get_attribute("innerHTML"),"html.parser")
        for titlebar in section.find_all("h2",{"class" :"titlebar-title titlebar-title-md" }) :
            if titlebar.text.lower() == "scénaristes" :
                scenarists_id = _collect_people(section, {"class": "xXx item link"}, data["people"])
                if scenarists_id :
                    data["movie"]["scenaristes"]=scenarists_id

    # Actors: only the first matching section is read
    sections = driver.find_elements(By.CSS_SELECTOR, ".section.casting-actor")
    if sections :
        section = BeautifulSoup(sections[0].get_attribute("innerHTML"), 'html.parser')
        actors_id = _collect_people(section, title_link, data["people"])
        if actors_id :
            data["movie"]["acteurs"]=actors_id

    return data

### Protos

In [75]:
# Manual check of the scraper on a single film, in a visible (non-headless) browser.
# The scraping itself is delegated to get_data_casting instead of duplicating its body,
# which also avoids rebinding the builtin `id` at notebook level.
movie_id=53750

driver = first_connexion()
try :
    data = get_data_casting(movie_id, driver)
finally :
    # Always shut the browser down, even when the scraping raises
    driver.quit()
data

{'movie': {'movie_id': 53750,
  'realisateurs': ['119261'],
  'scenaristes': ['40125', '30532'],
  'acteurs': ['97468',
   '536',
   '12563',
   '14454',
   '105159',
   '44953',
   '561',
   '6351']},
 'people': [{'id': '119261', 'nom': 'David Bowers'},
  {'id': '40125', 'nom': 'Timothy Harris'},
  {'id': '30532', 'nom': 'Osamu Tezuka'},
  {'id': '97468', 'nom': 'Freddie Highmore'},
  {'id': '536', 'nom': 'Nicolas Cage'},
  {'id': '12563', 'nom': 'Charlize Theron'},
  {'id': '14454', 'nom': 'Samuel L. Jackson'},
  {'id': '105159', 'nom': 'Kristen Bell'},
  {'id': '44953', 'nom': 'Eugene Levy'},
  {'id': '561', 'nom': 'Donald Sutherland'},
  {'id': '6351', 'nom': 'Bill Nighy'}]}

## Algorithme final 

19H15

In [9]:
%%time
nb_drivers = 6
nb_films_by_iter =100
nb_movies= len(df_ids)

ids_movies_by_drivers = init_movies_by_driver(nb_drivers, nb_movies)

nb_iterations =(nb_movies//(nb_drivers*nb_films_by_iter))+1

init_save_files(nb_drivers)

#Nombre de fois que les drivers vont se lancer et se fermer
for j in range(nb_iterations) : 
    print("Itération :",j)
    with futures.ThreadPoolExecutor() as executor: 
        future_results = [ executor.submit(workload_driver,{'id':i, 'index_start' :ids_movies_by_drivers[i]["start"]+nb_films_by_iter*j , 'index_end':ids_movies_by_drivers[i]["end"]})  for i in range(nb_drivers) ] 
        for future_result in future_results: 
            future_result = future_result.result() # can use `timeout` to wait max seconds for each thread               

#Concaténation et enregirstrement des données
df_movies=pd.DataFrame()
for i in range(nb_drivers): 
    df_movies=pd.concat([df_movies, pd.read_csv(data_path+f"allocine_casting/cast_movies_{i}.csv")])
df_movies.to_csv(data_path+"allocine_casting_movies.csv",index=False)

df_people=pd.DataFrame()
for i in range(nb_drivers): 
    df_people=pd.concat([df_people, pd.read_csv(data_path+f"allocine_casting/people_{i}.csv")])
df_people.to_csv(data_path+f"allocine_id_people.csv",index=False) 

Driver n°0 : 20 films
Driver n°2 : 20 films
Driver n°5 : 20 films
Driver n°4 : 20 films
Driver n°3 : 20 films
CPU times: total: 8min 48s
Wall time: 2h 3min 11s


# Lecture des données 

## df_movies

In [10]:
# Reload the consolidated casting table, then report its shape, missing values and distinct counts
df_movies = pd.read_csv(data_path+"allocine_casting_movies.csv")
print("Shape df_movies :", df_movies.shape)
for report in (check_nan, check_unique):
    print()
    report(df_movies)
df_movies.head()

Shape df_movies : (19982, 4)

Valeurs nan dans movie_id : 0
Valeurs nan dans realisateurs : 149
Valeurs nan dans scenaristes : 707
Valeurs nan dans acteurs : 150

Valeurs uniques dans movie_id : 19982
Valeurs uniques dans realisateurs : 8658
Valeurs uniques dans scenaristes : 16657
Valeurs uniques dans acteurs : 19815


Unnamed: 0,movie_id,realisateurs,scenaristes,acteurs
0,178014,['1066'],"['49984', '1066', '1066', '49985', '49984', '4...","['41339', '34515', '259', '6407', '12973', '22..."
1,281293,['142452'],['142452'],"['41623', '21353', '101150', '646346', '60312'..."
2,289305,['16478'],"['267165', '923099', '267165', '267165', '1647...","['18273', '194479', '465360', '895145', '84779..."
3,266320,['692670'],"['575327', '575327', '97569']","['502680', '844314', '794353', '982500', '7069..."
4,288544,['507431'],"['507431', '923535']","['415513', '67071', '17614', '167254', '136213..."


## df_people 

In [11]:
# Reload the consolidated people table, then report its shape, missing values and distinct counts
df_people = pd.read_csv(data_path+f"allocine_id_people.csv")
print("Shape df_people :", df_people.shape)
for report in (check_nan, check_unique):
    print()
    report(df_people)
df_people.head()

Shape df_people : (218909, 2)

Valeurs nan dans id : 0
Valeurs nan dans nom : 0

Valeurs uniques dans id : 78448
Valeurs uniques dans nom : 78426


Unnamed: 0,id,nom
0,1066,James Cameron
1,49984,Amanda Silver
2,1066,James Cameron
3,1066,James Cameron
4,49985,Rick Jaffa


# Nettoyage des données : 

In [18]:
# Drop rows that are exact duplicates over all columns (first occurrence kept), then re-check the table
df_movies = df_movies.drop_duplicates(subset=None, keep="first")
print(df_movies.shape)
check_unique(df_movies)

(19982, 4)
Valeurs uniques dans movie_id : 19982
Valeurs uniques dans realisateurs : 8658
Valeurs uniques dans scenaristes : 16657
Valeurs uniques dans acteurs : 19815


In [19]:
# Keep a single row (the first one seen) per person id, then re-check the table
df_people = df_people.drop_duplicates(subset=["id"], keep="first")
print(df_people.shape)
check_unique(df_people)

(78448, 2)
Valeurs uniques dans id : 78448
Valeurs uniques dans nom : 78425


# Sauvegarde des données 

In [24]:
# Overwrite both CSV files with the cleaned tables (people first, then movies)
for cleaned_table, csv_name in ((df_people, f"allocine_id_people.csv"), (df_movies, "allocine_casting_movies.csv")):
    cleaned_table.to_csv(data_path+csv_name, index=False)