#### Importing Libraries

In [9]:
import json
import random
import time
import warnings
from time import sleep

import selenium
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
warnings.filterwarnings('ignore')

#### Load the NBA Players' Names Data

In [10]:
# Read the list of player names from the JSON file.
# The encoding is passed explicitly: JSON text is UTF-8 by specification, whereas
# open() otherwise falls back to the platform's locale encoding, which can garble
# names that contain accented characters.
with open('NBA Players.json','r',encoding='utf-8') as file:
    data=json.load(file)
players=data['NBA Player']
# We check that the players list does not contain duplicate values:
# dict.fromkeys keeps only the first occurrence of each name (in order), so the
# comparison below is True only when the list contains unique values
players==list(dict.fromkeys(players))

True

#### Initialize an Edge Session

In [11]:
# Location of the Microsoft Edge WebDriver executable; adjust it for your machine
EDGE_DRIVER_PATH='C:\\Users\\TAHA\\Webdriver_taha\\msedgedriver.exe'
# Start page that is loaded before the first search is performed
WIKIPEDIA_HOME_URL='https://fr.wikipedia.org/wiki/Wikip%C3%A9dia:Accueil_principal'

# Selenium 4 deprecated passing `executable_path` directly to the driver constructor
# (later releases removed it); the driver location is supplied through a Service object instead
driver=webdriver.Edge(service=Service(executable_path=EDGE_DRIVER_PATH))
# Random pause before loading the start page
time.sleep(random.randint(3,6))
driver.get(WIKIPEDIA_HOME_URL)

#### Scraping Functions

In [12]:
# This function opens the search box, types the player's name
# and then clicks the search form's button to load the player's Wikipedia page
def search_player(driver,player):
    """Run a search for `player` through the search form of the page loaded in `driver`.

    Parameters
    ----------
    driver : Selenium WebDriver showing a page that exposes the elements looked up
        below (`vector-header-end`, `p-search`, `vector-typeahead-search-container`,
        `cdx-text-input__input`, `searchform`).
    player : str
        Text typed into the search input.

    Returns
    -------
    None. The function only drives the browser; any exception raised by a failed
    element lookup propagates to the caller.

    A one-second pause separates the steps so the page can react to each action.
    """
    time.sleep(1)
    # Click the first link inside the header's search block
    driver.find_element(By.CLASS_NAME,'vector-header-end').find_element(By.ID,'p-search').find_element(By.TAG_NAME,'a').click()
    time.sleep(1)
    search_bar=driver.find_element(By.CLASS_NAME,'vector-typeahead-search-container').find_element(By.CLASS_NAME,'cdx-text-input__input')
    time.sleep(1)
    search_bar.send_keys(player)
    time.sleep(1)
    # Submit the query by clicking the first button of the search form
    driver.find_element(By.ID,'searchform').find_element(By.TAG_NAME,'button').click()
    time.sleep(1)

In [13]:
# This function checks whether a table corresponds
# to a player's statistics table or not.
# Typically, the headers of a table that contains performance data start with this
# pattern: Saison, Équipe, MJ (games played). We use this pattern to identify those tables.
def table_check(elem):
    """Return True when the first three `th` cells of `elem` read Saison, Équipe, MJ.

    Parameters
    ----------
    elem : element exposing `find_elements(by, value)`; the returned cells must
        expose a `.text` attribute.

    Returns
    -------
    bool
        True when the first three header texts (spaces removed) are exactly
        'Saison', 'Équipe' and 'MJ'; False when there are fewer than three
        headers, when they differ, or when reading a header text fails.
    """
    expected_headers=['Saison','Équipe','MJ']
    headers=elem.find_elements(By.TAG_NAME,'th')
    try:
        first_headers=[header.text.replace(' ','') for header in headers[:3]]
    except Exception:
        # Reading a header failed: treat the table as "not a statistics table".
        # `Exception` (rather than a bare except) lets KeyboardInterrupt and
        # SystemExit through, so the scraping run can still be interrupted.
        return False
    return first_headers==expected_headers

In [14]:
# This function retrieves performance data for each player and creates a list of dictionaries as output
def retrieve_data_from_table(performance_tables):
    """Extract the career row of every table as a ``{header: value}`` dictionary.

    Parameters
    ----------
    performance_tables : iterable of table elements exposing `find_element` /
        `find_elements`, as selected by `table_check`.

    Returns
    -------
    list of dict
        One entry per input table, in order. A table whose data could not be
        read contributes an empty dictionary, so positions stay aligned with
        the input.

    Prints a status line for every table.
    """
    player_performance_data=[]
    for table in performance_tables:
        try:
            header_cells=table.find_elements(By.TAG_NAME,'th')
            # We rule out the first two headers (Saison and Équipe) as they do not correspond to any data
            headers=[cell.text.replace(' ','') for cell in header_cells][2:]
            rows=table.find_element(By.TAG_NAME,'tbody').find_elements(By.TAG_NAME,'tr')
            # Some notable players have an All-Star Game row right below the career row,
            # while others have the career row as the last row. We only want the career row,
            # so when the last row is the All-Star Game one we take the row above it.
            last_row_label=rows[-1].find_element(By.TAG_NAME,'td').text
            # All whitespace is removed before comparing, hence the space-free literal
            if ''.join(last_row_label.split())=='All-StarGame':
                career_row=rows[-2]
            else:
                career_row=rows[-1]
            # Skip the first cell, which holds the row label ("Carrière")
            data_cells=career_row.find_elements(By.TAG_NAME,'td')[1:]
            # Keep only the cells that contain no letters
            values=[cell.text.replace(' ','') for cell in data_cells if not any(char.isalpha() for char in cell.text)]
            # Indexing (rather than zip) keeps a row with too few values a failure
            career_stats={headers[i]:values[i] for i in range(len(headers))}
        except Exception:
            print('Error while retrieving data')
            # A fresh empty dictionary: never re-append the data of a previous table
            player_performance_data.append({})
        else:
            player_performance_data.append(career_stats)
            print('Data successfully retrieved')
    return player_performance_data

#### Data Scraping

In [15]:
def _load_statistics_page_tables(driver,query):
    """Search for `query` and return the `table` elements of the page that loads.

    Raises whatever Selenium raises when a lookup fails; in particular the final
    lookup of the `Statistiques` id fails when the page has no statistics
    section, which means that the right page was not loaded.
    """
    search_player(driver,query)
    # We locate the tables in the body of the Wikipedia page
    container=driver.find_element(By.CLASS_NAME,'mw-page-container').find_element(By.CLASS_NAME,'mw-content-container').find_element(By.CLASS_NAME,'mw-body')
    tables=container.find_elements(By.TAG_NAME,'table')
    driver.find_element(By.ID,'Statistiques')
    return tables


# Initialize the data dictionary to empty
player_data_dict={}
# Run the scraping algorithm through the entire list of players
for i,player in enumerate(players,start=1):
    print('Player number ',i,': ',player)
    try:
        # Start every player from an empty state so that a failed search can never
        # reuse the tables found for a previous player
        tables=[]
        try:
            tables=_load_statistics_page_tables(driver,player)
        except Exception:
            # The plain name did not lead to a page with a statistics section:
            # we add (Basketball) after the player's name in the search form
            try:
                tables=_load_statistics_page_tables(driver,player+' (Basketball)')
            except Exception:
                print('Error during the search phase')
        time.sleep(random.randint(2,4))

        performance_tables=[]
        try:
            performance_tables=[table for table in tables if table_check(table)]
        except Exception:
            print('Error while locating the performance tables')
        # Retrieve the performance data from the wikipedia page
        player_data_dict[player]=retrieve_data_from_table(performance_tables)
        print('##########################################')
        time.sleep(random.randint(3,5))
    except Exception:
        # `Exception` rather than a bare except: Ctrl-C (KeyboardInterrupt) must stop
        # the run instead of being recorded as a failed player
        print('Error while inserting data for player: ',player)
        print('##########################################')
        player_data_dict[player]=[]
        time.sleep(random.randint(3,5))

Player number  1 :  Precious Achiuwa
Data successfully retrieved
Data successfully retrieved
Data successfully retrieved
##########################################
Player number  2 :  Steven Adams
Data successfully retrieved
Data successfully retrieved
Data successfully retrieved
##########################################
Player number  3 :  Bam Adebayo
Data successfully retrieved
Data successfully retrieved
Data successfully retrieved
##########################################
Player number  4 :  Ochai Agbaji
Data successfully retrieved
Data successfully retrieved
Data successfully retrieved
##########################################
Player number  5 :  James Akinjo
Error during the search phase
##########################################
Player number  6 :  Santi Aldama
Data successfully retrieved
Data successfully retrieved
##########################################
Player number  7 :  Trey Alexander
Error during the search phase
##########################################
Player numb

In [16]:
# Persist the scraped performances: the dictionary is serialized to a JSON string
# and written to the output file
file_name="NBA Players' Performances.json"
with open(file_name,mode='w') as output_file:
    output_file.write(json.dumps(player_data_dict))