# Importing libraries
***

In [1]:
import csv
import random
import time
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

***
# Crawling Ranking Page URLs from Tapology

Fetching the current rankings page from Tapology, parsing its HTML to find all hyperlinks, keeping only the links whose URL contains '-top-' and does not contain '?page=', and compiling the resulting URLs into a list called **'urls'**.
***

In [2]:
# Landing page that links to every "current top ..." ranking on Tapology.
base_url = 'https://www.tapology.com/rankings/groups/current'
urls = []

# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of silently yielding no links.
response = requests.get(base_url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

page_links = soup.select('a[href]')

for link in page_links:
    href = link['href']
    # Keep ranking pages only ('-top-' in the link) and skip paginated variants.
    if '-top-' not in href or '?page=' in href:
        continue
    # urljoin handles both site-relative and already-absolute hrefs, where
    # plain string concatenation would produce a malformed URL for the latter.
    final_url = urljoin(base_url, href)
    # Skip links that were already collected so each ranking page is listed once.
    if final_url not in urls:
        urls.append(final_url)

***
# Checking collected URLs

Making sure the **'urls'** list only contains necessary links.
***

In [3]:
# Bare expression: as the last statement of the cell, the list is rendered as the cell output.
urls

['https://www.tapology.com/rankings/current-top-ten-best-pound-for-pound-mma-and-ufc-fighters',
 'https://www.tapology.com/rankings/current-top-ten-heavyweight-mma-fighters-265-pounds',
 'https://www.tapology.com/rankings/current-top-ten-light-heavyweight-mma-fighters-205-pounds',
 'https://www.tapology.com/rankings/current-top-ten-middleweight-mma-fighters-185-pounds',
 'https://www.tapology.com/rankings/current-top-ten-welterweight-mma-fighters-170-pounds',
 'https://www.tapology.com/rankings/current-top-ten-lightweight-mma-fighters-155-pounds',
 'https://www.tapology.com/rankings/current-top-ten-featherweight-mma-fighters-145-pounds',
 'https://www.tapology.com/rankings/current-top-ten-bantamweight-mma-fighters-135-pounds',
 'https://www.tapology.com/rankings/35-top-flyweight-mma-fighters',
 'https://www.tapology.com/rankings/1261-top-women-bantamweight-fighters',
 'https://www.tapology.com/rankings/1262-top-women-flyweight-fighters',
 'https://www.tapology.com/rankings/1263-top-wom

***
# Adjusting Ranking Page URLs

The first link in the **'urls'** list is the cross-division pound-for-pound ranking rather than a weight-class ranking, so it does not meet the criteria and is removed.
***

In [4]:
# Remove the pound-for-pound ranking only while it is still at the front of the list.
# An unconditional `urls.pop(0)` drops a different ranking every time the cell is
# re-run and raises IndexError on an empty list; the guard makes the cell idempotent.
removed_url = urls.pop(0) if urls and 'pound-for-pound' in urls[0] else None
# Shown as the cell output: the removed URL, or nothing (None) when there was nothing to remove.
removed_url

'https://www.tapology.com/rankings/current-top-ten-best-pound-for-pound-mma-and-ufc-fighters'

***
# Collecting Top 10 Fighter Profile Links

Extracting profile URLs of the top 10 fighters from each ranking page in urls and compiling them into a list called **'fighter_page_list'**.
***

In [5]:
# Profile URLs of the ranked fighters, accumulated across all ranking pages in `urls`.
fighter_page_list = []

# Site root prepended to the relative profile links (loop-invariant, so defined once).
prefix = 'https://www.tapology.com'

for url in urls:
    # Timeout avoids an indefinite hang; raise_for_status() makes HTTP errors visible
    # instead of silently contributing zero fighters for this ranking.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    # Rank cells and name rows are collected separately and paired by position.
    rankings = soup.find_all(class_="rankingItemsItemRank")
    divs = soup.find_all('div', class_='rankingItemsItemRow name')

    fighter_count = 0
    for rank, div in zip(rankings, divs):
        # Stop as soon as ten profiles have been collected for this ranking.
        if fighter_count >= 10:
            break

        try:
            rank_number = int(rank.text.strip())
        except ValueError:
            # Rank cell does not hold a plain number; skip the row instead of crashing.
            continue

        if rank_number >= 11:
            continue

        # Guard each lookup: a row without the expected <h1>/<a> is skipped
        # rather than raising AttributeError on None.
        heading = div.find('h1', class_='name')
        a_tag = heading.find('a') if heading else None

        if a_tag and a_tag.get('href'):
            full_link = prefix + a_tag['href']
            fighter_page_list.append(full_link)
            fighter_count += 1

***
# Scraping fighter data 

Scraping detailed fighter information from their profile pages using Selenium and BeautifulSoup. Extracting attributes such as name, nickname, age, date of birth, nationality, record, earnings, height, reach, last weigh-in weight, and weight class, then compiling these details into a list called **'fighter_list'**.
***

In [None]:
def safe_get_text(tag, key):
    """Return the text of the ``<span>`` that follows a ``<strong>`` label.

    Args:
        tag: Object exposing BeautifulSoup's ``find`` API (a parsed page or sub-tree).
        key: Exact label text of the ``<strong>`` element, e.g. ``'Height:'``.

    Returns:
        The stripped text of the first ``<span>`` sibling after the matching
        ``<strong>``, or ``'N/A'`` when the label or the span is missing.
    """
    # `string=` is the current name of the deprecated `text=` keyword of find().
    label = tag.find('strong', string=key)
    if label is None:
        return 'N/A'

    # Look the sibling up once and reuse it for both the check and the result.
    value = label.find_next_sibling('span')
    if value is None:
        return 'N/A'
    return value.text.strip()

# One Safari session is opened and reused for every profile page.
driver = webdriver.Safari()

# Each entry is a one-element list holding a 'label: value, label: value, ...' string;
# the CSV export cell relies on exactly this format.
fighter_list = []

# try/finally guarantees the browser session is closed even if a page fails mid-run.
try:
    for fighter in fighter_page_list:
        driver.get(fighter)
        # Fixed 2 s pause before reading the DOM (presumably to let the page finish rendering).
        time.sleep(2)

        page_source = driver.page_source
        content = BeautifulSoup(page_source, 'html.parser')

        # The fighter's name is read from the second <h1> on the page. Fall back to
        # 'N/A' when it is absent instead of raising IndexError or silently reusing
        # the previous fighter's name.
        h1_tag = content.find_all('h1')
        name = h1_tag[1].text.strip() if len(h1_tag) > 1 else 'N/A'

        nickname = safe_get_text(content, 'Nickname:')
        record = safe_get_text(content, 'Pro MMA Record:')
        earnings = safe_get_text(content, 'Career Disclosed Earnings:')
        height = safe_get_text(content, 'Height:')
        reach = safe_get_text(content, '| Reach:')
        weight_class = safe_get_text(content, 'Weight Class:')
        dob = safe_get_text(content, '| Date of Birth:')
        nationality = safe_get_text(content, 'Born:')
        age = safe_get_text(content, 'Age:')
        last_weight = safe_get_text(content, '| Last Weigh-In:')

        block = f'name: {name}, nickname: {nickname}, age: {age}, born: {dob}, nationality: {nationality}, record: {record}, earnings_$: {earnings}, height: {height}, reach: {reach}, weight: {last_weight}, weight_class: {weight_class}'

        fighter_list.append([block])

        # Random 0-7 s pause between profile requests.
        time.sleep(random.randint(0, 7))
finally:
    driver.quit()

***
# Creating .csv file

Writing the collected fighter data from fighter_list into a CSV file named **'top_10_fighters.csv'** for further data analysis.
***

In [None]:
# Destination of the export. NOTE(review): absolute, machine-specific path kept
# unchanged from the original; adjust it when running elsewhere.
OUTPUT_CSV = '/Volumes/Dr Ive /Local Repo/WebScraWler/Data/top_10_fighters.csv'

# Column titles written as the CSV header row.
CSV_HEADER = ['name', 'nickname', 'age', 'born', 'nationality', 'record', 'earnings', 'height', 'reach', 'weight', 'weight_class']

# Labels, in order, used inside each 'label: value, label: value, ...' string
# stored in fighter_list by the scraping cell.
BLOCK_KEYS = ['name', 'nickname', 'age', 'born', 'nationality', 'record', 'earnings_$', 'height', 'reach', 'weight', 'weight_class']


def _split_fighter_block(block, keys=BLOCK_KEYS):
    """Split one 'label: value, label: value, ...' string into its values.

    The string is cut at the known ', <label>: ' markers (in order) rather than
    at every ', ', so values that themselves contain a comma stay in one field.

    Args:
        block: The formatted fighter string.
        keys: Labels expected in the string, in order of appearance.

    Returns:
        A list with one stripped value per label, in the order of ``keys``.

    Raises:
        ValueError: If an expected label is missing from ``block``.
    """
    first_marker = f'{keys[0]}: '
    if not block.startswith(first_marker):
        raise ValueError(f'fighter block does not start with {first_marker!r}: {block!r}')

    remainder = block[len(first_marker):]
    values = []
    for next_key in keys[1:]:
        # Everything before the next label belongs to the current field.
        value, separator, remainder = remainder.partition(f', {next_key}: ')
        if not separator:
            raise ValueError(f'label {next_key!r} not found in fighter block: {block!r}')
        values.append(value.strip())
    # Whatever follows the last label is the last field's value.
    values.append(remainder.strip())
    return values


# Parse everything before opening the file so a malformed entry cannot leave a
# truncated CSV behind.
rows = [_split_fighter_block(fighter[0]) for fighter in fighter_list]

# newline='' is required by the csv module; explicit UTF-8 keeps non-ASCII
# fighter names intact regardless of the platform's default encoding.
with open(OUTPUT_CSV, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(CSV_HEADER)
    writer.writerows(rows)