In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from urllib import request
from urllib.request import urlopen
from string import ascii_lowercase as alc
from datetime import datetime

In [101]:
# Do not truncate long cell values (e.g. image URLs) when DataFrames are displayed.
pd.set_option('display.max_colwidth', None)

In [2]:
# Events listing page that gets scraped; also the base against which
# relative links found on the site are resolved with urljoin.
base_url = 'https://www.ufc.com/events'

In [3]:
# Fetch and parse the events listing. The context manager closes the HTTP
# response as soon as BeautifulSoup has consumed it.
with urlopen(base_url) as html:
    soup = BeautifulSoup(html, 'lxml')

# Accumulators filled by the event loop further down.
event_titles = []   # one [id, title, date, red image URL, blue image URL] row per event
event_images = []   # currently unused
event_texts = []    # one comma-joined string per fight
ID = -1             # id of the event most recently processed (-1 = none yet)

In [4]:
# Every element carrying the 'l-listing__item' class is processed as one event card.
events = soup.find_all(class_='l-listing__item')

In [5]:
def _corner_image_urls(soup, corner_class, label):
    """Collect absolute fighter image URLs for one corner of a parsed event page.

    Args:
        soup: Parsed event page.
        corner_class: CSS class of the corner containers to scan.
        label: Prefix printed in front of every URL that is found.

    Returns:
        list[str]: One URL per corner container that holds a matching ``<img>``
        with a ``src`` attribute, in document order.
    """
    image_urls = []
    for row in soup.find_all(class_=corner_class):
        # Prefer the full upper-body shot, fall back to the teaser image.
        fighter_img = (
            row.find('img', class_='image-style-event-fight-card-upper-body-of-standing-athlete')
            or row.find('img', class_='image-style-teaser')
        )
        src = fighter_img.get('src') if fighter_img else None
        if not src:
            continue
        # urljoin keeps absolute URLs (http or https) untouched and resolves
        # site-relative paths against the UFC host.
        image_url = urljoin('https://www.ufc.com', src)
        print(label, image_url)
        image_urls.append(image_url)
    return image_urls


def get_fighters_images(fighter_link):
    """Download an event page and return the fighter image URLs per corner.

    Args:
        fighter_link: Absolute URL of the event page.

    Returns:
        tuple[list[str], list[str]]: ``(red_urls, blue_urls)``. Corners without
        a usable image are skipped, so a list can be shorter than the number
        of fights on the page.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(fighter_link) as response:
        soup = BeautifulSoup(response, 'lxml')

    event_fighters_red = _corner_image_urls(soup, 'c-listing-fight__corner--red', "Red_Image: ")
    event_fighters_blue = _corner_image_urls(soup, 'c-listing-fight__corner--blue', "Blue_Image: ")
    return event_fighters_red, event_fighters_blue


In [6]:
def event_category(event):
    """Name the card section that encloses the given element.

    The element's ancestors are probed, in order, for the main card, the
    prelims and the early prelims containers.

    Args:
        event: Element exposing ``find_parent(class_=...)``.

    Returns:
        'main_card', 'prelims' or 'early_prelims' for the first container
        found, or None when the element sits inside none of them.
    """
    section_labels = (
        ("main-card", 'main_card'),
        ('fight-card-prelims', 'prelims'),
        ('fight-card-prelims-early', 'early_prelims'),
    )
    for css_class, label in section_labels:
        if event.find_parent(class_=css_class):
            return label
    return None

In [7]:
def get_Nickname_text(fighter_link):
    """Download a fighter profile page and return the fighter's nickname.

    Args:
        fighter_link: Absolute URL of the fighter's profile page.

    Returns:
        str: Text of the 'hero-profile__nickname' element with surrounding
        whitespace stripped, or "" when the page has no such element.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(fighter_link) as response:
        soup = BeautifulSoup(response, 'lxml')

    nickname_tag = soup.find(class_='hero-profile__nickname')
    return nickname_tag.get_text(strip=True) if nickname_tag else ""


In [8]:
def _parse_corner_names(soup, corner):
    """Extract names and nicknames for one corner ('red' or 'blue') of an event page.

    Exactly one entry is appended to every returned list for each corner row,
    so the lists stay index-aligned even when a row lacks a link or a name.

    Args:
        soup: Parsed event page.
        corner: Corner colour used in the CSS class suffix, 'red' or 'blue'.

    Returns:
        tuple[list[str], list[str], list]: ``(names, nicknames, name_tags)``
        where ``name_tags`` holds the element each name was read from (the
        row itself when no name element exists).
    """
    names, nicknames, name_tags = [], [], []
    row_class = f'c-listing-fight__corner-name c-listing-fight__corner-name--{corner}'

    for row in soup.find_all(class_=row_class):
        anchor = row.find('a')
        href = anchor.get('href') if anchor else None
        # Resolve the profile link against the site before fetching it, so a
        # relative href works as well as an absolute one.
        nicknames.append(get_Nickname_text(urljoin(base_url, href)) if href else "")

        given_tag = row.find(class_='c-listing-fight__corner-given-name')
        family_tag = row.find(class_='c-listing-fight__corner-family-name')

        if given_tag:
            given_name = given_tag.get_text(strip=True)
            if family_tag:
                name = f"{given_name} {family_tag.get_text(strip=True)}"
            else:
                name = given_name
            name_tag = given_tag
        elif anchor:
            # No split given/family markup: the link text is the full name.
            name = anchor.get_text(strip=True)
            name_tag = anchor
        else:
            print(f"Given name or family name not found for a {corner} corner fighter")
            name = ""
            name_tag = row

        names.append(name)
        name_tags.append(name_tag)

    return names, nicknames, name_tags


def get_fighter_texts(fighter_link):
    """Download an event page and extract the textual data of every fight.

    Each fighter's profile page is fetched as well (through
    ``get_Nickname_text``) to look up the nickname.

    Args:
        fighter_link: Absolute URL of the event page.

    Returns:
        tuple of six lists, in this order: red fighter names, red nicknames,
        blue fighter names, blue nicknames, weight classes, card sections.
        Missing values are stored as "" (card section: None) rather than
        skipped, so entries at the same index belong to the same fight.
    """
    # Close the HTTP response as soon as the page has been parsed.
    with urlopen(fighter_link) as response:
        soup = BeautifulSoup(response, 'lxml')

    event_fighters_red, event_red_nick, _ = _parse_corner_names(soup, 'red')
    event_fighters_blue, event_blue_nick, blue_name_tags = _parse_corner_names(soup, 'blue')

    # The card section (main card / prelims / early prelims) is derived from
    # the ancestors of the blue corner's name element.
    event_card = [event_category(tag) for tag in blue_name_tags]

    # Weight class of every fight.
    event_weightclass = []
    for row in soup.find_all(class_='c-listing-fight__class c-listing-fight__class--desktop'):
        text_tag = row.find(class_='c-listing-fight__class-text')
        weightclass = text_tag.get_text(strip=True) if text_tag else ""
        if not weightclass:
            print("Weight class not found for the fight")
        event_weightclass.append(weightclass)

    return (
        event_fighters_red,
        event_red_nick,
        event_fighters_blue,
        event_blue_nick,
        event_weightclass,
        event_card,
    )


In [9]:
def _corner_headshot_url(event, corner_class):
    """Return the absolute headshot URL for one corner of an event card.

    Args:
        event: Parsed listing element of a single event.
        corner_class: Full class string of the corner's image container.

    Returns:
        str: The image URL, resolved against ``base_url`` when it is not
        already an http(s) URL, or "" when the card has no such image.
    """
    corner = event.find(class_=corner_class)
    img = corner.find('img', class_='image-style-event-results-athlete-headshot') if corner else None
    src = img.get('src') if img else None
    if not src:
        return ""
    if not re.match(r'^https?://', src):
        src = urljoin(base_url, src)
        print(src)
    return src


RED_CORNER_CLASS = 'field field--name-red-corner field--type-entity-reference field--label-hidden field__item'
BLUE_CORNER_CLASS = 'field field--name-blue-corner field--type-entity-reference field--label-hidden field__item'

# Start from a clean slate so that re-running this cell does not append
# every event and fight a second time.
event_titles.clear()
event_texts.clear()
ID = -1

for event in events:
    headline = event.find(class_='c-card-event--result__headline')
    if headline is None:
        # Without a headline there is neither a title nor an event link.
        continue
    title = headline.get_text(strip=True)

    date_tag = event.find(class_='c-card-event--result__date tz-change-data')
    date = date_tag.get_text(strip=True) if date_tag else ""

    red_urls = _corner_headshot_url(event, RED_CORNER_CLASS)
    blue_urls = _corner_headshot_url(event, BLUE_CORNER_CLASS)

    # Use one id for both tables so fights always point at the event row
    # they were scraped from.
    ID = len(event_titles)
    event_titles.append([ID, title, date, red_urls, blue_urls])

    fighter_name_cell = event.find('h3', class_='c-card-event--result__headline')
    link_tag = fighter_name_cell.find('a') if fighter_name_cell else None
    fighter_link = link_tag.get('href') if link_tag else None
    if not fighter_link:
        continue

    full_fighter_link = urljoin(base_url, fighter_link)

    fighters_red, fighters_blue = get_fighters_images(full_fighter_link)
    (fighter_texts_red, red_fighter_nickname, fighter_texts_blue,
     blue_fighter_nickname, fighter_weight, fighter_card) = get_fighter_texts(full_fighter_link)

    # Field order of the comma-joined row that is split into columns later.
    fight_columns = (
        fighter_card, fighter_weight, fighters_red, fighter_texts_red,
        red_fighter_nickname, fighter_texts_blue, fighters_blue, blue_fighter_nickname,
    )
    if len({len(column) for column in fight_columns}) > 1:
        print(f"Warning: uneven fight data for '{title}': {[len(column) for column in fight_columns]}")

    # zip stops at the shortest list, so a short list can no longer raise
    # IndexError halfway through an event.
    for card, weight, red_image, red_name, red_nick, blue_name, blue_image, blue_nick in zip(*fight_columns):
        matchup = f"{ID}, {card}, {weight} , {red_image}, {red_name}, {red_nick}, {blue_name}, {blue_image}, {blue_nick}"
        event_texts.append(matchup)
        

Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-06/RIBAS_AMANDA_L_06-24.png?itok=lL75F3nj
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/WILLIAMS_KARL_L_05-13.png?itok=R52nPebO
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/SHAHBAZYAN_EDMEN_L_05-20.png?itok=n9KI7LUc
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-11/TALBOTT_PAYTON_L_11-18.png?itok=HT9daebw
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-08/QUARANTILLO_BILLY_L_08-05.png?itok=xyC9v73c
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-06/PADILLA_FERNANDO_L_04-29.png?itok=eCm0SzfE
Red_Image:  https://dmxg5wxfqgb4u.cloudfront.net/styles/even

In [10]:
# Display every comma-joined fight string collected by the event loop.
list(event_texts)

['0, main_card, Women\'s Flyweight Bout , https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-06/RIBAS_AMANDA_L_06-24.png?itok=lL75F3nj, Amanda Ribas, , Rose Namajunas, https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-08/NAMAJUNAS_ROSE_R_09-02.png?itok=gHxmp9rq, "Thug"',
 '0, main_card, Heavyweight Bout , https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/WILLIAMS_KARL_L_05-13.png?itok=R52nPebO, Karl Williams, , Justin Tafa, https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2024-02/TAFA_JUSTIN_R_02-17.png?itok=dR_gEVUc, "Bad Man"',
 '0, main_card, Middleweight Bout , https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/SHAHBAZYAN_EDMEN_L_05-20.png?itok=n9KI7LUc, Edmen Shahbazyan, "The Golden Boy", AJ Dobson, https://dmxg5wxfqgb4u.cloudfront.net

In [11]:
# Display the collected event rows: [id, title, date, red image URL, blue image URL].
list(event_titles)

[[0,
  'Ribas vs Namajunas',
  'Sat, Mar 23 / 10:00 PM EDT / Main Card',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2023-11/RIBAS_AMANDA_11-18.png?itok=s3YuLD3a',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2023-08/NAMAJUNAS_ROSE_09-02.png?itok=NLQIZews'],
 [1,
  'Blanchfield vs Fiorot',
  'Sat, Mar 30 / 10:00 PM EDT / Main Card',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2023-08/BLANCHFIELD_ERIN.png?itok=LdvmnU-H',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2023-08/FIOROT_MANON_09-02.png?itok=6YyG08va'],
 [2,
  'Allen vs Curtis 2',
  'Sat, Apr 6 / 6:00 PM EDT / Main Card',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2023-11/ALLEN_BRENDAN_11-18.png?itok=OK1-SN9O',
  'https://dmxg5wxfqgb4u.cloudfront.net/styles/event_results_athlete_headshot/s3/2024-01/CURTIS_CHRIS_01-20.png?itok=Sc9azhtU'],
 [3,
  'P

In [12]:
# One row per scraped event, built from the five-element lists in event_titles.
df = pd.DataFrame(event_titles)

In [13]:
# Column names follow the element order of each event_titles row.
df.columns = ["ID", "Event_Name", "Event_Date", "Red_Fighter_images", "Blue_Fighter_images"]

In [14]:
# Preview the first events.
df.head()

Unnamed: 0,ID,Event_Name,Event_Date,Red_Fighter_images,Blue_Fighter_images
0,0,Ribas vs Namajunas,"Sat, Mar 23 / 10:00 PM EDT / Main Card",https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...,https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...
1,1,Blanchfield vs Fiorot,"Sat, Mar 30 / 10:00 PM EDT / Main Card",https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...,https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...
2,2,Allen vs Curtis 2,"Sat, Apr 6 / 6:00 PM EDT / Main Card",https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...,https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...
3,3,Pereira vs Hill,"Sat, Apr 13 / 10:00 PM EDT / Main Card",https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...,https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...
4,4,Nicolau vs Kape 2,"Sat, Apr 27 / 7:00 PM EDT / Main Card",https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...,https://dmxg5wxfqgb4u.cloudfront.net/styles/ev...


In [115]:
# Break every fight string back into its fields. The strings are joined with
# ", " (and " , " after the weight class), so each piece is stripped to avoid
# leading/trailing spaces in the column values. An empty field therefore
# becomes "" instead of " ".
split_texts = [[field.strip() for field in text.split(",")] for text in event_texts]

In [116]:
# One row per fight, one positional column per split field.
df1 = pd.DataFrame(split_texts)

In [117]:
# Add a running fight number as the second column. It is taken from the frame's
# row index, so it counts across all events rather than restarting per event.
df1.insert(loc=1, column='Fight_Num', value=df1.index)

In [118]:
# Names follow the field order of the fight strings built in the event loop,
# with Fight_Num inserted at position 1.
df1.columns =  ["ID", "Fight_Num", "Event_Card", "Event_Weight", "Red_Event_fighter_image", "Red_Fighter_Name", "Red_Fighter_Nickname" ,"Blue_Fighter_Name", "Blue_Event_fighter_image", "Blue_Fighter_Nickname" ]

In [119]:
# Preview the first fights.
df1.head()

Unnamed: 0,ID,Fight_Num,Event_Card,Event_Weight,Red_Event_fighter_image,Red_Fighter_Name,Red_Fighter_Nickname,Blue_Fighter_Name,Blue_Event_fighter_image,Blue_Fighter_Nickname
0,0,0,main_card,Women's Flyweight Bout,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-06/RIBAS_AMANDA_L_06-24.png?itok=lL75F3nj,Amanda Ribas,,Rose Namajunas,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-08/NAMAJUNAS_ROSE_R_09-02.png?itok=gHxmp9rq,"""Thug"""
1,0,1,main_card,Heavyweight Bout,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/WILLIAMS_KARL_L_05-13.png?itok=R52nPebO,Karl Williams,,Justin Tafa,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2024-02/TAFA_JUSTIN_R_02-17.png?itok=dR_gEVUc,"""Bad Man"""
2,0,2,main_card,Middleweight Bout,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-05/SHAHBAZYAN_EDMEN_L_05-20.png?itok=n9KI7LUc,Edmen Shahbazyan,"""The Golden Boy""",AJ Dobson,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2022-05/30f94739-5850-462f-86ee-83b26074f12e%252FDOBSON_AJ_R_02-12.png?itok=913F-9s-,
3,0,3,main_card,Bantamweight Bout,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-11/TALBOTT_PAYTON_L_11-18.png?itok=HT9daebw,Payton Talbott,,Cameron Saaiman,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-03/SAAIMAN_CAMERON_R_03-05.png?itok=aWwGKqK3,"""MSP"""
4,0,4,main_card,Featherweight Bout,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2023-08/QUARANTILLO_BILLY_L_08-05.png?itok=xyC9v73c,Billy Quarantillo,,Youssef Zalal,https://dmxg5wxfqgb4u.cloudfront.net/styles/event_fight_card_upper_body_of_standing_athlete/s3/2022-08/ZALAL_YOUSSEF_R_08-13.png?itok=JSKZ1Vgk,"""The Moroccan Devil"""


In [120]:
# Corner-coloured silhouettes that replace the site's generic placeholders.
red_change = {'Male': 'https://www.ufc.com/themes/custom/ufc/assets/img/SHADOW_Fighter_fullLength_RED.png',
              'Female': 'https://www.ufc.com/themes/custom/ufc/assets/img/womens-silhouette-RED-corner.png'}

blue_change = {'Male': 'https://www.ufc.com/themes/custom/ufc/assets/img/SHADOW_Fighter_fullLength_BLUE.png',
               'Female': 'https://www.ufc.com/themes/custom/ufc/assets/img/womens-silhouette-BLUE-corner.png'}

columns_to_search = ['Red_Event_fighter_image', 'Blue_Event_fighter_image']

# Generic placeholder URLs to search for
target_Fem = 'https://www.ufc.com/themes/custom/ufc/assets/img/silhouette-headshot-female.png'
target_Rmal = 'https://www.ufc.com/themes/custom/ufc/assets/img/standing-stance-right-silhouette.png'
target_Bmal = 'https://www.ufc.com/themes/custom/ufc/assets/img/standing-stance-left-silhouette.png'

# regex=True gives substring matching, so a cell is still replaced when it
# carries surrounding whitespace. The URLs are escaped so that characters
# such as '.' are matched literally instead of acting as regex wildcards.
df1 = df1.replace({
    'Red_Event_fighter_image': {
        re.escape(target_Fem): red_change['Female'],
        re.escape(target_Rmal): red_change['Male'],
    },
    'Blue_Event_fighter_image': {
        re.escape(target_Fem): blue_change['Female'],
        re.escape(target_Bmal): blue_change['Male'],
    },
}, regex=True)

In [123]:
# Persist both tables as CSV files in the working directory, without the row index.
df.to_csv('UFC_Events_dates.csv', index=False)
df1.to_csv('UFC_Event_Fights.csv', index=False)

In [124]:
# Load the two CSV files written above back into new frames.
# NOTE(review): the round trip lets read_csv re-infer column types — presumably intentional; confirm.
df2 = pd.read_csv('UFC_Events_dates.csv')
df3 = pd.read_csv('UFC_Event_Fights.csv')

In [125]:
# Serialise each frame as a JSON array with one object per row.
json_Events_Dates_data = df2.to_json(orient='records')
json_Events_Fights_data = df3.to_json(orient='records')

In [126]:
# Wrap each JSON payload in an `export const` statement and write it to a
# .js file in the working directory.
with open('Event_Dates_data.js', 'w') as js_file:
    js_file.write(f'export const Events_dates_data = {json_Events_Dates_data};')

with open('Event_Fights_data.js', 'w') as js_file:
    js_file.write(f'export const Events_Fights_data = {json_Events_Fights_data};')