In [1]:
import requests
from bs4 import BeautifulSoup
import re
from datetime import datetime, timedelta
import json
import pandas as pd
import numpy as np

In [2]:
class Boxers_info():
  '''
  Scrape boxer statistics from box.live and keep a CSV file of boxer records
  and a JSON log of the scraped data up to date.
  Receives:
    filename: str with the path of the CSV file holding the boxer records.
    json_file: str with the path of the json file to save scraped data.
  '''
  def __init__(self, filename, json_file):
    self.filename = filename
    self.json_file = json_file

  def getPage(self, url, timeout=30):
    '''
    Download a page and parse its HTML.
    Receives:
      url: str with the address to fetch.
      timeout: seconds to wait for the server before giving up.
    Returns:
      BeautifulSoup object for the page, or None when the request failed.
    '''
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36',
               'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    try:
      # The context manager closes the session (and its pooled connections)
      # even when the request raises.
      with requests.Session() as session:
        html = session.get(url, headers=headers, timeout=timeout).text
    except requests.exceptions.RequestException:
      return None
    return BeautifulSoup(html, 'html.parser')

  def info_box_page(self, boxer):
    '''
    Scrape the stats rows of a boxer's box.live page.
    Receives:
      boxer: str with the boxer name; spaces are turned into dashes for the url.
    Returns:
      dict with lower-cased titles as keys and lower-cased contents as values,
      plus a 'boxer' entry. Only the 'boxer' entry is present when the page
      could not be downloaded.
    '''
    boxer1 = boxer.replace(' ','-')
    raw = self.getPage(f'https://box.live/boxers/{boxer1}/')
    contenido = {}
    # getPage returns None on a failed request; treat that as "no stats found".
    if raw is not None:
      for i in raw.find_all('div', {'class':'d-flex flex-wrap w-100'}):
        title = i.find_all('div', {'class':re.compile('stats-row__title')})
        content = i.find_all('div', {'class':re.compile('stats-row__content')})
        for a,e in zip(title,content):
          contenido[a.get_text().strip()] = (e.get_text().strip())
    contenido['boxer'] = boxer
    contenido = {key.lower(): value.lower() for key, value in contenido.items()}
    return contenido

  def json_log(self, new_data):
    '''
    Append one record to the list stored in the json file, creating the file
    when it does not exist yet.
    Receives:
      new_data: json-serialisable object to append.
    '''
    try:
      with open(self.json_file, 'r') as json_f:
        data = json.load(json_f)
      data.append(new_data)
    except FileNotFoundError:
      data = [new_data]
    with open(self.json_file, 'w') as json_f:
      json.dump(data, json_f, indent=4)
    print(f'Json data appended to {self.json_file}\n')

  def update_data(self, data, temp, name, id_val=None):
    '''
    Scrape the info of a boxer, write it into the records and save both files.
    Receives:
      data: DataFrame with all the boxer records.
      temp: DataFrame slice with the existing row(s) of the boxer, or None
            when the boxer is not recorded yet.
      name: str. Name of the boxer.
      id_val: id of the boxer; a new row is only added when it is not None.
    Nothing is written when the scraped page lacks the expected fields.
    '''
    info = self.info_box_page(name)
    current_date = datetime.now()
    # Only the parsing of the scraped fields is guarded, so that errors raised
    # while writing the files are not mistaken for missing data.
    try:
      age = int(info['age'])
      height = info['height'].split('/')[-1].replace('cm', '').strip()
      reach = info['reach'].split('/')[-1].replace('cm', '').strip()
      stance = info['stance']
    except KeyError:
      print(f'Key Error in retrieval, maybe no info about {name}\n')
      return
    except ValueError:
      print(f'Value Error in retrieval, could not parse the age of {name}\n')
      return
    # Subtracting the age from the current year avoids the drift of a
    # 365-day "year", which ignores leap days and can land on the wrong year.
    birth = str(current_date.year - age)
    new_row = {'id': id_val, 'boxer': name, 'birth': birth, 'height': height, 'reach': reach, 'stance': stance,
               'box_live_data':1, 'extract_info_date':current_date}
    if temp is not None:
      # These columns receive strings / a datetime. Cast them to object first
      # instead of relying on silent dtype upcasting, which newer pandas
      # versions warn about or reject.
      for column in ('birth', 'height', 'reach', 'stance', 'extract_info_date'):
        if column in data.columns:
          data[column] = data[column].astype(object)
      data.loc[temp.index, 'birth'] = birth
      data.loc[temp.index, 'height'] = height
      data.loc[temp.index, 'reach'] = reach
      data.loc[temp.index, 'stance'] = stance
      data.loc[temp.index, 'box_live_data'] = 1
      data.loc[temp.index, 'extract_info_date'] = current_date
      print(f'Boxer already in records, completed info: {str(data.loc[temp.index, :].values)}\n')
    else:
      if id_val is not None:
        data = pd.concat([data, pd.DataFrame([new_row])], ignore_index=True)
        print(f'New row recorded successfully: {new_row}\n')
    data.to_csv(self.filename, index=False)
    info['boxer_id'] = str(id_val)
    self.json_log(info)

  def _next_id(self, data):
    '''
    Pick the id for a new row: the id of the last row plus one, falling back
    to the highest id plus one when that value is already taken (ids are not
    guaranteed to be in ascending order), and 0 for an empty table.
    '''
    if data.empty:
      return 0
    candidate = data['id'].iloc[-1] + 1
    if candidate in data['id'].values:
      candidate = data['id'].max() + 1
    return candidate

  def auto_record_data(self, name):
    '''
    Function to add or complete the row of a boxer with scraped info.
    A boxer whose 'stance' is already filled is left untouched.
    Receives:
        name: str. Correct name of the boxer.
    '''
    data = pd.read_csv(self.filename)
    if name in data['boxer'].values:
      temp = data[data['boxer'] == name]
      if not pd.isna(temp['stance'].values[0]):
        print(f'Boxer already in list: {temp["boxer"].values[0]}, id: {temp["id"].values[0]}\n')
      else:
        id_val = temp['id'].values[0]
        self.update_data(data, temp, name, id_val)
    else:
      id_val = self._next_id(data)
      self.update_data(data, None, name, id_val)

In [3]:
# Records live in the CSV at the absolute path; the JSON log path is relative,
# so it is resolved against the current working directory.
boxing = Boxers_info('/content/boxers_info.csv', 'scrap.json')

In [10]:
# Look up 'hol' in the CSV, scrape the boxer's page if the row is missing or
# has no stance yet, and save the result to the CSV and the JSON log.
boxing.auto_record_data('hol')

Boxer already in records, completed info: [[33 'hol' '1981' '173' 'southpaw (l)' '175' 1.0
  Timestamp('2023-11-06 01:36:45.922957')]]

Json data appended to scrap.json



In [11]:
# Reload the CSV from disk to inspect what was actually persisted.
pd.read_csv('/content/boxers_info.csv')

Unnamed: 0,id,boxer,birth,height,stance,reach,box_live_data,extract_info_date
0,0,canelo alvarez,1989,175,0.0,,,
1,1,gervonta davis,1994,168,1.0,,,
2,2,dimitry bivol,1990,183,0.0,,,
3,6484,moses itauma,2005,198,1.0,,,
4,33,hol,1981,173,southpaw (l),175.0,1.0,2023-11-06 01:36:45.922957


In [None]:
# Inspect the JSON log. The Boxers_info instance was created with the relative
# path 'scrap.json', so this absolute path points at the same file only when
# the working directory is /content.
pd.read_json('/content/scrap.json')

Unnamed: 0,wba super world light heavyweight champion,next scheduled defence,last defence,time inactive,won against,time held,total defences,activity average,mandatory challenger,last mandatory,...,debut,pro rds,ring,wbo,ibf,wbc,wba,boxer,boxer_id,wba world lightweight champion
0,tba,ramirez (44-0-0) ud 12 05-nov-2022,348 days - (11 months & 14 days),(promoted from secondary belt),1471 days - (4 years),5,every 294 days - (9 months & 21 days),tbc,none called yet,1471 days - (4 years),...,2014,172,1,-,-,-,-,dmitry bivol,3,
1,,garcia (16-0-0) tko 9 07-jan-2023,285 days - (9 months & 12 days),gamboa (30-2-0) tko 12 28-dec-2019,1391 days - (3 years & 9 months),4,every 348 days - (11 months & 14 days),tbc,none called yet,1391 days - (3 years & 9 months),...,2013,130,2,-,-,-,-,gervonta davis,1,tba
