In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np
from urllib.parse import urljoin
import pandas as pd
from datetime import datetime

In [2]:
import champ_placement as chp

<a id ="top"></a>

### Important functions in the champ_placement class
 - [Year_Link_finder](#years)
 - [month_soup](#month)
 - [recent_champ](#recent_champ)
 - [find_classes](#find_classes)

---
<a id=years></a>

In [3]:
def year_link_finder(min_year=None, max_year=None):
    base_link = "https://www.agilityplaza.com/results/"
    
    if min_year ==None:
        min_year = 2004
    if max_year == None:
        max_year = int(datetime.now().date().strftime("%Y"))

    years = np.arange(min_year, max_year+1,1).astype(str)
    year_link = {}
    for year in years:
        year_link[year] = base_link + year

    return year_link

def current_year_link():
        current_year = str(datetime.now().date().strftime("%Y"))
        base_link = "https://www.agilityplaza.com/results/"
        year_link = base_link+current_year
        return year_link
        
print(year_link_finder(2024))
print(current_year_link())

{'2024': 'https://www.agilityplaza.com/results/2024'}
https://www.agilityplaza.com/results/2024


---
<a id="month"></a>

## Month_soup
this returns the html soup of the current year and the month specified by the `months_ago` argument.


[back to the top](#top)

In [4]:
champ_placement_instance = chp.champ_placement()

def month_soup(self, months_ago=0, return_month=False):
    """
    Extracts a portion of HTML soup corresponding to a specific month's data from agility plaza.

    Args:
        self: Instance of the class containing the method.
        months_ago (int, optional): An integer indicating how many months ago the function should extract data for. Default is 0, representing the current month.
        return_month (bool, optional): A boolean indicating whether to return the month name along with the soup. Default is False.

    Returns:
        list or tuple: If return_month is False, returns a list of HTML elements between the selected month's data. If return_month is True, returns a tuple containing a list of HTML elements and the name of the month.

    Description:
        This function sends an HTTP GET request to the URL specified by self.current_year_link, parses the response content using BeautifulSoup, and locates all <thead> elements within the HTML soup. It selects the <thead> element corresponding to the month specified by months_ago (default is 0 for the current month). It retrieves all HTML elements between the selected <thead> element and the next <thead> element. If return_month is set to True, the function extracts the month name from the selected <thead> element. Finally, it returns either the extracted HTML elements or both the HTML elements and the month name, depending on the value of return_month.
    """
    # Send an HTTP GET request to the URL
    url = self.current_year_link
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all <thead> elements
    theads = soup.find_all("thead")
    
    # Get the first <thead>
    first_thead = theads[months_ago]

    # Find the elements between the first and second <thead>
    elements_between = []
    current_element = first_thead.find_next_sibling()
    while current_element and current_element.name != 'thead':
        elements_between.append(current_element)
        current_element = current_element.find_next_sibling()

    if return_month:
        month = first_thead.text.strip().split(" ")[0]
        return elements_between, month
    else:
        return elements_between


print(month_soup(champ_placement_instance)[:5])


[<tr>
<th>Dates</th>
<th>Name</th>
</tr>, <tr class="clickable-row organization1" data-href="/competition/2128819400/results">
<td>Fri 26 - Sun 28</td>
<td>Lincoln Agility Enthusiasts</td>
</tr>, <tr class="clickable-row organization100" data-href="/competition/1107100960/results">
<td>Thu 25</td>
<td>Jump Start at Ogilvie Dogs</td>
</tr>, <tr class="clickable-row organization8" data-href="/competition/1149423108/results">
<td>Sun 21</td>
<td>Calmer Hoopers</td>
</tr>, <tr class="clickable-row organization100" data-href="/competition/1457124918/results">
<td>Sun 21</td>
<td>Newton Heath DTC</td>
</tr>]


<a id = "recent_champ"></a>

## recent_champ
finds the link to the show with the most recent championship in it.
There is also `champ_this_year` which should find all champs in the year but it doesn't work - this is mainly here for a later project that might need this feature


[back to the top](#top)

In [5]:
champ_placement_instance = chp.champ_placement()

def recent_champ(self, months_ago=0, print_statement=True):
    """
    Finds the most recent championship competition within a specified range of months.

    Args:
        self: Instance of the class containing the method.
        months_ago (int, optional): Number of months ago to start searching for championships. Default is 0, representing the current month.
        print_statement (bool, optional): Whether to print search progress and results. Default is True.

    Returns:
        tuple or None: A tuple containing the link and name of the most recent championship competition if found, or None if no championship is found within the specified range.

    Description:
        This function searches for the most recent championship competition within a specified range of months. It starts the search from the current month (or a specified number of months ago) and goes back up to 12 months. For each month, it retrieves the HTML soup corresponding to the competition data and checks if any competition contains the word "Championship" in its name. If a championship is found, it returns a tuple containing the link and name of the championship competition. If no championship is found within the specified range, it returns None. The function optionally prints search progress and results based on the value of the print_statement parameter.
    """
    max_months = 12  # Maximum number of months to go back
    for i in range(months_ago, max_months + 1):
        month_soup = self.month_soup(i)
        Name = None
        link = None

        for j in range(1, len(month_soup)):
            td_element = month_soup[j].find_all('td')[-1]

            if "Championship" in td_element.text:
                Name = td_element.text
                link =  self.base_link[:-9] + month_soup[j].get('data-href')
                if print_statement ==True:
                    print(f"Championship found in {td_element.text}, link {link}")
                    
                return link, Name  # Exit the function once Championship is found
                break

        if Name is None:
            if print_statement==True:
                print(f"No competition with 'Championship' in the name was found for {i} months ago. Trying next month.")
            
    if print_statement ==True:
        print("No competition with 'Championship' in the name was found in the last", max_months, "months.")
    return None

def champ_this_year(self, months_ago=0, print_statement=True):
    '''this almost works but there is an index error in the month_soup with the indexing from max_months. Need to fix month_soup funciton to use this.'''
    max_months = 12  # Maximum number of months to go back
    championships = []  # List to store found championships

    for i in range(months_ago, max_months + 1):
        month_soup = self.month_soup(i)

        for j in range(1, len(month_soup)):
            td_element = month_soup[j].find_all('td')[-1]

            if "Championship" in td_element.text:
                Name = td_element.text
                link =  self.base_link[:-9] + month_soup[j].get('data-href')
                championships.append((Name, link))  # Append championship name and link to the list
                
                if print_statement:
                    print(f"Championship found in {Name}, link {link}")

    if not championships:
        print("No competition with 'Championship' in the name was found in the last", max_months, "months.")
        
    return championships

# recent_champ(champ_placement_instance)


recent_champ(champ_placement_instance, months_ago = 2, print_statement = False)

('https://www.agilityplaza.com/competition/1258377046/results',
 'Open Junior Agility Championships')

---
<a id = "find_classes"></a>

## find_classes
This function finds the link to all the championship classes in the competition with a 

[back to the top](#top)

In [6]:
champ_placement_instance = chp.champ_placement()

import pandas as pd

def find_classes(self, months_ago=0, print_statement=False):
    """
    Extracts information about championship classes from the agility plaza website and organizes it into a pandas DataFrame.

    Args:
        self: Instance of the class containing the method.
        months_ago (int, optional): An integer indicating how many months ago the function should search for championship classes. Default is 0, representing the current month.
        print_statement (bool, optional): A boolean indicating whether to print statements during the execution of the function. Default is False.

    Returns:
        pandas.DataFrame: A DataFrame containing information about championship classes, including the class name, link, and height.

    Description:
        This function first retrieves the link and name of the most recent championship competition by calling the 'recent_champ' method. It then sends an HTTP GET request to the retrieved link, parses the response content using BeautifulSoup, and locates all 'card-block' elements within the HTML soup. It iterates through each 'card-block' element to find the championship classes. For each class found, it extracts the class name and link and appends them to a list. After gathering all class information, it creates a pandas DataFrame with columns for the class name, link, and height. The height is derived from the second word in the class name. The function sets the DataFrame index to be composed of the first two words extracted from the class name. Finally, it returns the DataFrame containing the championship class information.
    """
    show_link, show_name = self.recent_champ(months_ago, print_statement)
    
    response = requests.get(show_link)
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Finding the day and classes in that day
    div_elements = soup.find_all("div", class_="card-block")
    
    class_data = []
    for day_div in div_elements:
        for a in day_div.find_all('a'):
            if "Championship Jumping" in a.text:
                name = a.text
                link = "agilityplaza.com" + a.get('href')
                class_data.append((name, link))
            elif "Championship Agility" in a.text:
                name = a.text
                link = "agilityplaza.com" + a.get('href')
                class_data.append((name, link))
    
    # Create a pandas DataFrame
    df = pd.DataFrame(class_data, columns=['Class Name', 'Link'])
    df['class number'] = df['Class Name'].apply(lambda x: ' '.join(x.split()[:1]))
    df.set_index('class number', inplace=True)
    df['Height'] = df['Class Name'].apply(lambda x: x.split()[1] if len(x.split()) >= 2 else None)
    return df

array = find_classes(champ_placement_instance, months_ago = 2, print_statement = False)
array

Unnamed: 0_level_0,Class Name,Link,Height
class number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


### This only works for shows with championship in the name. Lots of shows don't have this -> need to sort this out to get the appropriate show

KCI, Derbyshire, Lune Valley, Thames, ..... do not have this

This is going to be solved using the KC page https://www.thekennelclub.org.uk/events-and-activities/agility/already-competing-in-agility/qualifying-shows-for-the-kennel-club-events/ to find the shows. This will not work with the data collection database idea

In [124]:
heights = ['Lge', 'Int', 'Med', 'Sml']
champ_show_link = "https://www.thekennelclub.org.uk/events-and-activities/agility/already-competing-in-agility/qualifying-shows-for-the-kennel-club-events/"

response = requests.get(champ_show_link)
champ_soup = BeautifulSoup(response.content, 'html.parser')

# print(type(champ_show_link))

soup = champ_soup.find_all("details", class_ = "a-details")

for event in soup:
    summaries = event.find_all("summary")
    for summary in summaries:
        if "Championship" in summary.get_text():
            height  = summary.get_text().split(' ')[-1]
            print(summary)
print(champ_show_link)


                
                

<summary class="a-details__summary">Championship Classes – small</summary>
<summary class="a-details__summary">Championship Classes – medium</summary>
<summary class="a-details__summary">Championship Classes – intermediate</summary>
<summary class="a-details__summary">Championship Classes – large</summary>
https://www.thekennelclub.org.uk/events-and-activities/agility/already-competing-in-agility/qualifying-shows-for-the-kennel-club-events/


In [133]:
# Initialize dictionaries to store data
data = {'small': [], 'medium': [], 'intermediate': [], 'large': []}

# Iterate through events
for event in soup:
    summaries = event.find_all("summary")
    for summary in summaries:
        # Find the table after the summary
        table = summary.find_next("table")
        if "Championship" in summary.get_text():
#             print(summary.get_text())

            # Find the table after the summary
            table = summary.find_next("table")
            
            #Find the height
            height  = summary.get_text().split(' ')[-1]
            if table:
                # Extract and process table content
                for row in table.find_all("tr"):
                    cells = row.find_all("td")
                    if cells:
                        show_name = [cell.get_text(strip=True) for cell in cells][0]
                        # Extract date
                        date = [cell.get_text(strip=True) for cell in cells][1]
                        # Append show name and date to respective height category
                        data[height.lower()].append((show_name, date))


                        
# Combine data for all heights
combined_data = []

# Create set of all show names
all_show_names = set()
for height_shows in data.values():
    for show in height_shows:
        all_show_names.add(show[0])

# Iterate through all show names and check if each height is present
for show_name in all_show_names:
    show_info = {'Show Name': show_name}
    for height, height_shows in data.items():
        height_present = any(show[0] == show_name for show in height_shows)
        show_info[height.capitalize()] = height_present
    combined_data.append(show_info)

# Create combined dataframe
combined_df = pd.DataFrame(combined_data)

# Add date column to combined dataframe
for height, height_shows in data.items():
    for show in height_shows:
        show_name = show[0]
        date = show[1]
        combined_df.loc[combined_df['Show Name'] == show_name, 'Date'] = date


# Extracting date and comments
dates = []
comments = []
for item in combined_df['Date']:
    date_parts = item.split('(')
    date = date_parts[0].strip()
    comment = date_parts[1].strip(')') if len(date_parts) > 1 else ''
    dates.append(date)
    comments.append(comment)

# Creating a DataFrame
# data = {'Date': dates, 'Comments': comments}
combined_df['Date'] = dates
combined_df['Comments'] = comments
# Convert 'Date' column to datetime
combined_df['Date'] = pd.to_datetime(combined_df['Date'], errors='coerce')

# Sort the DataFrame by 'Date' column


remove_words = ['DTC', 'Dog', 'Training', 'Society', 'and', '&', 'Club', 'in', 'In']

def clean_title(title):
    words = title.split()
    cleaned_words = [word for word in words if word not in remove_words]
    return ' '.join(cleaned_words)

# Apply the cleaning function to the 'Show Name' column
combined_df['Show Name'] = combined_df['Show Name'].apply(clean_title)

# Group by both 'Show Name' and 'Date' and aggregate with 'any' to combine rows
combined_df = combined_df.groupby(['Show Name', 'Date']).any().reset_index()

combined_df = combined_df.sort_values(by='Date')

combined_df.to_csv('Champ shows.csv')

In [134]:
combined_df

Unnamed: 0,Show Name,Date,Small,Medium,Intermediate,Large,Comments
2,Derbyshire Agility,2024-01-27,False,False,True,True,True
28,Wyre (Lancs) Agility,2024-03-17,True,True,True,True,False
27,Wye Valley,2024-04-06,True,True,False,False,True
17,Scunthorpe Obedience Agility,2024-04-13,False,False,True,True,False
15,Scottish Border Collie,2024-04-14,True,True,True,True,True
26,Woodside,2024-05-04,True,True,True,True,False
23,Vyne,2024-05-05,True,True,True,True,False
16,Scottish Kennel,2024-05-19,True,True,True,True,False
13,Nottingham Agility,2024-05-26,True,True,True,True,False
8,Hinckley,2024-06-01,True,True,True,True,False


#### The table of championship shows are stored above
to find if on plaza involuves going to agility net, finding the show and then seeing who the processer is.


In [52]:
show_names = np.array(combined_df['Show Name'])

In [132]:
remove_words = ['DTC', 'Dog', 'Training', 'Society', 'and', '&', 'Club'].capiltalize

AttributeError: 'list' object has no attribute 'capiltalize'