In [1]:
import requests
from bs4 import BeautifulSoup as soup

import sys
sys.path.insert(0, '../Resources')
import MyFunctionsV4 as mfs


import os
import time
import datetime
import pandas as pd
from bs4 import BeautifulSoup as soup

# Debugging
from IPython.display import display, clear_output

In [2]:
# Link to existing data
club_csv_link = '../CollectedData/club_list_22484.csv'
match_list_csv_link = '../CollectedData/match_list_35249.csv'
player_performance_link = '../CollectedData/final_player_perform_304169.csv'

In [3]:
# Import existing club list. 
existing_club_data = pd.read_csv(club_csv_link)

# Convert active to 'No' This will be changed back to 'yes' if the club is found again during the scrape. 
existing_club_data.active = 'No'

# Sets club_id to int from float. 
existing_club_data['club_id'] = existing_club_data['club_id'].astype('int')


In [4]:
existing_club_data.head(1)

Unnamed: 0,pull_time,club_name,active,console,club_id,division,ten_match_record,total_games,wins,losses,...,best_season_div,best_season_points,region,stadium,total_rank_points,league_points,cup_points,div_1_titles,other_div_titles,total_trophies
0,2023-04-18 03:43:09,Bang Average FC,No,xboxone,1940614,1.0,WWWWWWWWWW,635.0,495.0,75.0,...,1.0,25.0,British Isles,Estadio Santiago Bernabéu,19045,19045,0,43.0,8.0,51.0


In [5]:
# This interates through the existing list of clubs. Returns three list containing:
# Their Matches
# Their individual player perforamnces per match. 

# Debug
start_time = time.time()
counter = 0
fail_counter = 0
length = len(existing_club_data)

matches = []
player_performances = []

# Extract the make data, player performance data, and opponent data for all clubs in DB. 
for index, row in existing_club_data.iterrows():
    
    console = row['console']
    club_id = row['club_id']
    club_name = row['club_name']
    
    match_data, individual_player_perform_data = mfs.last_five_match_scrape(console, club_id, club_name)
    matches.extend(match_data)
    player_performances.extend(individual_player_perform_data)
    
    # Debug 
    clear_output(wait=True)
    counter += 1
    print(f"{counter}/{length}")
    if len(match_data) == 0:
        fail_counter += 1
    print(f"{fail_counter}/{length} have failed.")
    
# Debug
end_time = time.time()
print("Time taken:", end_time - start_time, "seconds")

22484/22484
113/22484 have failed.
Time taken: 33297.77237987518 seconds


In [6]:
#Generates and stores current time for later use. 
current_time = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [7]:
# Converts the matches list into a DF. 
# will run the 'Try' so long as the code is provided an original matches csv. 
# If no csv is given (such as on the first run) it will default to the except. 
try:
    existing_matches = pd.read_csv(match_list_csv_link)
    
    matches_df = pd.DataFrame(matches)
    matches_df.insert(0, 'pull_time', current_time)

    matches_df = matches_df.append(existing_matches)
    final_matches_df = matches_df.drop_duplicates(subset=['game_id'])

except:
    matches_df = pd.DataFrame(matches)
    matches_df.insert(0, 'pull_time', current_time)
    final_matches_df = matches_df.drop_duplicates(subset=['game_id'])

In [8]:
# This generates the list of clubs to be scraped in the next step. 
# By gathering a list of all clubs in the matches DF, we ensure the code scrapes all clubs. 

# Generate DFs containing the console, and club_id for every club in the matches DF. 
h_clubs = final_matches_df[['console', 'h_club_id']]
opp_clubs = final_matches_df[['console', 'opp_club_id']]

# Renames columns to prep for joining the two dfs.
h_clubs = h_clubs.rename(columns={"h_club_id":"club_id"})
opp_clubs = opp_clubs.rename(columns={"opp_club_id":"club_id"})

# Joins the dfs, removes duplicates. 
clubs_df = h_clubs.append(opp_clubs)
clubs_to_scrape_df = clubs_df.drop_duplicates()

In [9]:
#  vvv Debug vvv
start_time = time.time()
length  = len(clubs_to_scrape_df)
counter = 0
deleted_counter = 0
# ^^^ Debug ^^^


scraped_club_list = []

for index, row in clubs_to_scrape_df.iterrows():
    console = row['console']
    club_id = row['club_id']
    club_data = mfs.club_scrape(console, club_id)
    if club_data['division'] > 0:
        scraped_club_list.append(club_data)
    if club_data['division'] == 0:
        #  vvv Debug vvv
        deleted_counter += 1

#  vvv Debug vvv
    clear_output(wait=True)
    counter += 1
    print(f"{counter}/{length}.  {deleted_counter} clubs have been deleted")
        
#  vvv Debug vvv
end_time = time.time()
print("Time taken:", end_time - start_time, "seconds")

47650/47650.  4398 clubs have been deleted
Time taken: 31110.020516872406 seconds


In [10]:
# Converts scraped club list to DF. 
scraped_club_df = pd.DataFrame(scraped_club_list)
# # Adds time column. 
scraped_club_df.insert(0, 'pull_time', current_time)

# Updates existing rows in the club_df. 
scraped_club_df.set_index(['club_id'], inplace = True)
existing_club_data.set_index(['club_id'], inplace = True)
existing_club_data.update(scraped_club_df)
final_club_df = pd.concat([existing_club_data, scraped_club_df[~scraped_club_df.index.isin(existing_club_data.index)]])
final_club_df.reset_index(inplace = True)
final_club_df = final_club_df[['pull_time', 'club_name', 'active', 'console', 'club_id', 'division',
       'ten_match_record', 'total_games', 'wins', 'losses', 'draws',
       'goals_scored', 'goals_per_match', 'goals_conceded',
       'goals_conceded_per_match', 'goal_difference', 'promotions', 'holds',
       'relegations', 'best_season_div', 'best_season_points', 'region',
       'stadium', 'total_rank_points', 'league_points', 'cup_points',
       'div_1_titles', 'other_div_titles', 'total_trophies']]

scraped_club_df = pd.DataFrame(scraped_club_list)
final_club_df.update(scraped_club_df)

In [11]:
print(len(final_club_df))
print(final_club_df.club_id.nunique())

43256
43252


In [12]:
# This try/except exist to allow the code to run without input player performance data.
try:
    existing_player_performance = pd.read_csv(player_performance_link)

    player_performances_df = pd.DataFrame(player_performances)
    player_performances_df.insert(0, 'pull_time', current_time)

    player_performances_df = player_performances_df.append(existing_player_performance)
    final_player_perform_df = player_performances_df.drop_duplicates()

except:
    player_performances_df = pd.DataFrame(player_performances)
    player_performances_df.insert(0, 'pull_time', current_time)
    final_player_perform_df = player_performances_df.drop_duplicates()

In [13]:
# Saves the final club list. 
final_club_df.drop(columns=['pull_time'], inplace = True)
final_club_df.drop_duplicates(inplace=True)
final_club_df.insert(0, 'pull_time', current_time)
final_club_df.to_csv(f'../CollectedData/club_list_{len(final_club_df)}.csv', index = False)

# Saves the final match list. 
final_matches_df.drop(columns=['pull_time'], inplace = True)
final_matches_df.drop_duplicates(inplace=True)
final_matches_df.insert(0, 'pull_time', current_time)
final_matches_df.to_csv(f'../CollectedData/match_list_{len(final_matches_df)}.csv', index = False)

# Saves the final player performance list. 
final_player_perform_df.drop(columns=['pull_time'], inplace = True)
final_player_perform_df.drop_duplicates(inplace=True)
final_player_perform_df.insert(0, 'pull_time', current_time)
final_player_perform_df.to_csv(f'../CollectedData/final_player_perform_{len(final_player_perform_df)}.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)
