In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from IPython.display import display
from datetime import datetime, timedelta
from apiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
import pickle
import os

# OAuth scope requested for the Google Calendar API.
scopes = ['https://www.googleapis.com/auth/calendar']

# English month name -> month number (1-12), derived from the calendar order of the names.
month_name = {
    label: number
    for number, label in enumerate(
        (
            'January', 'February', 'March', 'April',
            'May', 'June', 'July', 'August',
            'September', 'October', 'November', 'December',
        ),
        start=1,
    )
}

# Time zone name attached to every calendar event built by this notebook.
timezone = 'Asia/Kolkata'

In [2]:
def even_struct(summary, desc, start_time, timezone):
    """Build the request body for a new calendar event.

    Parameters
    ----------
    summary : str
        Event title.
    desc : str
        Event description.
    start_time : str
        Start of the event formatted as ``YYYY-MM-DDTHH:MM:SS``.
    timezone : str
        Time zone name stored on both the start and the end of the event.

    Returns
    -------
    dict
        Body with ``summary``, ``description``, ``start``, ``end`` (three
        hours after the start) and a single 30-minute popup reminder.

    Raises
    ------
    ValueError
        If ``start_time`` does not match the expected format.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%S"
    begins = datetime.strptime(start_time, stamp_format)
    finishes = begins + timedelta(hours=3)

    def _moment(when):
        # start and end share the same shape: formatted timestamp plus zone name
        return {'dateTime': when.strftime(stamp_format), 'timeZone': timezone}

    popup_reminder = {'method': 'popup', 'minutes': 30}

    return {
        'summary': summary,
        'description': desc,
        'start': _moment(begins),
        'end': _moment(finishes),
        'reminders': {'useDefault': False, 'overrides': [popup_reminder]},
    }

In [3]:
def update_event(result, summary):
    """Find the position of the first listed event with the given summary.

    Parameters
    ----------
    result : dict
        Response-like mapping whose ``'items'`` entry is a list of event dicts.
    summary : str
        Summary (title) to look for.

    Returns
    -------
    int or None
        Index into ``result['items']`` of the first event whose ``'summary'``
        equals ``summary``, or ``None`` when no event matches.
    """
    for position, item in enumerate(result['items']):
        # Items without a 'summary' key are skipped instead of raising KeyError.
        if 'summary' in item and item['summary'] == summary:
            return position
    return None

In [4]:
def up_event_struct(start_time, tz=None):
    """Build the patch body that moves an existing event to a new start time.

    Parameters
    ----------
    start_time : str
        New start formatted as ``YYYY-MM-DDTHH:MM:SS``.
    tz : str, optional
        Time zone name to store on the start and end. When omitted, the
        notebook-level ``timezone`` value is used, as before.

    Returns
    -------
    dict
        Body containing only ``start`` and ``end``; the end is three hours
        after the start.

    Raises
    ------
    ValueError
        If ``start_time`` does not match the expected format.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%S"
    # Only fall back to the module-level setting when the caller gave no zone.
    zone = timezone if tz is None else tz

    begins = datetime.strptime(start_time, stamp_format)
    finishes = begins + timedelta(hours=3)

    event = {
        'start': {'dateTime': begins.strftime(stamp_format), 'timeZone': zone},
        'end': {'dateTime': finishes.strftime(stamp_format), 'timeZone': zone},
    }

    return event

In [5]:
# Reuse the OAuth credentials cached by an earlier run; otherwise run the
# consent flow once and cache the result in token.pkl.
# NOTE: pickle.load can execute arbitrary code from the file it reads -- only
# keep a token.pkl that this notebook wrote itself.
if os.path.exists('token.pkl'):
    with open('token.pkl', 'rb') as f:
        credentials = pickle.load(f)
else:
    flow = InstalledAppFlow.from_client_secrets_file('client_secret.json', scopes=scopes)
    # NOTE(review): run_console uses the copy/paste (out-of-band) consent flow;
    # newer google-auth-oauthlib releases may no longer provide it -- confirm
    # against the installed version (run_local_server is the usual replacement).
    credentials = flow.run_console()
    # Context manager so the token file is flushed and closed right away.
    with open('token.pkl', 'wb') as f:
        pickle.dump(credentials, f)

service = build('calendar', 'v3', credentials=credentials)

# The first calendar in the account's calendar list is the one scrape_write
# reads events from and writes events to.
result = service.calendarList().list().execute()
calendar_id = result['items'][0]['id']

In [6]:
def team_names():
    """Read the followed team names from ``footy_teams.txt``.

    Only the first line of the file is read. It is split on commas and each
    piece is stripped of surrounding whitespace; empty pieces (such as the one
    produced by a trailing comma) are discarded, so the line may or may not end
    with a comma.

    Returns
    -------
    list of str
        Team names in file order.
    """
    with open('footy_teams.txt', 'r') as ofile:
        ## reading the content from the file
        first_line = ofile.readline()

    ## splitting every team content into the list format
    stripped = (piece.strip() for piece in first_line.split(','))
    # Filter empties rather than dropping the last element blindly, which lost
    # the final team whenever the line did not end with a comma.
    return [name for name in stripped if name]

In [7]:
def comp_names():
    """Read the competition entries from ``footy_comps.txt``.

    Only the first line of the file is read. It is split on ``;`` and every
    piece is stripped of surrounding whitespace. Nothing is filtered out, so a
    trailing ``;`` yields a final empty string in the result.

    Returns
    -------
    list of str
        The stripped pieces in file order.
    """
    with open('footy_comps.txt', 'r') as ofile:
        first_line = ofile.readline()

    return list(map(str.strip, first_line.split(';')))

In [8]:
class Google:
    """Minimal scraper for the result links of a Google web search."""

    @classmethod
    def search(cls, search):
        """Return the outbound result URLs Google lists for ``search``.

        Parameters
        ----------
        search : str
            Free-text query.

        Returns
        -------
        list of str
            One entry per anchor whose ``href`` has the ``/url?q=<http...>``
            form, with the ``/url?q=`` prefix removed. Entries containing
            ``webcache`` are dropped. Tracking parameters after ``&`` are left
            in place for the caller to strip.
        """
        # Let requests build the query string so spaces, '&', '#', ... in the
        # search text are escaped instead of corrupting the URL.
        page = requests.get("http://www.google.de/search", params={"q": search})
        # Explicit parser: avoids bs4's "no parser specified" warning and keeps
        # parsing independent of which optional parsers are installed.
        soup = BeautifulSoup(page.content, 'html.parser')
        # Raw strings: '\?' is not a valid escape in a normal string literal.
        links = soup.find_all("a", href=re.compile(r"(?<=/url\?q=)(htt.*://.*)"))
        urls = [re.split(r":(?=http)", link["href"].replace("/url?q=", ""))[0] for link in links]
        return [url for url in urls if 'webcache' not in url]

In [9]:
def scrape_write(team_content, month_name, timezone):
    """Scrape each team's fixtures and mirror them into the Google Calendar.

    For every team the fixtures page is located through ``Google.search`` and
    the ``fixres__body`` block of that page is parsed for dates, competition
    names, kick-off times and team names. Each fixture is then compared with
    the calendar events whose summary ends in ``(Football)``:

    * no event with the same summary and description -> a new event is inserted;
    * a matching event with a different start time -> that event is patched;
    * a matching event with the same start time -> nothing is done.

    Uses the notebook-level ``service`` and ``calendar_id`` objects and the
    helpers ``even_struct`` / ``up_event_struct``. Progress is printed.

    Parameters
    ----------
    team_content : list of str
        Team names to process.
    month_name : dict
        Maps month names as they appear in the page's date headers to month
        numbers.
    timezone : str
        Time zone name used when listing events and when creating new ones.

    Returns
    -------
    None
    """
    for team in team_content:
        print(f'For Team {team}')

        search_term = f'{team} sky sports fixtures'

        print('\nGetting The Link of the website...\n')

        ## accessing the link of the website
        search_hits = Google.search(search_term)
        if not search_hits:
            # Nothing to scrape; move on instead of failing the whole run.
            print(f'No search result found for {team}, skipping')
            print()
            continue
        website_link = search_hits[0].split('&')[0]

        print(f'\nScrapping data for {team} from the website...\n')
        print(website_link)

        ## scraping the content from the website
        scrape_data = requests.get(website_link)
        soup = BeautifulSoup(scrape_data.text, 'html.parser')

        ## finding the div tag which contains all fixture's information
        results = soup.find('div', attrs={'class': 'fixres__body'})
        if results is None:
            print(f'No fixture list found on {website_link}, skipping')
            print()
            continue

        ## scraping fixture's date, competition name, team names and timing of the fixtures
        years = results.find_all('h3')
        fix_date = results.find_all('h4')
        comp_name = results.find_all('h5')
        teams = results.find_all('span', attrs={'class': 'swap-text__target'})
        timings = results.find_all('span', attrs={'class': 'matches__date'})

        ## month name -> year, taken from the first two words of each h3 header
        year_dict = {}
        for year_header in years:
            header_words = year_header.text.split(' ')
            year_dict[header_words[0]] = header_words[1]

        ## (day digits, month number, year) for every h4 date header
        fixture_dates = []
        for date_header in fix_date:
            words = date_header.text.split(' ')

            # second word carries the day with a suffix; keep the leading digits
            day_digits = ''
            for char in words[1]:
                if not char.isdigit():
                    break
                day_digits += char

            fixture_dates.append((day_digits, month_name[words[2]], year_dict[words[2]]))

        ## competition names and kick-off times as plain text
        comp_text = [comp.text for comp in comp_name]
        match_time = [stamp.text.strip() for stamp in timings]

        final_time = []
        for (day_digits, month, year), clock in zip(fixture_dates, match_time):
            clock_parts = [int(part) for part in clock.split(':')]
            kickoff = datetime(int(year), int(month), int(day_digits), clock_parts[0], clock_parts[1], 0)
            # NOTE(review): fixed 4-hour shift -- presumably converts the site's
            # times to the calendar's zone; confirm the intended offset.
            final_time.append(kickoff + timedelta(hours=4))

        ## team spans alternate home, away once the blank placeholders are removed
        team_labels = [span.text for span in teams if span.text != '\n\n\n\n']
        home_team = team_labels[0::2]
        away_team = team_labels[1::2]

        print('\nScrapped Successfully')

        ## football events already in the calendar, read page by page
        existing_events = []
        page_token = None
        while True:
            result = service.events().list(
                calendarId=calendar_id,
                timeZone=timezone,
                maxResults=2500,
                pageToken=page_token,
            ).execute()

            for item in result.get('items', []):
                title = item.get('summary', '')
                if title.split(' ')[-1] == '(Football)':
                    existing_events.append({
                        # keep the id so the patch hits exactly the matched event
                        'id': item['id'],
                        'summary': title,
                        # description / dateTime may be absent on an event
                        'description': item.get('description', ''),
                        'start': item.get('start', {}).get('dateTime', ''),
                    })

            page_token = result.get('nextPageToken')
            if not page_token:
                break

        # zip stops at the shortest list, so a fixture is only written when its
        # time, competition and both team names were all scraped.
        for kickoff, desc, home, away in zip(final_time, comp_text, home_team, away_team):
            start_time = kickoff.strftime("%Y-%m-%dT%H:%M:%S")
            summary = f'{home} vs {away} (Football)'

            matched = None
            for known in existing_events:
                if summary == known['summary'] and desc == known['description']:
                    matched = known
                    break

            if matched is None:
                print(f'Adding Event: {summary}')
                event = even_struct(summary, desc, start_time, timezone)
                service.events().insert(calendarId=calendar_id, body=event).execute()
            # first 19 characters are 'YYYY-MM-DDTHH:MM:SS' whatever offset suffix follows
            elif start_time != matched['start'][:19]:
                print(f'Updating Event: {summary}')
                event = up_event_struct(start_time)
                service.events().patch(calendarId=calendar_id, eventId=matched['id'], body=event).execute()
                print('Event Updated Successfully')
                print()
        print()

In [76]:
# up_teams: comma-separated team names; up_value: switch that restricts the run to up_teams.
up_teams = 'Man City, Juventus'
up_value = False

if up_value == True and len(up_teams) > 0:
    # Restricted run: keep only the requested names that are also listed in the teams file.
    orig_team = team_names()
    requested = [name.strip() for name in up_teams.split(',')]
    team_content = [name for name in requested if name in orig_team]
else:
    # Full run over every team in the teams file.
    team_content = team_names()

scrape_write(team_content, month_name, timezone)

In [169]:
def scrape_write_comp(comp_name, team_name, month_name, timezone, count_c=0):
    """Scrape each competition's fixtures and display those of the followed teams.

    For every competition the fixtures page is located through
    ``Google.search`` and the ``fixres__body`` block of that page is parsed.
    A date header can be followed by several fixtures, so each header is
    repeated once per ``fixres__item`` found before the next header. Fixtures
    whose home or away name occurs in the competition's ``team_name`` entry are
    shown as a DataFrame with ``Date/Time``, ``Home_Team`` and ``Away_Team``
    columns. Nothing is written to the calendar here.

    Parameters
    ----------
    comp_name : list of str
        Competition names to process.
    team_name : list of str
        Followed-teams text for each competition; the entry used for the
        competition at position ``k`` is ``team_name[count_c + k]``.
    month_name : dict
        Maps month names as they appear in the page's date headers to month
        numbers.
    timezone : str
        Accepted for signature parity with ``scrape_write``; not used.
    count_c : int, optional
        Offset into ``team_name`` for the first competition (default 0).

    Returns
    -------
    None
    """
    for offset, competition in enumerate(comp_name):
        temp_team = team_name[count_c + offset]

        print(f'For Competition {competition}')

        search_term = f'{competition} sky sports fixtures'

        print('\nGetting The Link of the website...\n')

        ## accessing the link of the website
        search_hits = Google.search(search_term)
        if not search_hits:
            # Nothing to scrape; move on instead of failing the whole run.
            print(f'No search result found for {competition}, skipping')
            continue
        website_link = search_hits[0].split('&')[0]

        print(f'\nScrapping data for {competition} from the website...\n')
        print(website_link)

        ## scraping the content from the website
        scrape_data = requests.get(website_link)
        soup = BeautifulSoup(scrape_data.text, 'html.parser')

        ## finding the div tag which contains all fixture's information
        results = soup.find('div', attrs={'class': 'fixres__body'})
        if results is None:
            print(f'No fixture list found on {website_link}, skipping')
            continue

        ## scraping fixture's date, team names and timing of the fixtures
        years = results.find_all('h3')
        fix_date = results.find_all('h4')
        teams = results.find_all('span', attrs={'class': 'swap-text__target'})
        timings = results.find_all('span', attrs={'class': 'matches__date'})

        ## repeat every date header once per fixture listed under it
        results_html = str(results)
        real_dates = []

        for position, header in enumerate(fix_date):
            start = results_html.find(str(header))
            if position + 1 < len(fix_date):
                end = results_html.find(str(fix_date[position + 1]))
                section = results_html[start:end]
            else:
                # last header: its fixtures run to the end of the block
                section = results_html[start:]
            real_dates.extend([header] * section.count('<div class="fixres__item">'))

        ## month name -> year, taken from the first two words of each h3 header
        year_dict = {}
        for year_header in years:
            header_words = year_header.text.split(' ')
            year_dict[header_words[0]] = header_words[1]

        ## (day digits, month number, year) for every fixture
        fixture_dates = []
        for date_header in real_dates:
            words = date_header.text.split(' ')

            # second word carries the day with a suffix; keep the leading digits
            day_digits = ''
            for char in words[1]:
                if not char.isdigit():
                    break
                day_digits += char

            fixture_dates.append((day_digits, month_name[words[2]], year_dict[words[2]]))

        ## kick-off times as plain text
        match_time = [stamp.text.strip() for stamp in timings]

        kickoffs = []
        for (day_digits, month, year), clock in zip(fixture_dates, match_time):
            clock_parts = [int(part) for part in clock.split(':')]
            kickoff = datetime(int(year), int(month), int(day_digits), clock_parts[0], clock_parts[1], 0)
            # NOTE(review): fixed 4-hour shift -- presumably converts the site's
            # times to the calendar's zone; confirm the intended offset.
            kickoffs.append(kickoff + timedelta(hours=4))

        ## team spans alternate home, away once the blank placeholders are removed
        team_labels = [span.text for span in teams if span.text != '\n\n\n\n']
        home_team = team_labels[0::2]
        away_team = team_labels[1::2]

        final_home = []
        final_away = []
        final_time = []

        for home, away, kickoff in zip(home_team, away_team, kickoffs):
            # NOTE(review): temp_team is a single string, so this is a substring
            # test against the whole followed-teams text -- confirm that is intended.
            if home in temp_team or away in temp_team:
                final_home.append(home)
                final_away.append(away)
                final_time.append(kickoff)

        final_record = {
            'Date/Time': final_time,
            'Home_Team': final_home,
            'Away_Team': final_away,
        }

        print('\nScrapped Successfully')
        display(pd.DataFrame(final_record))

In [170]:
# up_comp: competitions to refresh; up_value: switch for the (not yet written) refresh path.
up_comp = ''
up_value = False

if not (up_value == True and len(up_comp) > 0):
    comp_content = comp_names()
    # The last entry is dropped before use; each remaining entry is split at ':'
    # into the competition name (first part) and its followed-teams text (second part).
    comp_entries = comp_content[:-1]
    comp_name = [entry.split(':')[0] for entry in comp_entries]
    team_name = [entry.split(':')[1].strip() for entry in comp_entries]

    scrape_write_comp(comp_name, team_name, month_name, timezone)
else:
    ## update code
    pass

For Competition UCL

Getting The Link of the website...


Scrapping data for UCL from the website...

https://www.skysports.com/champions-league-fixtures

Scrapped Successfully


Unnamed: 0,Date/Time,Home_Team,Away_Team
0,2020-03-12,Liverpool,Atletico Madrid
1,2020-03-18,Manchester City,Real Madrid
2,2020-03-19,Barcelona,Napoli


For Competition Premier League

Getting The Link of the website...


Scrapping data for Premier League from the website...

https://www.skysports.com/premier-league-fixtures

Scrapped Successfully


Unnamed: 0,Date/Time,Home_Team,Away_Team
0,2020-03-11 23:30:00,Manchester City,Arsenal
1,2020-03-14 19:00:00,Brighton and Hove Albion,Arsenal
2,2020-03-17 00:00:00,Everton,Liverpool
3,2020-03-21 21:30:00,Liverpool,Crystal Palace
4,2020-04-04 19:00:00,Arsenal,Norwich City
5,2020-04-05 20:30:00,Manchester City,Liverpool
6,2020-04-12 20:30:00,Liverpool,Aston Villa
7,2020-04-14 00:00:00,Wolverhampton Wanderers,Arsenal
8,2020-04-18 21:30:00,Arsenal,Leicester City
9,2020-04-21 00:00:00,Brighton and Hove Albion,Liverpool
