**Module that imports match data for the Spanish La Liga (1st and 2nd divisions).<br>Source: https://www.football-data.co.uk/**

## Libraries

In [1]:
import urllib3 # URL request library
import certifi # Certifications library for secure url requests

from pathlib import Path # Path manipulation
import shutil # high-level operations on files and collections of files

from IPython.display import Markdown, display # Style output display in jupyter notebook

import os # OS library
import zipfile # zip manipulation library

import pandas as pd # Data import, manipulation and processing 

from data_functions import * # Private library of functions related to La Liga Dataset

## Variables

In [2]:
# Root directory for all datasets, relative to the notebook's working directory.
data_folder = Path("../data")
# Sub-directory that receives the per-season match CSVs and notes.txt.
matches_folder = data_folder / 'matches'

# Index page that get_data scrapes for the per-season CSV links.
url = 'http://www.football-data.co.uk/spainm.php'

## Functions

In [3]:
def get_csv_in_string(s):
    """Build the absolute CSV URL from an HTML fragment that contains a link.

    The fragment is cut at the first occurrence of ``csv"``; within what
    precedes it, the text following the last ``A HREF="`` is taken as the
    site-relative path, and the ``csv`` suffix is re-attached.

    Parameters
    ----------
    s : str
        HTML fragment holding an ``A HREF="...csv"`` anchor.

    Returns
    -------
    str
        The link target prefixed with the football-data.co.uk base URL.

    Raises
    ------
    IndexError
        If no ``A HREF="`` marker precedes the first ``csv"``.
    """
    before_extension = s.split('csv"')[0]
    relative_path = before_extension.rsplit('A HREF="', 1)[1]
    return 'http://www.football-data.co.uk/' + relative_path + 'csv'

In [4]:
def decode_csv_string(s):
    """Derive a local file name from a CSV URL.

    The last two path components of ``s`` are used: the first is treated as
    a season code whose first two characters and remainder are joined with
    ``-``; the second is the file name and is appended after ``_``.
    For example ``.../1920/SP1.csv`` becomes ``19-20_SP1.csv``.

    Parameters
    ----------
    s : str
        URL (or path) containing at least two ``/`` separators.

    Returns
    -------
    str
        ``"<aa>-<bb>_<file name>"``.

    Raises
    ------
    IndexError
        If ``s`` contains fewer than two ``/`` separators.
    """
    parts = s.rsplit('/', 2)
    season_code, division = parts[1], parts[2]
    return '{}-{}_{}'.format(season_code[:2], season_code[2:], division)

In [5]:
def _download_to_file(http, file_url, dest_path):
    """Stream the body of ``file_url`` into ``dest_path``.

    The connection is released back to the pool even if the request
    reports an error status or writing the file fails.

    Raises
    ------
    urllib3.exceptions.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    r = http.request('GET', file_url, preload_content=False)
    try:
        if r.status >= 400:
            # Fail loudly instead of saving an HTML error page as data.
            raise urllib3.exceptions.HTTPError(
                "GET {} returned HTTP status {}".format(file_url, r.status))
        with open(dest_path, 'wb') as out:
            shutil.copyfileobj(r, out)
    finally:
        r.release_conn()


def get_data(url, dest_folder, verbose = False):
    """Download every season CSV linked from ``url`` plus ``notes.txt``.

    The index page is fetched and decoded as UTF-8. Only the text between
    the last ``notes.txt`` occurrence and the last ``Season 2002/2003``
    occurrence is scanned. Each piece following an ``Excel.gif`` marker is
    turned into a CSV URL with ``get_csv_in_string`` and saved under the
    name produced by ``decode_csv_string``.

    Parameters
    ----------
    url : str
        Address of the index page listing the CSV links.
    dest_folder : pathlib.Path
        Directory receiving the downloaded files (joined with ``/``).
    verbose : bool, optional
        When true, print one line per CSV before downloading it.

    Raises
    ------
    urllib3.exceptions.HTTPError
        If any request is answered with a 4xx/5xx status.
    IndexError
        If the page does not contain the ``notes.txt`` marker.
    """
    # create_dir is not defined in this notebook; presumably provided by
    # the data_functions star import -- TODO confirm.
    create_dir(dest_folder)
    # PoolManager needed by urllib3 for requests
    http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED', ca_certs=certifi.where())

    r = http.request('GET', url, preload_content=False)
    try:
        if r.status >= 400:
            raise urllib3.exceptions.HTTPError(
                "GET {} returned HTTP status {}".format(url, r.status))
        s = r.data.decode('utf-8')
    finally:
        # Release the connection even when decoding or the status check fails.
        r.release_conn()

    # Keep only the region between the notes link and the 2002/2003 heading.
    s = s.rsplit('notes.txt',1)[1].rsplit('Season 2002/2003',1)[0]
    # Text before the first 'Excel.gif' holds no CSV link, hence [1:].
    list_s = s.rsplit('Excel.gif')[1:]

    for csv_url in map(get_csv_in_string, list_s):
        filename = decode_csv_string(csv_url)
        if verbose:
            print("Getting {} data".format(filename))
        _download_to_file(http, csv_url, dest_folder / filename)

    notes_url = 'http://www.football-data.co.uk/notes.txt'
    _download_to_file(http, notes_url, dest_folder / 'notes.txt')

In [6]:
def get_columns(notes_path=None, max_lines=39):
    """Extract column names from the head of the ``notes.txt`` key file.

    Only the first ``max_lines`` lines are inspected. For every line that
    contains ``=``, the text before the last ``" ="`` is taken as the name;
    if that text contains ``and`` it is further cut at the last ``" and"``
    (so ``"FTHG and HG = ..."`` yields ``"FTHG"``).

    Parameters
    ----------
    notes_path : path-like, optional
        File to read. Defaults to ``matches_folder / "notes.txt"``, resolved
        at call time.
    max_lines : int, optional
        Number of leading lines to scan (default 39, the original limit).

    Returns
    -------
    list of str
        Column names in the order they appear in the file.
    """
    if notes_path is None:
        notes_path = matches_folder / "notes.txt"

    columns = []
    # The context manager closes the file even if parsing raises, and
    # iterating lazily avoids reading lines beyond max_lines.
    with open(notes_path, "r") as notes_file:
        for line_number, line in enumerate(notes_file):
            if line_number >= max_lines:
                break
            if '=' not in line:
                continue
            name = line.rsplit(' =', 1)[0]
            if 'and' in name:
                # Lines documenting two aliases keep only the first one.
                name = name.rsplit(' and', 1)[0]
            columns.append(name)
    return columns

In [7]:
def get_df(data_folder):
    """Load every season CSV in ``data_folder`` into one DataFrame.

    For each file, only the columns that are both documented by
    ``get_columns()`` and present in the file header are read. A
    ``Division`` column is added from the fifth-from-last character of the
    file name (the digit in ``..._SP1.csv``), and ``Date`` is parsed as
    ``dd/mm/yy``, falling back to ``dd/mm/YYYY``.

    Parameters
    ----------
    data_folder : pathlib.Path
        Directory containing the ``*.csv`` files (joined with ``/``).

    Returns
    -------
    pandas.DataFrame
        All files concatenated with a fresh integer index.

    Raises
    ------
    ValueError
        If the folder has no ``.csv`` file (raised by ``pd.concat``), or a
        ``Date`` column matches neither date format.
    """
    # os.listdir returns entries in arbitrary order; sorting makes the row
    # order of the result reproducible. endswith avoids picking up names
    # that merely contain '.csv' (e.g. backups).
    csv_names = sorted(
        name for name in os.listdir(data_folder) if name.endswith('.csv')
    )
    # NOTE: get_columns() reads notes.txt from the module-level
    # matches_folder by default, not from the data_folder argument.
    wanted = set(get_columns())

    frames = []
    for csv_name in csv_names:
        csv_path = data_folder / csv_name
        # index_col=0 moves the first CSV column into the index, so it is
        # absent from the header and therefore never selected below.
        header = pd.read_csv(csv_path, index_col=0, nrows=0).columns.tolist()
        cols = [col for col in header if col in wanted]
        df = pd.read_csv(csv_path, usecols=cols)
        df['Division'] = csv_name[-5]
        try:
            df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%y')
        except ValueError:
            # Some files use four-digit years.
            df['Date'] = pd.to_datetime(df['Date'], format='%d/%m/%Y')

        frames.append(df)

    return pd.concat(frames, ignore_index=True, sort=False)

## Execution

In [8]:
# Download every season CSV plus notes.txt into the matches folder,
# printing one progress line per CSV.
get_data(url, matches_folder, verbose=True)

Getting 19-20_SP1.csv data
Getting 19-20_SP2.csv data
Getting 18-19_SP1.csv data
Getting 18-19_SP2.csv data
Getting 17-18_SP1.csv data
Getting 17-18_SP2.csv data
Getting 16-17_SP1.csv data
Getting 16-17_SP2.csv data
Getting 15-16_SP1.csv data
Getting 15-16_SP2.csv data
Getting 14-15_SP1.csv data
Getting 14-15_SP2.csv data
Getting 13-14_SP1.csv data
Getting 13-14_SP2.csv data
Getting 12-13_SP1.csv data
Getting 12-13_SP2.csv data
Getting 11-12_SP1.csv data
Getting 11-12_SP2.csv data
Getting 10-11_SP1.csv data
Getting 10-11_SP2.csv data
Getting 09-10_SP1.csv data
Getting 09-10_SP2.csv data
Getting 08-09_SP1.csv data
Getting 08-09_SP2.csv data
Getting 07-08_SP1.csv data
Getting 07-08_SP2.csv data
Getting 06-07_SP1.csv data
Getting 06-07_SP2.csv data
Getting 05-06_SP1.csv data
Getting 05-06_SP2.csv data
Getting 04-05_SP1.csv data
Getting 04-05_SP2.csv data
Getting 03-04_SP1.csv data
Getting 03-04_SP2.csv data


In [9]:
# Destination of the combined file; it lives in data_folder, one level
# above the per-season files read from matches_folder.
filename = '03_20_SP.csv'
file_path = data_folder / filename

# Merge all season files and persist the result without the integer index.
df = get_df(matches_folder)
df.to_csv(file_path, index=False)
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Division,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,Time
0,2003-08-30,Albacete,Osasuna,0.0,2.0,A,0.0,1.0,A,1,...,,,,,,,,,,
1,2003-08-30,Ath Bilbao,Barcelona,0.0,1.0,A,0.0,1.0,A,1,...,,,,,,,,,,
2,2003-08-30,Espanol,Sociedad,1.0,1.0,D,1.0,0.0,H,1,...,,,,,,,,,,
3,2003-08-30,Malaga,Villarreal,0.0,0.0,D,0.0,0.0,D,1,...,,,,,,,,,,
4,2003-08-30,Real Madrid,Betis,2.0,1.0,H,1.0,1.0,D,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14079,2020-03-08,Vallecano,Elche,2.0,3.0,A,1.0,0.0,H,2,...,5.0,20.0,13.0,3.0,4.0,3.0,3.0,1.0,0.0,11:00
14080,2020-03-08,Malaga,Zaragoza,0.0,1.0,A,0.0,0.0,D,2,...,4.0,14.0,10.0,11.0,4.0,2.0,3.0,0.0,0.0,15:00
14081,2020-03-08,Alcorcon,Mirandes,1.0,2.0,A,0.0,0.0,D,2,...,4.0,15.0,13.0,3.0,4.0,6.0,1.0,1.0,0.0,17:15
14082,2020-03-08,Sp Gijon,Las Palmas,4.0,0.0,H,0.0,0.0,D,2,...,5.0,18.0,12.0,7.0,4.0,1.0,2.0,0.0,1.0,17:15


In [10]:
# Reload the combined file from disk, parsing 'Date' back into datetimes,
# to check that the saved CSV round-trips.
df = pd.read_csv(
    file_path,
    parse_dates=['Date'],
)
df

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,Division,...,AST,HF,AF,HC,AC,HY,AY,HR,AR,Time
0,2003-08-30,Albacete,Osasuna,0.0,2.0,A,0.0,1.0,A,1,...,,,,,,,,,,
1,2003-08-30,Ath Bilbao,Barcelona,0.0,1.0,A,0.0,1.0,A,1,...,,,,,,,,,,
2,2003-08-30,Espanol,Sociedad,1.0,1.0,D,1.0,0.0,H,1,...,,,,,,,,,,
3,2003-08-30,Malaga,Villarreal,0.0,0.0,D,0.0,0.0,D,1,...,,,,,,,,,,
4,2003-08-30,Real Madrid,Betis,2.0,1.0,H,1.0,1.0,D,1,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14079,2020-03-08,Vallecano,Elche,2.0,3.0,A,1.0,0.0,H,2,...,5.0,20.0,13.0,3.0,4.0,3.0,3.0,1.0,0.0,11:00
14080,2020-03-08,Malaga,Zaragoza,0.0,1.0,A,0.0,0.0,D,2,...,4.0,14.0,10.0,11.0,4.0,2.0,3.0,0.0,0.0,15:00
14081,2020-03-08,Alcorcon,Mirandes,1.0,2.0,A,0.0,0.0,D,2,...,4.0,15.0,13.0,3.0,4.0,6.0,1.0,1.0,0.0,17:15
14082,2020-03-08,Sp Gijon,Las Palmas,4.0,0.0,H,0.0,0.0,D,2,...,5.0,18.0,12.0,7.0,4.0,1.0,2.0,0.0,1.0,17:15
