# Cleaning up klassement.csv file

In [1]:
import pandas as pd

In [2]:
!pip install fuzzywuzzy

Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [3]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process



### Creating a dataframe

In [7]:
# Read in the csv file into a pandas dataframe using the read_csv method
columns = ["seizoen","speeldag","stand","ploeg","aantal_gespeelde_matchen","aantal_gewonnen_matchen","aantal_gelijkgespeelde_matchen","aantal_verloren_matchen","doelpunten","puntenverschil","punten"]
df = pd.read_csv("klassementen.csv")
df.columns = columns

In [8]:
# Show the first few rows
df.head()

Unnamed: 0,seizoen,speeldag,stand,ploeg,aantal_gespeelde_matchen,aantal_gewonnen_matchen,aantal_gelijkgespeelde_matchen,aantal_verloren_matchen,doelpunten,puntenverschil,punten
0,60/61,1.speeldag,2,Waterschei THOR,1,1,0,0,5:0,5,2:0
1,60/61,1.speeldag,3,Standard Luik,1,1,0,0,2:0,2,2:0
2,60/61,1.speeldag,4,Eendracht Aalst,1,1,0,0,3:2,1,2:0
3,60/61,1.speeldag,5,RFC Luik,1,1,0,0,2:1,1,2:0
4,60/61,1.speeldag,5,Union SG,1,1,0,0,2:1,1,2:0


In [9]:
# Shows the columns dtypes
df.dtypes

seizoen                           object
speeldag                          object
stand                              int64
ploeg                             object
aantal_gespeelde_matchen           int64
aantal_gewonnen_matchen            int64
aantal_gelijkgespeelde_matchen     int64
aantal_verloren_matchen            int64
doelpunten                        object
puntenverschil                     int64
punten                            object
dtype: object

### Removing the text from the day column

In [10]:
# Removing all the text from the day column
for dag in range(1, 35):
  df.loc[df["speeldag"] == f"{dag}.speeldag", "speeldag"] = dag

In [11]:
# Show the unique values for the day column
df.speeldag.unique()

array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
       20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30], dtype=object)

In [12]:
# Changing the dtype of the day column to int
df.speeldag = df.speeldag.astype('int64')

In [13]:
# Checking if the dtype has changed
df.speeldag.dtype

dtype('int64')

### Recalculate all the pointscores using the newer 3 point system

In [14]:
# calculate new points
nieuwePunten = (df.aantal_gewonnen_matchen)*3 + (df.aantal_gelijkgespeelde_matchen)*1
# change the older values
df.punten = nieuwePunten

In [15]:
# Show the unique values for the points column
df.punten.unique()

array([ 3,  0,  6,  1,  9,  7,  4,  2, 12, 10, 13,  8,  5, 14, 11, 17, 20,
       16, 15, 23, 19, 18, 21, 24, 22, 25, 26, 27, 28, 31, 29, 32, 30, 35,
       33, 38, 34, 39, 36, 42, 43, 40, 37, 46, 49, 50, 53, 56, 52, 41, 59,
       60, 45, 63, 44, 47, 54, 57, 66, 67, 48, 51, 68, 69, 70, 58, 55, 61,
       64, 62, 65, 71, 72, 75, 73, 76, 74])

In [16]:
# Changing the dtype of the point column to int
df.punten = df.punten.astype('int64')

In [17]:
# Checking if the dtype has changed
df.punten.dtype

dtype('int64')

In [18]:
df.head(60)

Unnamed: 0,seizoen,speeldag,stand,ploeg,aantal_gespeelde_matchen,aantal_gewonnen_matchen,aantal_gelijkgespeelde_matchen,aantal_verloren_matchen,doelpunten,puntenverschil,punten
0,60/61,1,2,Waterschei THOR,1,1,0,0,5:0,5,3
1,60/61,1,3,Standard Luik,1,1,0,0,2:0,2,3
2,60/61,1,4,Eendracht Aalst,1,1,0,0,3:2,1,3
3,60/61,1,5,RFC Luik,1,1,0,0,2:1,1,3
4,60/61,1,5,Union SG,1,1,0,0,2:1,1,3
5,60/61,1,5,Daring Club,1,1,0,0,2:1,1,3
6,60/61,1,8,Beerschot AC,1,1,0,0,1:0,1,3
7,60/61,1,9,Antwerp FC,1,0,0,1,2:3,-1,0
8,60/61,1,10,ARA Gent,1,0,0,1,1:2,-1,0
9,60/61,1,10,Club Brugge,1,0,0,1,1:2,-1,0


### Adding the team id

In [19]:
# All uniqe team names
namenPloegen = df.ploeg.unique()

In [20]:
# imports for scraping team id's
import requests
from bs4 import BeautifulSoup

In [21]:
# Scraping the id's
soup = ""
URL= f"https://nl.wikipedia.org/wiki/Lijst_van_voetbalclubs_in_België_naar_seizoenen_in_eerste_klasse"
page= requests.get(URL)
soup= BeautifulSoup(page.content, "html.parser")
table = soup.select("#main main, table.wikitable tbody tr td:nth-of-type(1)")

In [22]:
# Creating a list out of the id's
nummersdata = ""
for row in table:
  nummersdata += row.get_text()
  nummersdata += "|"
nummersdata = nummersdata.split("|")

In [23]:
# Scraping the team names
soup = ""
URL= f"https://nl.wikipedia.org/wiki/Lijst_van_voetbalclubs_in_België_naar_seizoenen_in_eerste_klasse"
page= requests.get(URL)
soup= BeautifulSoup(page.content, "html.parser")
table = soup.select("#main main, table.wikitable tbody tr td:nth-of-type(2)")

In [24]:
# Creating a list out of the team names
ploegenData = ""
for row in table:
  ploegenData += row.get_text()
  ploegenData += "|"
ploegenData = ploegenData.split("|")

In [25]:
# Method to match the team to id's
def find_matching_team_id(ploeg):
    ratios = [fuzz.ratio(p, ploeg) for p in ploegenData]
    max_ratio_index = max(range(len(ratios)), key=ratios.__getitem__)
    return nummersdata[max_ratio_index]

# Applying function
df["ploegID"] = df["ploeg"].apply(find_matching_team_id)

In [28]:
# Changing the type to int
df.ploegID = df.ploegID.astype('int64')

In [31]:
df.head(20)

Unnamed: 0,seizoen,speeldag,stand,ploeg,aantal_gespeelde_matchen,aantal_gewonnen_matchen,aantal_gelijkgespeelde_matchen,aantal_verloren_matchen,doelpunten,puntenverschil,punten,ploegID
0,60/61,1,2,Waterschei THOR,1,1,0,0,5:0,5,3,553
1,60/61,1,3,Standard Luik,1,1,0,0,2:0,2,3,16
2,60/61,1,4,Eendracht Aalst,1,1,0,0,3:2,1,3,90
3,60/61,1,5,RFC Luik,1,1,0,0,2:1,1,3,4
4,60/61,1,5,Union SG,1,1,0,0,2:1,1,3,10
5,60/61,1,5,Daring Club,1,1,0,0,2:1,1,3,2
6,60/61,1,8,Beerschot AC,1,1,0,0,1:0,1,3,13
7,60/61,1,9,Antwerp FC,1,0,0,1,2:3,-1,0,1
8,60/61,1,10,ARA Gent,1,0,0,1,1:2,-1,0,7
9,60/61,1,10,Club Brugge,1,0,0,1,1:2,-1,0,3


### Checking if the ranking is correct

In [None]:
for i in range(1, df.shape[0]):

  stand = df.stand[i]
  prevStand = df.stand[i-1]
  speeldag = df.speeldag[i]
  prevSpeeldag = df.speeldag[i-1]



  if stand < prevStand and speeldag==prevSpeeldag:
    if stand == 0:
      df.stand[i] = 10
    if prevStand == 29 or prevStand == 39 or prevStand == 49 or prevStand == 59 or prevStand == 69 or prevStand == 79 or prevStand == 89:
      df.stand[i-1] = 9

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.stand[i-1] = 9
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.stand[i] = 10


In [None]:
for i in range(1, df.shape[0]):

  stand = df.stand[i]
  prevStand = df.stand[i-1]
  speeldag = df.speeldag[i]
  prevSpeeldag = df.speeldag[i-1]



  if stand < prevStand and speeldag==prevSpeeldag:
    print(i, "prevStand", prevStand, "stand", stand)
    # df.stand[i] = stand+10

6282 prevStand 39 stand 19
8998 prevStand 29 stand 19
12238 prevStand 29 stand 19
20338 prevStand 39 stand 19


In [None]:
for i in range(1, df.shape[0]):

  date = int(df.seizoen[i][:2])
  speeldag = df.speeldag[i]
  prevSpeeldag = df.speeldag[i-1]
  points = df.punten[i]
  prevPoints = df.punten[i-1]
  wins = df.aantal_gewonnen_matchen[i]
  prevWins = df.aantal_gewonnen_matchen[i-1]
  pointsdif = df.puntenverschil[i]
  prevPointsdif = df.puntenverschil[i-1]


  # if points > prevPoints and (date < 60 or date > 95) and speeldag==prevSpeeldag:
  #   print(i, "something fishy")

  # if points == prevPoints and (date < 60 or date > 95) and speeldag==prevSpeeldag:
  #   print(i, "same points")

  # if points == prevPoints and wins > prevWins and (date < 60 or date > 95) and speeldag==prevSpeeldag:
  #   print(i, "higher wins and same points")

  # if points == prevPoints and pointsdif > prevPointsdif and wins == prevWins and (date < 60 or date > 95) and speeldag==prevSpeeldag:
  #   print(i, "higher pointsdif, same wins, and same points")


  if points == prevPoints:
    # print(i, "samePoints")
    if wins==prevWins:
      # print(i, "samePoints and same wins")
      if pointsdif==prevPointsdif:
        print(i, "same points, same wins and same pointdif")


4 same points, same wins and same pointdif
5 same points, same wins and same pointdif
6 same points, same wins and same pointdif
7 same points, same wins and same pointdif
9 same points, same wins and same pointdif
10 same points, same wins and same pointdif
11 same points, same wins and same pointdif
12 same points, same wins and same pointdif
20 same points, same wins and same pointdif
24 same points, same wins and same pointdif
27 same points, same wins and same pointdif
46 same points, same wins and same pointdif
94 same points, same wins and same pointdif
138 same points, same wins and same pointdif
152 same points, same wins and same pointdif
483 same points, same wins and same pointdif
485 same points, same wins and same pointdif
487 same points, same wins and same pointdif
488 same points, same wins and same pointdif
489 same points, same wins and same pointdif
491 same points, same wins and same pointdif
493 same points, same wins and same pointdif
499 same points, same wins a